s4_server/
service.rs

1//! `s3s::S3` 実装 — `s3s_aws::Proxy` への delegation を default にしつつ、
2//! `put_object` / `get_object` 経路で `s4_codec::CodecRegistry` を呼ぶ。
3//!
4//! ## カバー範囲 (Phase 1 月 2)
5//!
6//! - 圧縮 hook あり: `put_object`, `get_object`
7//! - 純 delegation (圧縮なし): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
8//!   `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
9//!   `list_objects_v2`, `create_multipart_upload`, `upload_part`,
10//!   `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
11//!   `list_parts`
12//! - 未対応 (デフォルトで NotImplemented): その他 80+ ops (Tagging / ACL / Lifecycle 等は Phase 2)
13//!
14//! ## アーキテクチャ
15//!
16//! - `S4Service<B>` は backend (B: S3) と `Arc<CodecRegistry>` と `Arc<dyn CodecDispatcher>`
17//!   を保持する。`CodecRegistry` 経由で複数 codec を抱えられるので、ひとつの S4 インスタンスが
18//!   複数 codec で書かれた object を透過的に GET できる
19//! - PUT: dispatcher が body の先頭 sample から codec を選び、registry で compress、
20//!   manifest を S3 metadata に書いて backend に forward
21//! - GET: backend から取得 → metadata から manifest を復元 → registry.decompress で
22//!   manifest 指定の codec で解凍 → 元の bytes を return
23//!
24//! ## 既知の制限事項
25//!
26//! - **Multipart Upload は per-part 圧縮が未実装**: 現状は upload_part を素通し。
27//!   Phase 1 月 2 後半で per-part compress + complete_multipart_upload で manifest 集約。
28//! - **PUT body は memory に collect**: max_body_bytes 上限あり (default 5 GiB = S3 単発 PUT 上限)。
29//!   Streaming-aware 圧縮は Phase 2。
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39    FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40    write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47    bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50    Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51    pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52    supports_streaming_decompress,
53};
54
/// Upper bound, in bytes, on the leading slice of a PUT body that is
/// handed over for sampling (4 KiB).
const SAMPLE_BYTES: usize = 4 * 1024;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66    if let Some(secs) = tel.gpu_seconds {
67        crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68    }
69    if tel.oom {
70        crate::metrics::record_gpu_oom(tel.codec);
71    }
72}
73
74/// v0.7 #49: percent-encoding set covering everything that is **not** an
75/// `unreserved` character per RFC 3986 §2.3, **plus** we additionally
76/// encode the path-reserved sub-delims that `http::Uri` rejects in a
77/// path segment (`?`, `#`, `%`, control bytes, space, etc.). We
78/// deliberately keep `/` un-encoded because S3 keys legally use `/` as
79/// a logical separator and the rest of the synthetic URI relies on the
80/// path layout `/{bucket}/{key}` round-tripping byte-for-byte.
81const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
82    .add(b' ')
83    .add(b'"')
84    .add(b'#')
85    .add(b'<')
86    .add(b'>')
87    .add(b'?')
88    .add(b'`')
89    .add(b'{')
90    .add(b'}')
91    .add(b'|')
92    .add(b'\\')
93    .add(b'^')
94    .add(b'[')
95    .add(b']')
96    .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110    use percent_encoding::utf8_percent_encode;
111    let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112    let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113    let raw = format!("/{bucket_enc}/{key_enc}");
114    raw.parse::<http::Uri>().map_err(|e| {
115        // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116        // can't be represented in a request URI. The generated
117        // `S3ErrorCode` enum doesn't expose a typed variant for it,
118        // so we round-trip through `from_bytes` which preserves the
119        // canonical wire string while falling back to InvalidArgument
120        // if even that lookup fails (cannot happen at runtime — kept
121        // as a belt-and-suspenders branch so this helper never
122        // panics).
123        let code =
124            S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125        S3Error::with_message(
126            code,
127            format!("object key cannot be encoded as a request URI: {e}"),
128        )
129    })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150    body: &[u8],
151    content_md5_b64: Option<&str>,
152    checksum_crc32_b64: Option<&str>,
153    checksum_crc32c_b64: Option<&str>,
154    checksum_sha1_b64: Option<&str>,
155    checksum_sha256_b64: Option<&str>,
156    checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158    use base64::Engine as _;
159    use md5::Md5;
160    use sha2::Sha256;
161    // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162    // trait methods into scope. Bind anonymously so this `use` is
163    // never flagged as unused while still serving its real purpose.
164    use md5::Digest as _;
165    let b64 = base64::engine::general_purpose::STANDARD;
166    let bad = |what: &str| {
167        let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168        S3Error::with_message(
169            code,
170            format!("client-supplied {what} did not match the received body"),
171        )
172    };
173    if let Some(claimed) = content_md5_b64 {
174        let want = b64.decode(claimed).map_err(|_| {
175            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176        })?;
177        if want.len() != 16 {
178            return Err(S3Error::with_message(
179                S3ErrorCode::InvalidDigest,
180                "Content-MD5 must decode to 16 bytes",
181            ));
182        }
183        let mut h = Md5::new();
184        h.update(body);
185        let got = h.finalize();
186        // `subtle::ConstantTimeEq` would be ideal but the existing
187        // `constant_time_eq` helper in sse.rs is private; use a
188        // straightforward byte compare. The attacker doesn't get to
189        // choose the body retroactively, so a timing oracle here
190        // doesn't help them. `&got[..]` derefs the GenericArray
191        // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192        // generic-array 1.x; CI runs `-D warnings`).
193        if got[..] != *want.as_slice() {
194            return Err(bad("Content-MD5"));
195        }
196    }
197    if let Some(claimed) = checksum_crc32c_b64 {
198        let want = b64.decode(claimed).map_err(|_| {
199            S3Error::with_message(
200                S3ErrorCode::InvalidDigest,
201                "malformed x-amz-checksum-crc32c",
202            )
203        })?;
204        if want.len() != 4 {
205            return Err(S3Error::with_message(
206                S3ErrorCode::InvalidDigest,
207                "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208            ));
209        }
210        let got = crc32c::crc32c(body).to_be_bytes();
211        if got != want.as_slice() {
212            return Err(bad("x-amz-checksum-crc32c"));
213        }
214    }
215    if let Some(claimed) = checksum_sha256_b64 {
216        let want = b64.decode(claimed).map_err(|_| {
217            S3Error::with_message(
218                S3ErrorCode::InvalidDigest,
219                "malformed x-amz-checksum-sha256",
220            )
221        })?;
222        if want.len() != 32 {
223            return Err(S3Error::with_message(
224                S3ErrorCode::InvalidDigest,
225                "x-amz-checksum-sha256 must decode to 32 bytes",
226            ));
227        }
228        let mut h = Sha256::new();
229        h.update(body);
230        let got = h.finalize();
231        if got[..] != *want.as_slice() {
232            return Err(bad("x-amz-checksum-sha256"));
233        }
234    }
235    // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236    // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237    // big-endian value, base64-encoded.
238    if let Some(claimed) = checksum_crc32_b64 {
239        let want = b64.decode(claimed).map_err(|_| {
240            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241        })?;
242        if want.len() != 4 {
243            return Err(S3Error::with_message(
244                S3ErrorCode::InvalidDigest,
245                "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246            ));
247        }
248        let mut h = crc32fast::Hasher::new();
249        h.update(body);
250        let got = h.finalize().to_be_bytes();
251        if got != want.as_slice() {
252            return Err(bad("x-amz-checksum-crc32"));
253        }
254    }
255    // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256    if let Some(claimed) = checksum_sha1_b64 {
257        use sha1::Sha1;
258        let want = b64.decode(claimed).map_err(|_| {
259            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260        })?;
261        if want.len() != 20 {
262            return Err(S3Error::with_message(
263                S3ErrorCode::InvalidDigest,
264                "x-amz-checksum-sha1 must decode to 20 bytes",
265            ));
266        }
267        let mut h = Sha1::new();
268        h.update(body);
269        let got = h.finalize();
270        if got[..] != *want.as_slice() {
271            return Err(bad("x-amz-checksum-sha1"));
272        }
273    }
274    // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275    // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276    // 0xffffffffffffffff, refin / refout true. The reflected
277    // polynomial + 256-entry lookup table are computed lazily on
278    // first call (small enough to inline rather than pull in a
279    // dedicated crc64 crate).
280    if let Some(claimed) = checksum_crc64nvme_b64 {
281        let want = b64.decode(claimed).map_err(|_| {
282            S3Error::with_message(
283                S3ErrorCode::InvalidDigest,
284                "malformed x-amz-checksum-crc64nvme",
285            )
286        })?;
287        if want.len() != 8 {
288            return Err(S3Error::with_message(
289                S3ErrorCode::InvalidDigest,
290                "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291            ));
292        }
293        let got = crc64_nvme(body).to_be_bytes();
294        if got != want.as_slice() {
295            return Err(bad("x-amz-checksum-crc64nvme"));
296        }
297    }
298    Ok(())
299}
300
301/// v0.9 #106-audit-R2 P2-INT-2: verify SigV4-streaming **trailer**-supplied
302/// checksums against an already-finalised [`ComputedDigests`].
303///
304/// Shared between the streaming-framed branch (digests computed via the
305/// tee wrapper) and the buffered branch (digests computed in one shot
306/// over the in-memory body via [`crate::streaming_checksum::compute_digests`]).
307/// Centralising the logic prevents the pre-#106 fail-open shape —
308/// where one branch verified trailers and the other silently skipped
309/// them — from regressing. Both branches now go through the same
310/// announce-parsing / fail-closed / per-name `compare_b64` pipeline.
311///
312/// Fail-closed posture (matches the streaming branch's behaviour):
313///
314/// - No `x-amz-trailer` header → returns Ok (no verification claimed).
315/// - Header announces only non-checksum trailers (`x-amz-trailer-signature`,
316///   custom) → returns Ok (filter selects checksum names only).
317/// - Header announces `x-amz-checksum-*` but the trailing-headers handle
318///   was absent → `BadDigest`.
319/// - Handle present but trailers were never delivered (`read` returns
320///   None) → `BadDigest`.
321/// - Trailer announced but value missing in the delivered block → `BadDigest`.
322/// - Value present but malformed / mismatched / refers to an unhashed
323///   algorithm → `BadDigest` / `InvalidDigest` per [`ComputedDigests::compare_b64`].
324fn verify_client_trailer_checksums(
325    announced: Option<&str>,
326    trailers_handle: Option<&s3s::TrailingHeaders>,
327    computed: &crate::streaming_checksum::ComputedDigests,
328) -> S3Result<()> {
329    let Some(announced) = announced else {
330        return Ok(());
331    };
332    let promised_checksum_trailers: Vec<String> = announced
333        .split(',')
334        .map(|s| s.trim().to_string())
335        .filter(|n| {
336            // RFC 9110 §5.1: HTTP header names are
337            // case-insensitive — match accordingly.
338            n.to_ascii_lowercase().starts_with("x-amz-checksum-")
339        })
340        .collect();
341    if promised_checksum_trailers.is_empty() {
342        return Ok(());
343    }
344    let bad_digest = |msg: String| -> S3Error {
345        let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
346        S3Error::with_message(code, msg)
347    };
348    let Some(th) = trailers_handle else {
349        return Err(bad_digest(
350            "client announced checksum trailer(s) via x-amz-trailer but \
351             no trailing-headers handle was attached to the request"
352                .into(),
353        ));
354    };
355    let result = th.read(|hmap| {
356        for name in &promised_checksum_trailers {
357            match hmap.get(name.as_str()).and_then(|v| v.to_str().ok()) {
358                Some(val) => {
359                    computed.compare_b64(name, val)?;
360                }
361                None => {
362                    return Err(bad_digest(format!(
363                        "client announced trailer {name} via \
364                         x-amz-trailer but the trailer value was \
365                         missing or unparseable"
366                    )));
367                }
368            }
369        }
370        Ok::<(), S3Error>(())
371    });
372    match result {
373        Some(Ok(())) => Ok(()),
374        Some(Err(e)) => Err(e),
375        None => Err(bad_digest(
376            "client announced checksum trailer(s) via x-amz-trailer \
377             but no trailing-headers block was delivered with the body"
378                .into(),
379        )),
380    }
381}
382
/// v0.8.12 #128 (MED-C): CRC-64/NVME, the algorithm behind AWS S3's
/// `x-amz-checksum-crc64nvme`.
///
/// Parameters: polynomial `0xad93d23594c93659`, initial value and final
/// XOR both `0xffffffffffffffff`, input and output reflected. The
/// implementation is the usual byte-at-a-time table lookup over the
/// reflected polynomial; the 256-entry table is built on the first call
/// and cached in a [`std::sync::OnceLock`] for every later call.
///
/// Returns the checksum as a `u64`; an empty input yields `0`.
fn crc64_nvme(bytes: &[u8]) -> u64 {
    use std::sync::OnceLock;

    // Bit-reversed form of 0xad93d23594c93659.
    const POLY_REFLECTED: u64 = 0x9a6c_9329_ac4b_c9b5;
    static TABLE: OnceLock<[u64; 256]> = OnceLock::new();

    let table = TABLE.get_or_init(|| {
        std::array::from_fn(|byte| {
            // Eight shift/conditional-XOR rounds per table entry.
            (0..8).fold(byte as u64, |acc, _| {
                let shifted = acc >> 1;
                if acc & 1 == 1 {
                    shifted ^ POLY_REFLECTED
                } else {
                    shifted
                }
            })
        })
    });

    let state = bytes.iter().fold(u64::MAX, |crc, &b| {
        // Low byte of the running CRC XOR the input byte selects the entry.
        let slot = usize::from((crc as u8) ^ b);
        (crc >> 8) ^ table[slot]
    });
    !state
}
419
/// v0.4 #20: captured at the start of a handler, before the request is
/// consumed by the backend call, so the matching `record_access` at
/// end-of-request can fill in the structured access log entry.
///
/// Every optional field is `None` when the corresponding piece of
/// request information was not available at capture time.
struct AccessLogPreamble {
    /// Client address as text. NOTE(review): how this is derived (peer
    /// address vs. forwarded header) is not visible here — confirm at
    /// the capture site.
    remote_ip: Option<String>,
    /// Identity of the caller, presumably the authenticated principal /
    /// access key — TODO confirm against the capture site.
    requester: Option<String>,
    /// Request URI as a string; always present.
    request_uri: String,
    /// Presumably the request's `User-Agent` header value — confirm.
    user_agent: Option<String>,
}
429
/// The S4 gateway's `s3s::S3` service state.
///
/// Wraps a backend `B: S3` together with the codec registry and codec
/// dispatcher (see the module docs: the PUT / GET paths run bodies
/// through the registry, other operations delegate to the backend).
/// All remaining fields are optional feature managers or boot-time
/// switches; each documents its own default.
pub struct S4Service<B: S3> {
    /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
    /// dispatcher can clone it into a detached `tokio::spawn` task
    /// (Arc::clone is cheap; backend trait methods take `&self` so no
    /// other handler is affected by the indirection).
    backend: Arc<B>,
    /// Codec registry. Per the module docs it performs the compress on
    /// PUT and the manifest-directed decompress on GET, and may hold
    /// several codecs at once.
    registry: Arc<CodecRegistry>,
    /// Codec dispatcher. Per the module docs it picks the codec for a
    /// PUT from a sample of the leading body bytes.
    dispatcher: Arc<dyn CodecDispatcher>,
    /// Upper bound on a PUT body collected in memory (the module docs
    /// give the default as 5 GiB, the S3 single-PUT limit).
    max_body_bytes: usize,
    /// Optional shared policy handle. NOTE(review): presumably the
    /// IAM / bucket policy evaluator that consumes the condition keys
    /// mentioned below (`aws:SecureTransport`, `aws:SourceIp`) —
    /// confirm against `crate::policy`.
    policy: Option<crate::policy::SharedPolicy>,
    /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
    /// to `true` when the listener is wrapped in TLS (or ACME), so policies
    /// gating "deny if not over TLS" can do their job. Defaults to `false`
    /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
    secure_transport: bool,
    /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
    rate_limits: Option<crate::rate_limit::SharedRateLimits>,
    /// v0.4 #20: optional S3-style access log emitter.
    access_log: Option<crate::access_log::SharedAccessLog>,
    /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
    /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
    /// (with the keyring's active key id) after the compress + framing
    /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
    /// frame parsing. A `with_sse_key(...)` call wraps the supplied
    /// key in a 1-slot keyring so single-key (v0.4) operators get the
    /// same behaviour they had before, just on the v2 frame.
    sse_keyring: Option<crate::sse::SharedSseKeyring>,
    /// v0.5 #34: optional first-class versioning state machine. When
    /// `Some(...)`, S4-server itself owns the per-bucket versioning
    /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
    /// list_object_versions / get_bucket_versioning /
    /// put_bucket_versioning handlers consult the manager instead of
    /// passing through. When `None` (default), the legacy
    /// backend-passthrough behaviour applies so existing v0.4
    /// deployments are unaffected until they explicitly call
    /// `with_versioning(...)`.
    versioning: Option<Arc<crate::versioning::VersioningManager>>,
    /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
    /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
    /// generate a fresh DEK via the backend, encrypt the body with it
    /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
    /// S4E4 unwrap the DEK through the same backend before decrypt.
    /// `kms_default_key_id` is used when the request omits an explicit
    /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
    /// bucket-default behaviour).
    kms: Option<Arc<dyn crate::kms::KmsBackend>>,
    /// Fallback KMS key id for SSE-KMS requests that do not name one
    /// (see `kms` above). `None` when no default is configured.
    kms_default_key_id: Option<String>,
    /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
    /// `Some(...)`, `delete_object` and overwrite-style `put_object`
    /// consult the manager and refuse the operation with HTTP 403
    /// `AccessDenied` while the object is locked (Compliance until
    /// expiry, Governance unless the bypass header is set, or any time
    /// a legal hold is on). PUT also auto-applies the bucket-default
    /// retention to brand-new objects when configured. When `None`
    /// (default), the legacy backend-passthrough behaviour applies, so
    /// existing v0.4 deployments are unaffected until they explicitly
    /// call `with_object_lock(...)`.
    object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
    /// v0.6 #38: optional first-class CORS bucket configuration manager.
    /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
    /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
    /// consult the manager instead of passing through to the backend.
    /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
    /// style preflight matching through the same store; the actual HTTP
    /// OPTIONS routing wire-up at the listener level is a follow-up
    /// (s3s framework does not surface OPTIONS as a typed handler).
    cors: Option<Arc<crate::cors::CorsManager>>,
    /// v0.6 #36: optional first-class S3 Inventory manager. When
    /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
    /// configurations and `put_bucket_inventory_configuration` /
    /// `get_bucket_inventory_configuration` /
    /// `list_bucket_inventory_configurations` /
    /// `delete_bucket_inventory_configuration` consult the manager
    /// instead of passing through to the backend. The actual periodic
    /// CSV emission is driven by a tokio task in `main.rs` that calls
    /// `InventoryManager::run_once_for_test` on a fixed cadence; the
    /// service handlers below only deal with config-level CRUD.
    inventory: Option<Arc<crate::inventory::InventoryManager>>,
    /// v0.6 #35: optional first-class S3 bucket-notification manager.
    /// When `Some(...)`, S4-server itself owns per-bucket notification
    /// configurations and `put_bucket_notification_configuration` /
    /// `get_bucket_notification_configuration` consult the manager
    /// instead of passing through to the backend. Successful PUT /
    /// DELETE handlers fire matching destinations on a detached tokio
    /// task (best-effort; see `crate::notifications::dispatch_event`).
    notifications: Option<Arc<crate::notifications::NotificationManager>>,
    /// v0.6 #37: optional first-class S3 Lifecycle configuration
    /// manager. When `Some(...)`, S4-server itself owns per-bucket
    /// lifecycle rules and `put_bucket_lifecycle_configuration` /
    /// `get_bucket_lifecycle_configuration` /
    /// `delete_bucket_lifecycle` consult the manager instead of
    /// passing through to the backend. The actual background scanner
    /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
    /// rule) is a v0.7+ follow-up; the test path
    /// `S4Service::run_lifecycle_once_for_test` exercises the
    /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
    /// the configuration-management half without putting a
    /// half-wired bucket-walk in front of users.
    lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
    /// v0.6 #39: optional first-class object + bucket Tagging manager.
    /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
    /// per-bucket tag state — `PutObjectTagging` /
    /// `GetObjectTagging` / `DeleteObjectTagging` /
    /// `PutBucketTagging` / `GetBucketTagging` /
    /// `DeleteBucketTagging` route through the manager (replacing the
    /// previous backend-passthrough behaviour). `put_object` also
    /// pre-parses the `x-amz-tagging` header / `Tagging` input field
    /// so the IAM policy evaluator can gate on
    /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
    /// On a successful PUT the parsed tags are persisted; on a
    /// successful DELETE the matching tag entry is dropped.
    tagging: Option<Arc<crate::tagging::TagManager>>,
    /// v0.6 #40: optional first-class cross-bucket replication manager.
    /// When `Some(...)`, S4-server itself owns per-bucket replication
    /// rules; `PutBucketReplication` / `GetBucketReplication` /
    /// `DeleteBucketReplication` route through the manager (replacing
    /// the previous backend-passthrough behaviour). On every successful
    /// `put_object` the manager's rule list is consulted; the
    /// highest-priority matching enabled rule wins, the per-key status
    /// is recorded as `Pending`, and the source body and metadata are
    /// handed to a detached tokio task that PUTs to the destination
    /// bucket through the same backend. The replica is stamped with
    /// `x-amz-replication-status: REPLICA` in its metadata; the
    /// source-side status is updated to `Completed` on success or
    /// `Failed` after the 3-attempt retry budget is exhausted (drop
    /// counter bumps in either-side case so dashboards see the loss).
    /// `head_object` / `get_object` echo the recorded status back as
    /// `x-amz-replication-status` so consumers can poll progress.
    /// Limited to single-instance (same `S4Service`) replication; true
    /// cross-region (multi-instance) is a v0.7+ follow-up.
    replication: Option<Arc<crate::replication::ReplicationManager>>,
    /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
    /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
    /// request against a bucket whose MFA-Delete state is `Enabled`
    /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
    /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
    /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
    /// deployments are unaffected until they explicitly call
    /// `with_mfa_delete(...)`.
    mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
    /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
    /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
    /// or be matched against a configured server-managed keyring/KMS).
    /// Set by `--compliance-mode strict` after the boot-time
    /// prerequisite check passes.
    compliance_strict: bool,
    /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
    /// gate. When `Some(...)`, the listener-side middleware (see
    /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
    /// request and short-circuits SigV4a-signed ones — verifying the
    /// signature against the credential store and returning 403
    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
    /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
    /// `None`, the middleware is a no-op so the existing SigV4 path is
    /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
    sigv4a_gate: Option<Arc<SigV4aGate>>,
    /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
    /// SSE / Tagging / Object-Lock context captured at
    /// `CreateMultipartUpload` time through to `UploadPart` /
    /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
    /// store is gateway-internal and idle when no multipart is in
    /// flight. See [`crate::multipart_state`] for rationale.
    multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
    /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
    /// path. `0` (default) → use the legacy buffered S4E2 path
    /// (whole-body AES-GCM tag, GET buffers + verifies before
    /// emitting). Non-zero → use the chunked S4E5 frame so GET can
    /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
    /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
    /// (chunked variants tracked in a follow-up issue).
    sse_chunk_size: usize,
    /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
    /// replication dispatcher in [`Self::spawn_replication_if_matched`].
    /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
    /// rules × slow destination = O(10k) in-flight tokio tasks) could
    /// exhaust process memory before the destination drains. Each
    /// dispatcher spawn `acquire_owned`s one permit and holds it for the
    /// lifetime of the destination PUT + status stamp; once the cap is
    /// reached the dispatcher async-blocks on `acquire_owned()` so the
    /// listener path itself never stalls — only the in-flight replica
    /// queue depth is bounded. Default 1024 (operator-tunable via
    /// `--replication-max-concurrent`).
    replication_semaphore: Arc<tokio::sync::Semaphore>,
    /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
    /// `aws:SourceIp` Condition key only when the operator has
    /// explicitly opted in via `--trust-x-forwarded-for`. Default
    /// (`false`) makes the policy evaluator see `source_ip = None`
    /// for incoming requests, so a public-internet client can no
    /// longer spoof an internal CIDR by setting `X-Forwarded-For`
    /// themselves. Operators behind a trusted reverse proxy that
    /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
    /// listening directly on the public internet leave it off and
    /// gain a clear fail-closed default. A future release plumbs
    /// the TCP peer address through the s3s service trait so we can
    /// validate the forwarded header against a `--trusted-proxies`
    /// CIDR list; until then the boolean opt-in closes the immediate
    /// auth-bypass surface.
    trust_x_forwarded_for: bool,
    /// v0.8.17 G-4 (#161): migration escape hatch. When `true`,
    /// the v0.8.16 F-13 reserved-name guard does NOT block GET /
    /// HEAD / DELETE on keys ending in `.s4index` — the operator
    /// is asserting that the deployment may carry pre-v0.8.15
    /// user objects with that suffix and wants a window to
    /// migrate them off. Writes (PUT / Copy / Create-Multipart)
    /// stay blocked regardless of this flag, so attacker
    /// injection from M-1 / F-13 stays closed. Default
    /// `false` matches the v0.8.16 behaviour.
    allow_legacy_reserved_key_reads: bool,
}
639
/// v0.8.17 G-2: selects which AWS error shape the reserved-name
/// guard emits when a request hits a reserved key.
///
/// `PartialEq` / `Eq` are derived so that call sites and tests can
/// compare modes with `==` / `assert_eq!` instead of `matches!`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ReservedKeyMode {
    /// Read-style endpoints (GET / HEAD / Attributes / Tagging-read).
    /// The guard answers `NoSuchKey`, consistent with the listing
    /// filter hiding the sidecar.
    Read,
    /// Mutating endpoints (PUT / Copy / DELETE / Tagging-write /
    /// ACL-write). The guard answers `InvalidObjectName` so the client
    /// sees that the suffix is reserved by design rather than
    /// coincidentally missing.
    Mutating,
}
652
653impl<B: S3> S4Service<B> {
654    /// AWS S3 単発 PUT の API 上限 (5 GiB)。
655    ///
656    /// v0.9 #106 (32-bit target support): `target_pointer_width` で gating して
657    /// 32-bit target の const-overflow を回避。 32-bit では `isize::MAX as usize`
658    /// (≈ 2 GiB on 32-bit) に collapse ── Rust 言語仕様で `Vec` / `Bytes`
659    /// 1 回の allocation は `isize::MAX` byte が上限 (`usize::MAX` ではない) で、
660    /// `usize::MAX` を cap にすると oversized-body guard を通過した後で
661    /// `Vec::with_capacity` 側が panic することがある (Codex review P2 で発覚)。
662    /// s4-server runtime は 64-bit only (README §"Supported targets") だが、
663    /// workspace-wide `cargo check --target wasm32-*` 等で blocking しない + 32-bit
664    /// build で SSE buffered-decrypt が OOM panic しないためのガード。
665    #[cfg(target_pointer_width = "64")]
666    pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;
667    #[cfg(target_pointer_width = "32")]
668    pub const DEFAULT_MAX_BODY_BYTES: usize = isize::MAX as usize;
669
670    /// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
671    /// replication dispatcher tasks. See the `replication_semaphore`
672    /// field doc for the rationale + override path.
673    pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
674
675    pub fn new(
676        backend: B,
677        registry: Arc<CodecRegistry>,
678        dispatcher: Arc<dyn CodecDispatcher>,
679    ) -> Self {
680        Self {
681            backend: Arc::new(backend),
682            registry,
683            dispatcher,
684            max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
685            policy: None,
686            secure_transport: false,
687            rate_limits: None,
688            access_log: None,
689            sse_keyring: None,
690            versioning: None,
691            kms: None,
692            kms_default_key_id: None,
693            object_lock: None,
694            cors: None,
695            inventory: None,
696            notifications: None,
697            lifecycle: None,
698            tagging: None,
699            replication: None,
700            mfa_delete: None,
701            compliance_strict: false,
702            sigv4a_gate: None,
703            multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
704            // v0.8 #52: chunked SSE-S4 disabled by default — opt
705            // in via `S4Service::with_sse_chunk_size(...)` /
706            // `--sse-chunk-size <BYTES>`. Default keeps the legacy
707            // S4E2 buffered path so existing deployments are
708            // bit-for-bit unchanged.
709            sse_chunk_size: 0,
710            // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
711            // replication tasks. Picked to be (a) ample headroom over a
712            // typical steady-state replication rate (the v0.8.3 #66
713            // status-sweep doc cites 1k keys/hour as a "steady" rate, so
714            // even a 100x burst lands well under 1024), (b) small enough
715            // that the worst-case memory pinned by stalled dispatchers
716            // — body bytes + metadata — stays bounded (1024 × 5 MiB
717            // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
718            // wider cross-region fan-out can override via
719            // `--replication-max-concurrent`.
720            replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
721                Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
722            )),
723            // v0.8.11 CRIT-4: default fail-closed — ignore client-
724            // supplied `X-Forwarded-For` until the operator opts in
725            // through `with_trust_x_forwarded_for(true)`.
726            trust_x_forwarded_for: false,
727            // v0.8.17 G-4: closed by default; opt in via
728            // `with_allow_legacy_reserved_key_reads(true)` for the
729            // migration window only.
730            allow_legacy_reserved_key_reads: false,
731        }
732    }
733
734    /// v0.8.17 G-4: opt in to a migration window where GET / HEAD /
735    /// DELETE on `<key>.s4index` are allowed even though new
736    /// writes against that suffix stay rejected. Used by operators
737    /// upgrading from pre-v0.8.15 deployments that may carry
738    /// legacy user-owned objects with the now-reserved suffix.
739    /// Defaults to `false`; turn off again once the legacy data
740    /// has been migrated.
741    #[must_use]
742    pub fn with_allow_legacy_reserved_key_reads(mut self, on: bool) -> Self {
743        self.allow_legacy_reserved_key_reads = on;
744        self
745    }
746
747    /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
748    /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
749    /// when the gateway sits behind a trusted reverse proxy that
750    /// strips (or rewrites) any client-supplied value. When left
751    /// off (default), the policy evaluator sees `source_ip = None`
752    /// regardless of what the client sends — closing the
753    /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
754    /// bypass.
755    #[must_use]
756    pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
757        self.trust_x_forwarded_for = on;
758        self
759    }
760
761    /// v0.7 #47: attach the SigV4a verify gate. Once set, the
762    /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
763    /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
764    /// verifying it against the supplied credential store and
765    /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
766    /// are unaffected. When the gate is unset (default), the
767    /// middleware skips entirely so existing SigV4 deployments keep
768    /// working.
769    #[must_use]
770    pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
771        self.sigv4a_gate = Some(gate);
772        self
773    }
774
775    /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
776    /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
777    /// consumes the `S4Service` (the listener-side middleware needs
778    /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
779    /// algorithm tokens with "unknown algorithm" — match has to
780    /// happen at the hyper layer instead).
781    #[must_use]
782    pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
783        self.sigv4a_gate.as_ref()
784    }
785
786    /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
787    /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
788    /// the `S4Service`. The background `sweep_stale` task in `main.rs`
789    /// holds this `Arc` and ticks once an hour to drop abandoned
790    /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
791    #[must_use]
792    pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
793        &self.multipart_state
794    }
795
796    /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
797    /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
798    /// through the manager (instead of forwarding to the backend),
799    /// and `put_object`'s `x-amz-tagging` parse path becomes the
800    /// source of `s3:RequestObjectTag/<key>` for the IAM policy
801    /// evaluator. The manager itself is shared via `Arc`.
802    #[must_use]
803    pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
804        self.tagging = Some(mgr);
805        self
806    }
807
808    /// v0.6 #39: borrow the attached tagging manager (test /
809    /// introspection — the snapshotter in `main.rs`, when wired,
810    /// will keep its own `Arc` clone).
811    #[must_use]
812    pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
813        self.tagging.as_ref()
814    }
815
816    /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
817    /// `put_bucket_inventory_configuration` /
818    /// `get_bucket_inventory_configuration` /
819    /// `list_bucket_inventory_configurations` /
820    /// `delete_bucket_inventory_configuration` route through the
821    /// manager. The actual periodic CSV / manifest emission is
822    /// orchestrated by a tokio task started in `main.rs`; the manager
823    /// itself is shared between the handler and the scheduler via
824    /// `Arc`.
825    #[must_use]
826    pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
827        self.inventory = Some(mgr);
828        self
829    }
830
831    /// v0.6 #36: borrow the attached inventory manager (test /
832    /// introspection — the background scheduler in `main.rs` keeps its
833    /// own `Arc` clone, so this accessor is for the test path that
834    /// invokes `run_once_for_test` directly).
835    #[must_use]
836    pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
837        self.inventory.as_ref()
838    }
839
840    /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
841    /// manager. Once set, `put_bucket_lifecycle_configuration` /
842    /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
843    /// route through the manager (replacing the previous backend-
844    /// passthrough behaviour). The actual periodic scanner that walks
845    /// the source bucket and invokes Expiration / Transition /
846    /// NoncurrentExpiration actions is a v0.7+ follow-up — see
847    /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
848    /// path that exercises the evaluator end-to-end.
849    #[must_use]
850    pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
851        self.lifecycle = Some(mgr);
852        self
853    }
854
855    /// v0.6 #37: borrow the attached lifecycle manager (test /
856    /// introspection — the background scheduler in `main.rs` keeps its
857    /// own `Arc` clone, so this accessor is for the test path that
858    /// invokes the evaluator directly).
859    #[must_use]
860    pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
861        self.lifecycle.as_ref()
862    }
863
864    /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
865    /// against a caller-provided list of `(key, age, size, tags)` tuples
866    /// and returns the `(key, action)` pairs that should fire. The actual
867    /// backend invocation (S3.delete_object / metadata rewrite) is left
868    /// to the caller — the unit + E2E tests use this to verify the
869    /// evaluator without spawning the (deferred) background scanner.
870    /// Returns an empty `Vec` when no lifecycle manager is attached or
871    /// no rule matches.
872    #[must_use]
873    pub fn run_lifecycle_once_for_test(
874        &self,
875        bucket: &str,
876        objects: &[crate::lifecycle::EvaluateBatchEntry],
877    ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
878        let Some(mgr) = self.lifecycle.as_ref() else {
879            return Vec::new();
880        };
881        crate::lifecycle::evaluate_batch(mgr, bucket, objects)
882    }
883
884    /// v0.6 #35: attach the in-memory bucket-notification manager. Once
885    /// set, `put_bucket_notification_configuration` /
886    /// `get_bucket_notification_configuration` route through the manager
887    /// (replacing the previous backend-passthrough behaviour); successful
888    /// `put_object` / `delete_object` calls fire matching destinations
889    /// on a detached tokio task via
890    /// `crate::notifications::dispatch_event` (best-effort, fire-and-
891    /// forget — failures bump the manager's `dropped_total` counter and
892    /// log at warn but do NOT fail the originating S3 request).
893    #[must_use]
894    pub fn with_notifications(
895        mut self,
896        mgr: Arc<crate::notifications::NotificationManager>,
897    ) -> Self {
898        self.notifications = Some(mgr);
899        self
900    }
901
902    /// v0.6 #35: borrow the attached notifications manager (test /
903    /// introspection — used by the metrics layer to read
904    /// `dropped_total`).
905    #[must_use]
906    pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
907        self.notifications.as_ref()
908    }
909
910    /// v0.6 #35: internal helper used by the DELETE handlers to fire a
911    /// matching notification on a detached tokio task. No-op when no
912    /// manager is attached or no rule on the bucket matches the given
913    /// (event, key) tuple.
914    fn fire_delete_notification(
915        &self,
916        bucket: &str,
917        key: &str,
918        event: crate::notifications::EventType,
919        version_id: Option<String>,
920    ) {
921        let Some(mgr) = self.notifications.as_ref() else {
922            return;
923        };
924        let dests = mgr.match_destinations(bucket, &event, key);
925        if dests.is_empty() {
926            return;
927        }
928        tokio::spawn(crate::notifications::dispatch_event(
929            Arc::clone(mgr),
930            bucket.to_owned(),
931            key.to_owned(),
932            event,
933            None,
934            None,
935            version_id,
936            format!("S4-{}", uuid::Uuid::new_v4()),
937        ));
938    }
939
940    /// v0.6 #40: attach the in-memory cross-bucket replication manager.
941    /// Once set, `put_bucket_replication` / `get_bucket_replication` /
942    /// `delete_bucket_replication` route through the manager (replacing
943    /// the previous backend-passthrough behaviour); a successful
944    /// `put_object` whose key matches an enabled rule fires a detached
945    /// tokio task that PUTs the same body + metadata to the rule's
946    /// destination bucket, stamping the replica with
947    /// `x-amz-replication-status: REPLICA`. Failures after the retry
948    /// budget bump the manager's `dropped_total` counter and are
949    /// surfaced in the `s4_replication_dropped_total` Prometheus
950    /// counter; successes bump `s4_replication_replicated_total`.
951    #[must_use]
952    pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
953        self.replication = Some(mgr);
954        self
955    }
956
957    /// v0.6 #40: borrow the attached replication manager (test /
958    /// introspection — used by the metrics layer to read
959    /// `dropped_total`).
960    #[must_use]
961    pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
962        self.replication.as_ref()
963    }
964
965    /// v0.6 #40: internal helper used by the PUT handlers to fire a
966    /// detached cross-bucket replication task. No-op when no manager
967    /// is attached, the source backend PUT failed, or no rule on the
968    /// source bucket matches the (key, tags) tuple. The `body` is the
969    /// post-compression / post-encryption `Bytes` that was sent to
970    /// the source backend (refcount-cloned), and `metadata` is the
971    /// metadata map that already includes the manifest /
972    /// `s4-encrypted` markers — the replica decodes through the same
973    /// path. The destination PUT runs through `Arc<B>::put_object`.
974    ///
975    /// ## v0.8.2 #61: generation token + shadow-key destination
976    ///
977    /// `pending_version` is the source-side `PutOutcome` minted by the
978    /// caller's versioning branch (or `None` for unversioned /
979    /// suspended buckets). When `pending_version.versioned_response`
980    /// is `true`, the dispatcher writes the destination under the same
981    /// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
982    /// destination's version chain receives the new version the same
983    /// way `?versionId=` GET resolves it. Closes audit C-1.
984    ///
985    /// The dispatcher also mints a fresh `generation` token before
986    /// spawning, threaded through to [`crate::replication::
987    /// replicate_object`]. Closes audit C-3 — a stale retry of an
988    /// older PUT can no longer overwrite the destination's newer bytes
989    /// because the CAS guard sees the higher stored generation and
990    /// drops its destination write.
991    ///
992    /// ## Asymmetric versioning policy (out of scope)
993    ///
994    /// We assume source + destination buckets share the same
995    /// versioning policy (both Enabled or both Suspended /
996    /// Unversioned). Cross-bucket policy queries would require a
997    /// backend round-trip per replication, which is not worth it for
998    /// the single-instance scope. Operators who configure asymmetric
999    /// versioning will see destination-side `?versionId=` lookups
1000    /// miss — documented as out-of-scope until a future per-rule
1001    /// `destination_versioning_policy` knob lands.
1002    // 8 args is the post-#61 shape: replication needs the
1003    // source bucket+key, the canonical tag set for rule-matching,
1004    // the post-codec body+metadata for the destination PUT, the
1005    // backend-success gate, and the pending version-id for the
1006    // shadow-key destination override. A shape struct would just
1007    // split the (single) call site so opt for the inline form.
1008    #[allow(clippy::too_many_arguments)]
1009    fn spawn_replication_if_matched(
1010        &self,
1011        source_bucket: &str,
1012        source_key: &str,
1013        request_tags: &Option<crate::tagging::TagSet>,
1014        body: &bytes::Bytes,
1015        metadata: &Option<std::collections::HashMap<String, String>>,
1016        backend_ok: bool,
1017        pending_version: Option<&crate::versioning::PutOutcome>,
1018    ) where
1019        B: Send + Sync + 'static,
1020    {
1021        if !backend_ok {
1022            return;
1023        }
1024        let Some(mgr) = self.replication.as_ref() else {
1025            return;
1026        };
1027        // Pull the request's tags into the (k, v) shape the matcher
1028        // expects. The tagging manager would have the canonical
1029        // post-PUT view but at this point in the pipeline it's
1030        // already been written above; for the rule-match decision
1031        // the request's tags are sufficient (= the tags this PUT
1032        // applies, S3 PutObject is full-replace on tags).
1033        let object_tags: Vec<(String, String)> = request_tags
1034            .as_ref()
1035            .map(|ts| ts.iter().cloned().collect())
1036            .unwrap_or_default();
1037        let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
1038            return;
1039        };
1040        // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
1041        // Pending stamp so the stamp itself carries the right
1042        // generation (the CAS in `record_status_if_newer` would
1043        // otherwise see a `generation=0` Pending and accept any
1044        // stale retry).
1045        let generation = mgr.next_generation();
1046        // Eagerly mark the source key as Pending so a HEAD between
1047        // the source PUT returning and the spawned task completing
1048        // surfaces the in-flight state. CAS-guarded so a slower
1049        // older PUT can't downgrade a newer Completed back to Pending.
1050        let _ = mgr.record_status_if_newer(
1051            source_bucket,
1052            source_key,
1053            generation,
1054            crate::replication::ReplicationStatus::Pending,
1055        );
1056        // v0.8.2 #61: derive the destination storage key. For a
1057        // versioning-Enabled source the destination receives the
1058        // same shadow-key path so a `?versionId=<vid>` GET on the
1059        // destination resolves through the same lookup the source
1060        // uses. Suspended / Unversioned sources keep the logical
1061        // key (= `None` override = dispatcher uses `source_key`).
1062        let destination_key_override = pending_version
1063            .filter(|pv| pv.versioned_response)
1064            .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
1065        // v0.8.3 #68 (audit M-1): capture the source object's Object
1066        // Lock state so the dispatcher can decorate the destination
1067        // PUT with the matching AWS-wire lock headers. Without this,
1068        // a Compliance / Governance / legal-hold protected source
1069        // would replicate to a destination where DELETE succeeds
1070        // (the WORM posture would only hold on the source).
1071        let source_lock_state = self
1072            .object_lock
1073            .as_ref()
1074            .and_then(|mgr| mgr.get(source_bucket, source_key));
1075        // v0.8.3 #68: hand the destination-side ObjectLockManager to
1076        // the dispatcher closure so we can persist the propagated
1077        // lock state on successful destination PUT (the destination
1078        // PUT below bypasses S4Service::put_object — we drive the
1079        // backend directly — so the explicit_lock_mode commit block
1080        // in put_object never fires for replicas. We replay it here
1081        // against the destination key.)
1082        let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
1083        let mgr_cl = Arc::clone(mgr);
1084        let backend = Arc::clone(&self.backend);
1085        let body_cl = body.clone();
1086        let metadata_cl = metadata.clone();
1087        let source_bucket_cl = source_bucket.to_owned();
1088        let source_key_cl = source_key.to_owned();
1089        let source_lock_state_for_closure = source_lock_state.clone();
1090        let source_bucket_for_warn = source_bucket.to_owned();
1091        // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
1092        // depth. Acquire happens INSIDE the spawned task (not on the
1093        // listener path) so a saturated semaphore back-pressures the
1094        // dispatcher pool without stalling the source PUT response —
1095        // the source has already returned 200 to the client by the time
1096        // the spawn body runs. A failed `acquire_owned` only happens
1097        // when the semaphore is closed (we never close it, so the
1098        // logged-and-skipped fallback is unreachable in practice).
1099        let semaphore = Arc::clone(&self.replication_semaphore);
1100        tokio::spawn(async move {
1101            let _permit = match semaphore.acquire_owned().await {
1102                Ok(p) => p,
1103                Err(e) => {
1104                    tracing::warn!(
1105                        bucket = %source_bucket_cl,
1106                        key = %source_key_cl,
1107                        "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
1108                    );
1109                    return;
1110                }
1111            };
1112            let do_put = move |dest_bucket: String,
1113                               dest_key: String,
1114                               dest_body: bytes::Bytes,
1115                               dest_meta: Option<std::collections::HashMap<String, String>>| {
1116                let backend = Arc::clone(&backend);
1117                let dest_lock_mgr = dest_lock_mgr.clone();
1118                let lock_state = source_lock_state_for_closure.clone();
1119                let warn_src = source_bucket_for_warn.clone();
1120                async move {
1121                    let req = S3Request {
1122                        input: PutObjectInput {
1123                            bucket: dest_bucket.clone(),
1124                            key: dest_key.clone(),
1125                            body: Some(bytes_to_blob(dest_body)),
1126                            metadata: dest_meta,
1127                            ..Default::default()
1128                        },
1129                        method: http::Method::PUT,
1130                        uri: "/".parse().unwrap(),
1131                        headers: http::HeaderMap::new(),
1132                        extensions: http::Extensions::new(),
1133                        credentials: None,
1134                        region: None,
1135                        service: None,
1136                        trailing_headers: None,
1137                    };
1138                    let put_result = backend
1139                        .put_object(req)
1140                        .await
1141                        .map(|_| ())
1142                        .map_err(|e| format!("destination put_object: {e}"));
1143                    // v0.8.3 #68: on successful destination PUT,
1144                    // persist the propagated lock state into the
1145                    // destination's ObjectLockManager so a subsequent
1146                    // DELETE on the destination is refused. Three cases:
1147                    //   - PUT failed     → skip (no replica to protect)
1148                    //   - lock_state None → nothing to propagate
1149                    //   - dest manager None (operator misconfig)
1150                    //                     → log warn-once + bump skip metric
1151                    if put_result.is_ok()
1152                        && let Some(state) = lock_state
1153                    {
1154                        match dest_lock_mgr {
1155                            Some(ref mgr) => {
1156                                mgr.set(&dest_bucket, &dest_key, state);
1157                            }
1158                            None => {
1159                                crate::replication::warn_lock_propagation_skipped(
1160                                    &warn_src,
1161                                    &dest_bucket,
1162                                );
1163                            }
1164                        }
1165                    }
1166                    put_result
1167                }
1168            };
1169            // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
1170            // `futures::FutureExt::catch_unwind` so a panic inside
1171            // `replicate_object` (or any of the user-supplied closures
1172            // it drives — `do_put`, the destination backend, the lock
1173            // manager) does NOT bubble out of the detached task as a
1174            // `JoinError` that no operator dashboard scrapes. Caught
1175            // panics bump `s4_dispatcher_panics_total{kind="replication"}`
1176            // + log at ERROR with the panic payload, so silent feature
1177            // degradation (= every replication PUT panicking and
1178            // dropping the replica without any visible signal) becomes
1179            // a first-class metric the operator can alert on.
1180            //
1181            // `AssertUnwindSafe` is required because the inner future
1182            // captures `Arc<...>` clones + a `do_put` closure that are
1183            // not `UnwindSafe` by default; the safety contract here is
1184            // "we don't continue using any of those captures after the
1185            // panic" which trivially holds (we drop them and return).
1186            use futures::FutureExt as _;
1187            let dispatcher_kind = "replication";
1188            let fut = crate::replication::replicate_object(
1189                rule,
1190                source_bucket_cl,
1191                source_key_cl,
1192                body_cl,
1193                metadata_cl,
1194                do_put,
1195                mgr_cl,
1196                generation,
1197                destination_key_override,
1198                source_lock_state,
1199            );
1200            if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
1201                let panic_msg = panic
1202                    .downcast_ref::<&'static str>()
1203                    .copied()
1204                    .map(str::to_owned)
1205                    .or_else(|| panic.downcast_ref::<String>().cloned())
1206                    .unwrap_or_else(|| "(non-string panic payload)".to_owned());
1207                tracing::error!(
1208                    kind = dispatcher_kind,
1209                    panic_payload = %panic_msg,
1210                    "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
1211                );
1212                crate::metrics::record_dispatcher_panic(dispatcher_kind);
1213            }
1214        });
1215    }
1216
1217    /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1218    /// Once set, every DELETE / DELETE-version / delete-marker /
1219    /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1220    /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1221    /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1222    /// where MFA-Delete is `Disabled` (S3 default).
1223    #[must_use]
1224    pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1225        self.mfa_delete = Some(mgr);
1226        self
1227    }
1228
1229    /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1230    /// introspection — used by the snapshot path in `main.rs` to call
1231    /// `to_json` for restart-recoverable state).
1232    #[must_use]
1233    pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1234        self.mfa_delete.as_ref()
1235    }
1236
1237    /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1238    /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1239    /// route through the manager instead of forwarding to the backend,
1240    /// and [`Self::handle_preflight`] becomes useful for the (future)
1241    /// listener-side OPTIONS interceptor.
1242    #[must_use]
1243    pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1244        self.cors = Some(mgr);
1245        self
1246    }
1247
1248    /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1249    #[must_use]
1250    pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1251        self.cors.as_ref()
1252    }
1253
1254    /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1255    /// configured rules and, if a rule matches, return the headers that
1256    /// the (future) listener-side OPTIONS interceptor must put on the
1257    /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1258    /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1259    /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1260    ///
1261    /// Returns `None` when no manager is attached, no config is
1262    /// registered for the bucket, or no rule matches the (origin,
1263    /// method, headers) triple. The caller is responsible for turning
1264    /// `None` into the appropriate 403 response.
1265    ///
1266    /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1267    /// into the hyper-util listener path) is a follow-up — s3s does not
1268    /// surface OPTIONS as a typed S3 handler, so this method is
1269    /// currently call-able only from inside other handlers and tests.
1270    #[must_use]
1271    pub fn handle_preflight(
1272        &self,
1273        bucket: &str,
1274        origin: &str,
1275        method: &str,
1276        request_headers: &[String],
1277    ) -> Option<std::collections::HashMap<String, String>> {
1278        let mgr = self.cors.as_ref()?;
1279        let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1280        let mut h = std::collections::HashMap::new();
1281        // Echo the matched origin back. If the rule used "*" we still
1282        // echo "*" (S3 spec — the spec does not require us to echo the
1283        // *requesting* origin when the wildcard matched).
1284        let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1285            "*".to_string()
1286        } else {
1287            origin.to_string()
1288        };
1289        h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1290        h.insert(
1291            "Access-Control-Allow-Methods".to_string(),
1292            rule.allowed_methods.join(", "),
1293        );
1294        if !rule.allowed_headers.is_empty() {
1295            // For the Allow-Headers response, echo back the rule's
1296            // pattern list verbatim (S3 echoes the configured list,
1297            // including "*" if present). Browsers honour exact-match
1298            // rules.
1299            h.insert(
1300                "Access-Control-Allow-Headers".to_string(),
1301                rule.allowed_headers.join(", "),
1302            );
1303        }
1304        if let Some(secs) = rule.max_age_seconds {
1305            h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1306        }
1307        if !rule.expose_headers.is_empty() {
1308            h.insert(
1309                "Access-Control-Expose-Headers".to_string(),
1310                rule.expose_headers.join(", "),
1311            );
1312        }
1313        Some(h)
1314    }
1315
1316    /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1317    /// SSE indicator (server-side encryption header or SSE-C customer
1318    /// key); requests without one are rejected with 400 InvalidRequest.
1319    /// Boot-time prerequisite checking lives in the binary
1320    /// (`validate_compliance_mode`) so this flag is purely the runtime
1321    /// switch.
1322    #[must_use]
1323    pub fn with_compliance_strict(mut self, on: bool) -> Self {
1324        self.compliance_strict = on;
1325        self
1326    }
1327
1328    /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1329    /// manager. Once set, `delete_object` and overwrite-path
1330    /// `put_object` refuse operations on locked keys with HTTP 403
1331    /// `AccessDenied`; new PUTs to a bucket with a default retention
1332    /// policy auto-create per-object lock state.
1333    #[must_use]
1334    pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1335        self.object_lock = Some(mgr);
1336        self
1337    }
1338
1339    /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1340    /// the lifecycle scanner uses this to skip currently-locked objects
1341    /// before issuing `delete_object`, since an Object Lock always wins
1342    /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1343    /// shape of [`Self::lifecycle_manager`] /
1344    /// [`Self::tag_manager`] — purely additive accessor, no handler
1345    /// behaviour change.
1346    #[must_use]
1347    pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1348        self.object_lock.as_ref()
1349    }
1350
1351    /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1352    /// when a PUT requests SSE-KMS without naming a specific KMS key
1353    /// (operators set this to mirror AWS S3's bucket-default key).
1354    #[must_use]
1355    pub fn with_kms_backend(
1356        mut self,
1357        kms: Arc<dyn crate::kms::KmsBackend>,
1358        default_key_id: Option<String>,
1359    ) -> Self {
1360        self.kms = Some(kms);
1361        self.kms_default_key_id = default_key_id;
1362        self
1363    }
1364
1365    /// v0.5 #34: attach the first-class versioning state machine. Once
1366    /// set, this `S4Service` owns the per-bucket versioning state +
1367    /// per-(bucket, key) version chain; `put_object` / `get_object` /
1368    /// `delete_object` / `list_object_versions` /
1369    /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1370    /// manager instead of passing through to the backend. The backend
1371    /// is still used as the byte store: Suspended / Unversioned buckets
1372    /// keep using `<key>` directly (legacy), Enabled buckets redirect
1373    /// each version's bytes to a shadow key
1374    /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1375    /// PUTs to the same logical key.
1376    #[must_use]
1377    pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1378        self.versioning = Some(mgr);
1379        self
1380    }
1381
1382    /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1383    /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1384    /// in-memory state to the operator's `--versioning-state-file`
1385    /// without restarting the gateway. Mirrors the shape of
1386    /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1387    /// purely additive accessor, no handler behaviour change.
1388    #[must_use]
1389    pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1390        self.versioning.as_ref()
1391    }
1392
1393    /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1394    /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1395    /// CLI flag in `main.rs`. Operators running heavy cross-region
1396    /// fan-out may need to raise this; operators on memory-constrained
1397    /// hosts may need to lower it. The new value replaces the existing
1398    /// `Semaphore` (so calling this after dispatchers are already in
1399    /// flight is fine — the in-flight tasks hold permits from the old
1400    /// semaphore which is dropped when its last permit is released).
1401    /// A `max` of 0 would deadlock all replicas; the value is silently
1402    /// clamped to 1 instead.
1403    #[must_use]
1404    pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1405        let max = max.max(1);
1406        self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1407        self
1408    }
1409
1410    /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1411    /// concurrency permit pool. Tests inspect `available_permits()`
1412    /// after invoking `spawn_replication_if_matched` to verify the
1413    /// dispatcher actually `acquire_owned`s before kicking off the
1414    /// destination PUT.
1415    #[must_use]
1416    pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1417        &self.replication_semaphore
1418    }
1419
1420    /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1421    /// Internally wraps it in a 1-slot keyring with id=1 active, so
1422    /// new objects ride the v0.5 S4E2 frame while previously-written
1423    /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1424    /// fallback path. Operators wanting true rotation should call
1425    /// [`Self::with_sse_keyring`] instead.
1426    #[must_use]
1427    pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1428        let keyring = crate::sse::SseKeyring::new(1, key);
1429        self.sse_keyring = Some(std::sync::Arc::new(keyring));
1430        self
1431    }
1432
1433    /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1434    /// the active key (S4E2 frame stamped with that key's id); GET
1435    /// dispatches on the body's magic — S4E1 falls back to trying every
1436    /// key in the ring (active first) so v0.4 objects survive a
1437    /// migration; S4E2 looks up the explicit key_id from the header.
1438    #[must_use]
1439    pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1440        self.sse_keyring = Some(keyring);
1441        self
1442    }
1443
1444    /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1445    /// (so the matching GET can stream-decrypt chunk-by-chunk
1446    /// instead of buffering the entire body before tag verify).
1447    /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1448    /// disables the path and reverts to the legacy S4E2 buffered
1449    /// frame.
1450    ///
1451    /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1452    /// the chunked envelopes for those flows are a follow-up issue
1453    /// (the customer-key wire surface needs separate version
1454    /// negotiation).
1455    ///
1456    /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1457    /// not also set — the chunked path runs only on the SSE-S4
1458    /// branch of `put_object`.
1459    #[must_use]
1460    pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1461        self.sse_chunk_size = bytes;
1462        self
1463    }
1464
1465    /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1466    /// PUT / GET / DELETE / List handler emits one entry into the
1467    /// emitter's buffer; a background flusher (started separately, see
1468    /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1469    /// rotated `.log` files into the configured directory.
1470    #[must_use]
1471    pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1472        self.access_log = Some(log);
1473        self
1474    }
1475
1476    /// Capture the per-request access-log preamble before the request is
1477    /// consumed by the backend call. Returns `None` if no access logger
1478    /// is configured (cheap early-out so the handler doesn't pay the
1479    /// header-clone cost when access logging is off).
1480    fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1481        self.access_log.as_ref()?;
1482        Some(AccessLogPreamble {
1483            // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1484            // Recording a client-controllable header in the access log
1485            // would poison forensic queries; leave it `None` until the
1486            // operator declares X-Forwarded-For is set by a trusted
1487            // proxy.
1488            remote_ip: if self.trust_x_forwarded_for {
1489                req.headers
1490                    .get("x-forwarded-for")
1491                    .and_then(|v| v.to_str().ok())
1492                    .and_then(|raw| raw.split(',').next())
1493                    .map(|s| s.trim().to_owned())
1494            } else {
1495                None
1496            },
1497            requester: Self::principal_of(req).map(str::to_owned),
1498            request_uri: format!("{} {}", req.method, req.uri.path()),
1499            user_agent: req
1500                .headers
1501                .get("user-agent")
1502                .and_then(|v| v.to_str().ok())
1503                .map(str::to_owned),
1504        })
1505    }
1506
1507    /// Internal — called by handlers at end-of-request with a captured
1508    /// preamble. Best-effort: swallows the await fast (clones Arc +
1509    /// pushes), no error propagation back to the request path.
1510    #[allow(clippy::too_many_arguments)]
1511    async fn record_access(
1512        &self,
1513        preamble: Option<AccessLogPreamble>,
1514        operation: &'static str,
1515        bucket: &str,
1516        key: Option<&str>,
1517        http_status: u16,
1518        bytes_sent: u64,
1519        object_size: u64,
1520        total_time_ms: u64,
1521        error_code: Option<&str>,
1522    ) {
1523        let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1524            return;
1525        };
1526        log.record(crate::access_log::AccessLogEntry {
1527            time: std::time::SystemTime::now(),
1528            bucket: bucket.to_owned(),
1529            remote_ip: p.remote_ip,
1530            requester: p.requester,
1531            operation,
1532            key: key.map(str::to_owned),
1533            request_uri: p.request_uri,
1534            http_status,
1535            error_code: error_code.map(str::to_owned),
1536            bytes_sent,
1537            object_size,
1538            total_time_ms,
1539            user_agent: p.user_agent,
1540        })
1541        .await;
1542    }
1543
1544    /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1545    /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1546    /// throttle-checked before the policy gate; throttled requests return
1547    /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1548    /// `s4_rate_limit_throttled_total{principal,bucket}`.
1549    #[must_use]
1550    pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1551        self.rate_limits = Some(rl);
1552        self
1553    }
1554
1555    /// Helper used by request handlers to apply the rate limit. Returns
1556    /// `Ok(())` when allowed (or no rate limiter is configured), or a
1557    /// `SlowDown` S3Error otherwise.
1558    fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1559        let Some(rl) = self.rate_limits.as_ref() else {
1560            return Ok(());
1561        };
1562        let principal_id = Self::principal_of(req);
1563        if !rl.check(principal_id, bucket) {
1564            crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1565            return Err(S3Error::with_message(
1566                S3ErrorCode::SlowDown,
1567                format!("rate-limited: bucket={bucket}"),
1568            ));
1569        }
1570        Ok(())
1571    }
1572
1573    /// Tell the policy evaluator that the listener is reached over TLS
1574    /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1575    /// resolves to `true`. Defaults to `false`.
1576    #[must_use]
1577    pub fn with_secure_transport(mut self, on: bool) -> Self {
1578        self.secure_transport = on;
1579        self
1580    }
1581
1582    #[must_use]
1583    pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1584        self.max_body_bytes = n;
1585        self
1586    }
1587
1588    /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1589    /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1590    /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1591    /// When `None` (the default), no policy enforcement happens.
1592    #[must_use]
1593    pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1594        self.policy = Some(policy);
1595        self
1596    }
1597
1598    /// Pull the SigV4 access key id off the request's credentials, if any.
1599    /// Used as the `principal_id` for policy evaluation.
1600    fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1601        req.credentials.as_ref().map(|c| c.access_key.as_str())
1602    }
1603
1604    /// v0.8.17 G-2: shared reserved-name guard used by every per-object
1605    /// API handler. `mode` chooses the AWS error shape: `Mutating`
1606    /// (PUT / Copy / DELETE / Tagging-write) returns
1607    /// `InvalidObjectName`; `Read` (GET / HEAD / Attributes / Tagging-read)
1608    /// returns `NoSuchKey` so a curious client gets the same response
1609    /// the listing filter has been giving them since v0.8.12 (the
1610    /// sidecar is invisible to list).
1611    ///
1612    /// v0.8.17 G-4: when `--allow-legacy-reserved-key-reads` is set
1613    /// AND the call is a `Read`, the guard returns `Ok(())` so
1614    /// operators upgrading from pre-v0.8.15 deployments can still
1615    /// access (and migrate off) any user-owned `<key>.s4index`
1616    /// objects that landed before M-1 / F-13 closed the namespace.
1617    /// Mutating operations stay blocked regardless of the flag —
1618    /// the flag is a read-only migration aid, not an injection
1619    /// re-opener.
1620    fn check_not_reserved_key(&self, key: &str, mode: ReservedKeyMode) -> S3Result<()> {
1621        if !s4_codec::index::is_reserved_sidecar_key(key) {
1622            return Ok(());
1623        }
1624        if matches!(mode, ReservedKeyMode::Read) && self.allow_legacy_reserved_key_reads {
1625            return Ok(());
1626        }
1627        match mode {
1628            ReservedKeyMode::Read => Err(S3Error::with_message(
1629                S3ErrorCode::NoSuchKey,
1630                format!("object key {key:?} is reserved for S4 internal sidecars"),
1631            )),
1632            ReservedKeyMode::Mutating => {
1633                let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
1634                    .unwrap_or(S3ErrorCode::InvalidArgument);
1635                Err(S3Error::with_message(
1636                    code,
1637                    format!(
1638                        "object key {key:?} is reserved (suffix `{}` is used for S4 internal \
1639                         sidecars)",
1640                        s4_codec::index::SIDECAR_SUFFIX,
1641                    ),
1642                ))
1643            }
1644        }
1645    }
1646
1647    /// v0.3 #13: build the per-request policy context from the incoming
1648    /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1649    /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1650    /// production deployments are behind an LB / reverse proxy that sets
1651    /// this), `aws:CurrentTime` from the system clock, and
1652    /// `aws:SecureTransport` from the per-listener TLS flag.
1653    fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1654        let user_agent = req
1655            .headers
1656            .get("user-agent")
1657            .and_then(|v| v.to_str().ok())
1658            .map(str::to_owned);
1659        // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1660        // header. Trusting it unconditionally lets any public-internet
1661        // request claim it came from a trusted CIDR (e.g.
1662        // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1663        // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1664        // We now only consume the header when the operator has
1665        // declared "this gateway sits behind a trusted reverse proxy
1666        // that scrubs client-supplied values" via
1667        // `with_trust_x_forwarded_for(true)` /
1668        // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1669        // `None`, which fails closed for IP-allowlist Allow rules
1670        // and fails open for IP-blocklist Deny rules — operators
1671        // who need either case behind a public listener must opt in
1672        // or move the gate to the reverse proxy. The leftmost
1673        // comma-separated token is the originator per the
1674        // `X-Forwarded-For: client, proxy1, proxy2` convention.
1675        let source_ip = if self.trust_x_forwarded_for {
1676            req.headers
1677                .get("x-forwarded-for")
1678                .and_then(|v| v.to_str().ok())
1679                .and_then(|raw| raw.split(',').next())
1680                .and_then(|s| s.trim().parse().ok())
1681        } else {
1682            None
1683        };
1684        crate::policy::RequestContext {
1685            source_ip,
1686            user_agent,
1687            request_time: Some(std::time::SystemTime::now()),
1688            secure_transport: self.secure_transport,
1689            existing_object_tags: None,
1690            request_object_tags: None,
1691            extra: Default::default(),
1692        }
1693    }
1694
1695    /// Helper used by request handlers to enforce the optional policy.
1696    /// Returns `Ok(())` when allowed (or no policy is configured), or an
1697    /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1698    /// counter on deny.
1699    fn enforce_policy<I>(
1700        &self,
1701        req: &S3Request<I>,
1702        action: &'static str,
1703        bucket: &str,
1704        key: Option<&str>,
1705    ) -> S3Result<()> {
1706        self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1707    }
1708
1709    /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1710    /// caller plumb tag context (existing-on-object + on-request) into
1711    /// the policy evaluator. Both arguments default to `None`, in
1712    /// which case the resulting `RequestContext` is identical to
1713    /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1714    /// with tags this is a transparent no-op.
1715    fn enforce_policy_with_extra<I>(
1716        &self,
1717        req: &S3Request<I>,
1718        action: &'static str,
1719        bucket: &str,
1720        key: Option<&str>,
1721        request_tags: Option<&crate::tagging::TagSet>,
1722        existing_tags: Option<&crate::tagging::TagSet>,
1723    ) -> S3Result<()> {
1724        let Some(policy) = self.policy.as_ref() else {
1725            return Ok(());
1726        };
1727        let principal_id = Self::principal_of(req);
1728        let mut ctx = self.request_context(req);
1729        if let Some(t) = request_tags {
1730            ctx.request_object_tags = Some(t.clone());
1731        }
1732        if let Some(t) = existing_tags {
1733            ctx.existing_object_tags = Some(t.clone());
1734        }
1735        let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1736        if decision.allow {
1737            Ok(())
1738        } else {
1739            crate::metrics::record_policy_denial(action, bucket);
1740            tracing::info!(
1741                action,
1742                bucket,
1743                key = ?key,
1744                principal = ?principal_id,
1745                source_ip = ?ctx.source_ip,
1746                user_agent = ?ctx.user_agent,
1747                secure_transport = ctx.secure_transport,
1748                matched_sid = ?decision.matched_sid,
1749                effect = ?decision.matched_effect,
1750                "S4 policy denied request"
1751            );
1752            Err(S3Error::with_message(
1753                S3ErrorCode::AccessDenied,
1754                format!("denied by S4 policy: {action} on bucket={bucket}"),
1755            ))
1756        }
1757    }
1758
1759    /// テスト用: backend を取り戻す (test helper、production では使わない).
1760    /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1761    /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1762    /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1763    /// (test 用途専用 helper の caller 契約を維持)。
1764    pub fn into_backend(self) -> B {
1765        Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1766            panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1767        })
1768    }
1769
1770    /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1771    /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1772    async fn partial_range_get(
1773        &self,
1774        req: &S3Request<GetObjectInput>,
1775        plan: s4_codec::index::RangePlan,
1776        client_start: u64,
1777        client_end_exclusive: u64,
1778        total_original: u64,
1779        get_start: Instant,
1780    ) -> S3Result<S3Response<GetObjectOutput>> {
1781        // 必要 byte 範囲だけを backend に partial GET
1782        let backend_range = s3s::dto::Range::Int {
1783            first: plan.byte_start,
1784            last: Some(plan.byte_end_exclusive - 1),
1785        };
1786        let backend_input = GetObjectInput {
1787            bucket: req.input.bucket.clone(),
1788            key: req.input.key.clone(),
1789            range: Some(backend_range),
1790            ..Default::default()
1791        };
1792        let backend_req = S3Request {
1793            input: backend_input,
1794            method: req.method.clone(),
1795            uri: req.uri.clone(),
1796            headers: req.headers.clone(),
1797            extensions: http::Extensions::new(),
1798            credentials: req.credentials.clone(),
1799            region: req.region.clone(),
1800            service: req.service.clone(),
1801            trailing_headers: None,
1802        };
1803        let mut backend_resp = self.backend.get_object(backend_req).await?;
1804        let blob = backend_resp.output.body.take().ok_or_else(|| {
1805            S3Error::with_message(
1806                S3ErrorCode::InternalError,
1807                "backend partial GET returned empty body",
1808            )
1809        })?;
1810        let bytes = collect_blob(blob, self.max_body_bytes)
1811            .await
1812            .map_err(internal("collect partial body"))?;
1813
1814        // frame parse + decompress
1815        let mut combined = BytesMut::new();
1816        for frame in FrameIter::new(bytes) {
1817            let (header, payload) = frame.map_err(|e| {
1818                S3Error::with_message(
1819                    S3ErrorCode::InternalError,
1820                    format!("partial-range frame parse: {e}"),
1821                )
1822            })?;
1823            let chunk_manifest = ChunkManifest {
1824                codec: header.codec,
1825                original_size: header.original_size,
1826                compressed_size: header.compressed_size,
1827                crc32c: header.crc32c,
1828            };
1829            let decompressed = self
1830                .registry
1831                .decompress(payload, &chunk_manifest)
1832                .await
1833                .map_err(internal("partial-range decompress"))?;
1834            combined.extend_from_slice(&decompressed);
1835        }
1836        let combined = combined.freeze();
1837        let sliced = combined
1838            .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1839
1840        // response 組立て
1841        let returned_size = sliced.len() as u64;
1842        backend_resp.output.content_length = Some(returned_size as i64);
1843        backend_resp.output.content_range = Some(format!(
1844            "bytes {client_start}-{}/{total_original}",
1845            client_end_exclusive - 1
1846        ));
1847        backend_resp.output.checksum_crc32 = None;
1848        backend_resp.output.checksum_crc32c = None;
1849        backend_resp.output.checksum_crc64nvme = None;
1850        backend_resp.output.checksum_sha1 = None;
1851        backend_resp.output.checksum_sha256 = None;
1852        backend_resp.output.e_tag = None;
1853        backend_resp.output.body = Some(bytes_to_blob(sliced));
1854        backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1855
1856        let elapsed = get_start.elapsed();
1857        crate::metrics::record_get(
1858            "partial",
1859            plan.byte_end_exclusive - plan.byte_start,
1860            returned_size,
1861            elapsed.as_secs_f64(),
1862            true,
1863        );
1864        info!(
1865            op = "get_object",
1866            bucket = %req.input.bucket,
1867            key = %req.input.key,
1868            bytes_in = plan.byte_end_exclusive - plan.byte_start,
1869            bytes_out = returned_size,
1870            total_object_size = total_original,
1871            range = true,
1872            path = "sidecar-partial",
1873            latency_ms = elapsed.as_millis() as u64,
1874            "S4 partial Range GET via sidecar index"
1875        );
1876        Ok(backend_resp)
1877    }
1878
1879    /// v0.9 #106: SSE-S4 chunked (S4E6) encryption-aware partial
1880    /// Range GET. The sidecar carries an [`s4_codec::index::SseChunkBinding`]
1881    /// (salt + key_id + chunk geometry) that lets us:
1882    ///
1883    /// 1. Map the [`s4_codec::index::RangePlan`]'s pre-encrypt byte range
1884    ///    to an encrypted chunk-range via
1885    ///    [`FrameIndex::encrypted_lookup`].
1886    /// 2. Partial-GET only those S4E6 chunks from backend (instead of
1887    ///    the entire encrypted body).
1888    /// 3. Decrypt the fetched chunks via
1889    ///    [`crate::sse::decrypt_s4e6_chunk_range`] (per-chunk
1890    ///    independently sealed — no need for the full body's tag).
1891    /// 4. Frame-parse + decompress the decrypted plaintext and slice
1892    ///    out the client-requested bytes via the existing
1893    ///    [`Self::partial_range_get`] machinery (re-used to keep one
1894    ///    source of truth for the response shaping).
1895    ///
1896    /// Returns `Err(...)` on any failure (auth, range, parse) so the
1897    /// caller can decide to fall back to the buffered full-GET path.
1898    /// In practice we surface a clear `InternalError` and let it
1899    /// bubble — Range GET on an encrypted body that fails partial
1900    /// fetch is a genuine error condition (sidecar / body mismatch,
1901    /// keyring rotated, etc.), not a quietly-degrade case.
1902    #[allow(clippy::too_many_arguments)]
1903    async fn partial_range_get_encrypted(
1904        &self,
1905        req: &S3Request<GetObjectInput>,
1906        plan: s4_codec::index::RangePlan,
1907        enc_plan: s4_codec::index::EncryptedRangePlan,
1908        sse: s4_codec::index::SseChunkBinding,
1909        client_start: u64,
1910        client_end_exclusive: u64,
1911        total_original: u64,
1912        get_start: Instant,
1913    ) -> S3Result<S3Response<GetObjectOutput>> {
1914        let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
1915            S3Error::with_message(
1916                S3ErrorCode::InvalidRequest,
1917                "object is SSE-S4 chunked but no --sse-s4-key is configured on this gateway",
1918            )
1919        })?;
1920        // Partial-fetch the enc byte range that covers the needed
1921        // chunks. Note that `byte_end_exclusive - 1` is the inclusive
1922        // last byte (matches the existing partial_range_get
1923        // convention).
1924        let backend_range = s3s::dto::Range::Int {
1925            first: enc_plan.enc_byte_start,
1926            last: Some(enc_plan.enc_byte_end_exclusive - 1),
1927        };
1928        let backend_input = GetObjectInput {
1929            bucket: req.input.bucket.clone(),
1930            key: req.input.key.clone(),
1931            range: Some(backend_range),
1932            ..Default::default()
1933        };
1934        let backend_req = S3Request {
1935            input: backend_input,
1936            method: req.method.clone(),
1937            uri: req.uri.clone(),
1938            headers: req.headers.clone(),
1939            extensions: http::Extensions::new(),
1940            credentials: req.credentials.clone(),
1941            region: req.region.clone(),
1942            service: req.service.clone(),
1943            trailing_headers: None,
1944        };
1945        let mut backend_resp = self.backend.get_object(backend_req).await?;
1946        let blob = backend_resp.output.body.take().ok_or_else(|| {
1947            S3Error::with_message(
1948                S3ErrorCode::InternalError,
1949                "backend partial GET returned empty body (SSE-S4 chunked Range)",
1950            )
1951        })?;
1952        let enc_bytes = collect_blob(blob, self.max_body_bytes)
1953            .await
1954            .map_err(internal("collect SSE-S4 chunked partial body"))?;
1955
1956        // Decrypt the partial chunks → pre-encrypt (= compressed-framed) plaintext.
1957        let plaintext = crate::sse::decrypt_s4e6_chunk_range(
1958            &enc_bytes,
1959            keyring.as_ref(),
1960            sse.enc_chunk_size,
1961            sse.enc_chunk_count,
1962            sse.enc_key_id,
1963            &sse.enc_salt,
1964            sse.enc_plaintext_len,
1965            enc_plan.chunk_idx_start,
1966            enc_plan.chunk_idx_last_inclusive,
1967        )
1968        .map_err(|e| {
1969            S3Error::with_message(
1970                S3ErrorCode::InternalError,
1971                format!("SSE-S4 chunked partial decrypt failed: {e}"),
1972            )
1973        })?;
1974        // Slice the decrypted concatenation down to the requested
1975        // pre-encrypt byte range (= the `RangePlan.byte_start..
1976        // byte_end_exclusive` range, expressed inside the chunks we
1977        // fetched).
1978        let s = enc_plan.pre_encrypt_slice_start_in_concat as usize;
1979        let e = enc_plan.pre_encrypt_slice_end_in_concat as usize;
1980        if e > plaintext.len() {
1981            return Err(S3Error::with_message(
1982                S3ErrorCode::InternalError,
1983                "SSE-S4 chunked partial decrypt produced fewer bytes than the sidecar declared",
1984            ));
1985        }
1986        let pre_encrypt_slice = plaintext.slice(s..e);
1987
1988        // Frame-parse + decompress the pre-encrypt slice, then slice
1989        // again on the original byte range. The plan's
1990        // slice_start_in_combined / slice_end_in_combined account for
1991        // the original_offset of the first frame we fetched — they
1992        // are pre-encrypt-domain offsets, identical to the
1993        // non-encrypted partial-range path.
1994        let mut combined = BytesMut::new();
1995        for frame in FrameIter::new(pre_encrypt_slice) {
1996            let (header, payload) = frame.map_err(|fe| {
1997                S3Error::with_message(
1998                    S3ErrorCode::InternalError,
1999                    format!("SSE-S4 chunked partial frame parse: {fe}"),
2000                )
2001            })?;
2002            let chunk_manifest = ChunkManifest {
2003                codec: header.codec,
2004                original_size: header.original_size,
2005                compressed_size: header.compressed_size,
2006                crc32c: header.crc32c,
2007            };
2008            let decompressed = self
2009                .registry
2010                .decompress(payload, &chunk_manifest)
2011                .await
2012                .map_err(internal("SSE-S4 chunked partial decompress"))?;
2013            combined.extend_from_slice(&decompressed);
2014        }
2015        let combined = combined.freeze();
2016        let sliced = combined
2017            .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
2018
2019        // Response shaping: identical to the unencrypted partial
2020        // path (clear backend checksums / e_tag since they describe
2021        // the encrypted body, not the plaintext slice).
2022        let returned_size = sliced.len() as u64;
2023        backend_resp.output.content_length = Some(returned_size as i64);
2024        backend_resp.output.content_range = Some(format!(
2025            "bytes {client_start}-{}/{total_original}",
2026            client_end_exclusive - 1
2027        ));
2028        backend_resp.output.checksum_crc32 = None;
2029        backend_resp.output.checksum_crc32c = None;
2030        backend_resp.output.checksum_crc64nvme = None;
2031        backend_resp.output.checksum_sha1 = None;
2032        backend_resp.output.checksum_sha256 = None;
2033        backend_resp.output.e_tag = None;
2034        backend_resp.output.body = Some(bytes_to_blob(sliced));
2035        backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
2036
2037        let elapsed = get_start.elapsed();
2038        // Use the encrypted bytes_in for the bandwidth-saved metric —
2039        // that's what actually traversed the wire, vs. the full
2040        // encrypted body that the buffered fallback would have
2041        // fetched.
2042        crate::metrics::record_get(
2043            "sse-s4-chunked-partial",
2044            enc_plan.enc_byte_end_exclusive - enc_plan.enc_byte_start,
2045            returned_size,
2046            elapsed.as_secs_f64(),
2047            true,
2048        );
2049        info!(
2050            op = "get_object",
2051            bucket = %req.input.bucket,
2052            key = %req.input.key,
2053            bytes_in = enc_plan.enc_byte_end_exclusive - enc_plan.enc_byte_start,
2054            bytes_out = returned_size,
2055            total_object_size = total_original,
2056            range = true,
2057            path = "sidecar-partial-sse-s4-chunked",
2058            chunks_fetched = (enc_plan.chunk_idx_last_inclusive - enc_plan.chunk_idx_start + 1) as u64,
2059            latency_ms = elapsed.as_millis() as u64,
2060            "S4 partial Range GET via v3 sidecar (SSE-S4 chunked fast-path)"
2061        );
2062        Ok(backend_resp)
2063    }
2064
2065    /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
2066    /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
2067    /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
2068    async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
2069        let bytes = encode_index(index);
2070        let len = bytes.len() as i64;
2071        let sidecar = sidecar_key(key);
2072        // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
2073        // the (already legally-arbitrary) S3 key produces something we
2074        // cannot encode at all, drop the sidecar PUT (the GET path
2075        // falls back to a full read on a missing sidecar) instead of
2076        // panicking on `parse().unwrap()`.
2077        let uri = match safe_object_uri(bucket, &sidecar) {
2078            Ok(u) => u,
2079            Err(e) => {
2080                tracing::warn!(
2081                    bucket,
2082                    key,
2083                    "S4 write_sidecar skipped (key not URI-encodable): {e}"
2084                );
2085                return;
2086            }
2087        };
2088        let put_input = PutObjectInput {
2089            bucket: bucket.into(),
2090            key: sidecar,
2091            body: Some(bytes_to_blob(bytes)),
2092            content_length: Some(len),
2093            content_type: Some("application/x-s4-index".into()),
2094            ..Default::default()
2095        };
2096        let put_req = S3Request {
2097            input: put_input,
2098            method: http::Method::PUT,
2099            uri,
2100            headers: http::HeaderMap::new(),
2101            extensions: http::Extensions::new(),
2102            credentials: None,
2103            region: None,
2104            service: None,
2105            trailing_headers: None,
2106        };
2107        if let Err(e) = self.backend.put_object(put_req).await {
2108            tracing::warn!(
2109                bucket,
2110                key,
2111                "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
2112            );
2113        }
2114    }
2115
2116    /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
2117    /// describes the current backend object before we trust its frame
2118    /// offsets for a partial Range GET. The sidecar carries the source
2119    /// `etag` and `compressed_size` that were observed at PUT time; we
2120    /// HEAD the backend object and compare.
2121    ///
2122    /// Decision matrix:
2123    /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
2124    ///   that wasn't stamped) → return `true` (best-effort, preserves
2125    ///   pre-v0.8.4 behaviour for existing on-disk sidecars).
2126    /// - HEAD fails → return `false` (we can't tell either way; full GET
2127    ///   path will surface the real backend error to the client).
2128    /// - HEAD ETag matches → `true`.
2129    /// - HEAD ETag differs OR HEAD size differs from
2130    ///   `source_compressed_size` → `false` (sidecar stale or attacker-
2131    ///   written; fall back to full GET).
2132    async fn sidecar_version_binding_ok(
2133        &self,
2134        bucket: &str,
2135        key: &str,
2136        index: &FrameIndex,
2137    ) -> bool {
2138        let Some(ref expected_etag) = index.source_etag else {
2139            // Legacy sidecar without the v0.8.4 #73 H-2 binding —
2140            // back-compat: trust it (the partial fetch is the same
2141            // best-effort path that v0.8.3 and earlier shipped).
2142            return true;
2143        };
2144        let head_input = HeadObjectInput {
2145            bucket: bucket.into(),
2146            key: key.into(),
2147            ..Default::default()
2148        };
2149        let uri = match safe_object_uri(bucket, key) {
2150            Ok(u) => u,
2151            Err(_) => return false,
2152        };
2153        let head_req = S3Request {
2154            input: head_input,
2155            method: http::Method::HEAD,
2156            uri,
2157            headers: http::HeaderMap::new(),
2158            extensions: http::Extensions::new(),
2159            credentials: None,
2160            region: None,
2161            service: None,
2162            trailing_headers: None,
2163        };
2164        let head = match self.backend.head_object(head_req).await {
2165            Ok(r) => r.output,
2166            Err(e) => {
2167                tracing::debug!(
2168                    bucket,
2169                    key,
2170                    "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
2171                );
2172                return false;
2173            }
2174        };
2175        // ETag is a strong-vs-weak enum; we compare on the unwrapped string
2176        // form (matches what the PUT path stamped — see below).
2177        let live_etag = head.e_tag.as_ref().map(|t| t.value());
2178        if live_etag != Some(expected_etag.as_str()) {
2179            tracing::debug!(
2180                bucket,
2181                key,
2182                "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
2183                expected_etag,
2184                live_etag,
2185            );
2186            return false;
2187        }
2188        if let Some(expected_size) = index.source_compressed_size
2189            && let Some(live_size) = head.content_length
2190            && live_size as u64 != expected_size
2191        {
2192            tracing::debug!(
2193                bucket,
2194                key,
2195                "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
2196                expected_size,
2197                live_size,
2198            );
2199            return false;
2200        }
2201        true
2202    }
2203
2204    /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
2205    async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
2206        let sidecar = sidecar_key(key);
2207        // v0.7 #49: same encode-or-bail treatment as write_sidecar.
2208        let uri = safe_object_uri(bucket, &sidecar).ok()?;
2209        let get_input = GetObjectInput {
2210            bucket: bucket.into(),
2211            key: sidecar,
2212            ..Default::default()
2213        };
2214        let get_req = S3Request {
2215            input: get_input,
2216            method: http::Method::GET,
2217            uri,
2218            headers: http::HeaderMap::new(),
2219            extensions: http::Extensions::new(),
2220            credentials: None,
2221            region: None,
2222            service: None,
2223            trailing_headers: None,
2224        };
2225        let resp = self.backend.get_object(get_req).await.ok()?;
2226        let blob = resp.output.body?;
2227        let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
2228        decode_index(bytes).ok()
2229    }
2230
2231    /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
2232    ///
2233    /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
2234    /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
2235    /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
2236    async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
2237        let mut out = BytesMut::new();
2238        // v0.8.15 H-h: cap the *aggregate* decoded output. Each
2239        // individual frame is already bounded by
2240        // `validate_decompress_manifest` (default 5 GiB per frame),
2241        // but a forged multi-frame body can declare many frames
2242        // each near the limit — without an object-level ceiling, a
2243        // single GET could pin tens of GiB of plaintext in
2244        // `BytesMut::extend_from_slice`. Use the gateway's
2245        // `max_body_bytes` (same cap that bounds PUT bodies) so a
2246        // GET can never produce more plaintext than a PUT can ever
2247        // legitimately have stored.
2248        let aggregate_cap = self.max_body_bytes;
2249        let mut produced: usize = 0;
2250        for frame in FrameIter::new(bytes) {
2251            let (header, payload) = frame.map_err(|e| {
2252                S3Error::with_message(
2253                    S3ErrorCode::InternalError,
2254                    format!("multipart frame parse: {e}"),
2255                )
2256            })?;
2257            let chunk_manifest = ChunkManifest {
2258                codec: header.codec,
2259                original_size: header.original_size,
2260                compressed_size: header.compressed_size,
2261                crc32c: header.crc32c,
2262            };
2263            // v0.8.15 H-h: pre-flight check on the declared
2264            // `original_size` so a forged manifest claiming a frame
2265            // that would push us past the cap is rejected before we
2266            // start decoding. Defence-in-depth alongside the
2267            // post-decode `produced` check below.
2268            if (produced as u64).saturating_add(header.original_size) > aggregate_cap as u64 {
2269                return Err(S3Error::with_message(
2270                    S3ErrorCode::InternalError,
2271                    format!(
2272                        "multipart aggregate output exceeds cap: would reach \
2273                         {produced_total} bytes after this frame, cap is {aggregate_cap}",
2274                        produced_total = (produced as u64).saturating_add(header.original_size),
2275                    ),
2276                ));
2277            }
2278            let decompressed = self
2279                .registry
2280                .decompress(payload, &chunk_manifest)
2281                .await
2282                .map_err(internal("multipart frame decompress"))?;
2283            produced = produced.saturating_add(decompressed.len());
2284            if produced > aggregate_cap {
2285                return Err(S3Error::with_message(
2286                    S3ErrorCode::InternalError,
2287                    format!(
2288                        "multipart aggregate output exceeded cap: {produced} bytes \
2289                         emitted, cap is {aggregate_cap}"
2290                    ),
2291                ));
2292            }
2293            out.extend_from_slice(&decompressed);
2294        }
2295        Ok(out.freeze())
2296    }
2297}
2298
2299/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
2300/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
2301/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
2302/// reject the other variants for parity with AWS.
2303fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
2304    let rest = s
2305        .strip_prefix("bytes=")
2306        .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
2307    let (a, b) = rest
2308        .split_once('-')
2309        .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
2310    let first: u64 = a
2311        .parse()
2312        .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
2313    let last: u64 = b
2314        .parse()
2315        .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
2316    if last < first {
2317        return Err(format!("CopySourceRange last < first: {s:?}"));
2318    }
2319    Ok(s3s::dto::Range::Int {
2320        first,
2321        last: Some(last),
2322    })
2323}
2324
/// v0.5 #34: build the backend storage key for a (logical key, version-id)
/// pair on an Enabled-versioning bucket.
///
/// The result is `<key>.__s4ver__/<version_id>`. Rationale for the
/// `.__s4ver__/` infix:
/// - it does not occur inside the natural `.s4index` / `.s4ver` suffixes,
///   so listing filters do not collide with them
/// - the trailing `/` acts as a directory-style separator, so prefix
///   browsing groups all versions of one object under one virtual folder
/// - it stays human-readable in debug logs and `aws s3 ls` output
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST filter
/// keys containing `.__s4ver__/` from results so customers don't see
/// internal shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    [key, ".__s4ver__/", version_id].concat()
}
2341
/// Report whether `key` contains the marker substring that
/// [`versioned_shadow_key`] embeds. A plain substring scan, used by both
/// the list_objects filter and the GET passthrough check.
fn is_versioning_shadow_key(key: &str) -> bool {
    const SHADOW_MARKER: &str = ".__s4ver__/";
    key.contains(SHADOW_MARKER)
}
2347
/// v0.6 #42: current wall-clock time as whole seconds since the UNIX
/// epoch, handed to `mfa::check_mfa` so the TOTP verifier shares the
/// authenticator app's notion of "now".
///
/// If the system clock reads earlier than 1970 the function returns `0`
/// instead of panicking, so the verifier rejects the code.
fn current_unix_secs() -> u64 {
    let now = std::time::SystemTime::now();
    match now.duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
2359
2360/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
2361///
2362/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
2363///   (S3 spec for MFA Delete: every gating failure surfaces as
2364///   `AccessDenied`, not a separate `MFA*` code).
2365/// - `Malformed` → `400 InvalidRequest` (the request itself is
2366///   syntactically broken, not a permission issue).
2367fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
2368    match e {
2369        crate::mfa::MfaError::Missing => S3Error::with_message(
2370            S3ErrorCode::AccessDenied,
2371            "MFA token required for this operation",
2372        ),
2373        crate::mfa::MfaError::Malformed => {
2374            S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
2375        }
2376        crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
2377            S3ErrorCode::AccessDenied,
2378            "MFA serial does not match configured device",
2379        ),
2380        crate::mfa::MfaError::InvalidCode => {
2381            S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
2382        }
2383    }
2384}
2385
2386fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
2387    metadata
2388        .as_ref()
2389        .and_then(|m| m.get(META_MULTIPART))
2390        .map(|v| v == "true")
2391        .unwrap_or(false)
2392}
2393
/// Metadata key holding the codec name of the stored manifest.
const META_CODEC: &str = "s4-codec";
/// Metadata key holding the manifest's original (uncompressed) size.
const META_ORIGINAL_SIZE: &str = "s4-original-size";
/// Metadata key holding the manifest's compressed size.
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
/// Metadata key holding the manifest's CRC32C value.
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object that was written through multipart upload using the
/// per-part frame format. The GET path checks this flag to start the frame parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks a single-PUT object that was written in the S4F2 framed format.
/// Legacy v0.1 single-PUT objects hold raw compressed bytes (without this flag).
/// The GET path checks this flag to route the body through the framed path
/// (= the same FrameIter parse as multipart).
const META_FRAMED: &str = "s4-framed";
2405
2406fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2407    metadata
2408        .as_ref()
2409        .and_then(|m| m.get(META_FRAMED))
2410        .map(|v| v == "true")
2411        .unwrap_or(false)
2412}
2413
2414/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2415fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2416    metadata
2417        .as_ref()
2418        .and_then(|m| m.get("s4-encrypted"))
2419        .map(|v| v == "aes-256-gcm")
2420        .unwrap_or(false)
2421}
2422
2423/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2424/// contract is "all three or none" — partial sets are a 400.
2425///
2426/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2427/// no encryption), `Ok(Some(material))` on validated client key, and
2428/// `Err` for malformed or partial inputs.
2429fn extract_sse_c_material(
2430    algorithm: &Option<String>,
2431    key: &Option<String>,
2432    md5: &Option<String>,
2433) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2434    match (algorithm, key, md5) {
2435        (None, None, None) => Ok(None),
2436        (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2437            .map(Some)
2438            .map_err(sse_c_error_to_s3),
2439        _ => Err(S3Error::with_message(
2440            S3ErrorCode::InvalidRequest,
2441            "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2442        )),
2443    }
2444}
2445
2446/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2447/// Returns the key-id to wrap under, falling back to the gateway default.
2448fn extract_kms_key_id(
2449    sse: &Option<ServerSideEncryption>,
2450    sse_kms_key_id: &Option<String>,
2451    gateway_default: Option<&str>,
2452) -> Option<String> {
2453    let asks_for_kms = sse
2454        .as_ref()
2455        .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2456        .unwrap_or(false);
2457    if !asks_for_kms {
2458        return None;
2459    }
2460    sse_kms_key_id
2461        .clone()
2462        .or_else(|| gateway_default.map(str::to_owned))
2463}
2464
2465/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2466/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2467/// transient KMS outage (503). Other variants are 500 InternalError.
2468fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2469    use crate::kms::KmsError as K;
2470    match e {
2471        K::KeyNotFound { key_id } => S3Error::with_message(
2472            S3ErrorCode::InvalidArgument,
2473            format!("KMS key not found: {key_id}"),
2474        ),
2475        K::BackendUnavailable { message } => S3Error::with_message(
2476            S3ErrorCode::ServiceUnavailable,
2477            format!("KMS backend unavailable: {message}"),
2478        ),
2479        other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2480    }
2481}
2482
2483/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2484/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2485/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2486fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2487    use crate::sse::SseError as E;
2488    match e {
2489        E::WrongCustomerKey => S3Error::with_message(
2490            S3ErrorCode::AccessDenied,
2491            "SSE-C key does not match the key used at PUT time",
2492        ),
2493        E::InvalidCustomerKey { reason } => {
2494            S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2495        }
2496        E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2497            S3ErrorCode::InvalidArgument,
2498            format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2499        ),
2500        E::CustomerKeyRequired => S3Error::with_message(
2501            S3ErrorCode::InvalidRequest,
2502            "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2503        ),
2504        E::CustomerKeyUnexpected => S3Error::with_message(
2505            S3ErrorCode::InvalidRequest,
2506            "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2507        ),
2508        other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2509    }
2510}
2511
2512fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2513    let m = metadata.as_ref()?;
2514    let codec = m
2515        .get(META_CODEC)
2516        .and_then(|s| s.parse::<CodecKind>().ok())?;
2517    let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2518    let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2519    let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2520    Some(ChunkManifest {
2521        codec,
2522        original_size,
2523        compressed_size,
2524        crc32c,
2525    })
2526}
2527
2528fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2529    let meta = metadata.get_or_insert_with(Default::default);
2530    meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2531    meta.insert(
2532        META_ORIGINAL_SIZE.into(),
2533        manifest.original_size.to_string(),
2534    );
2535    meta.insert(
2536        META_COMPRESSED_SIZE.into(),
2537        manifest.compressed_size.to_string(),
2538    );
2539    meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2540}
2541
2542fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2543    move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2544}
2545
2546/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2547/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2548/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2549/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2550/// message that includes the original error context.
2551fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2552    use crate::select::SelectError;
2553    match e {
2554        SelectError::Parse(msg) => S3Error::with_message(
2555            S3ErrorCode::InvalidRequest,
2556            format!("SQL parse error: {msg}"),
2557        ),
2558        SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2559            S3ErrorCode::InvalidRequest,
2560            format!("unsupported SQL feature: {msg}"),
2561        ),
2562        SelectError::RowEval(msg) => S3Error::with_message(
2563            S3ErrorCode::InvalidRequest,
2564            format!("SQL row evaluation error: {msg}"),
2565        ),
2566        SelectError::InputFormat(msg) => S3Error::with_message(
2567            S3ErrorCode::InvalidRequest,
2568            format!("{fmt} input format error: {msg}"),
2569        ),
2570    }
2571}
2572
2573/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2574/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2575/// (including missing) is treated as `false`.
2576fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2577    headers
2578        .get("x-amz-bypass-governance-retention")
2579        .and_then(|v| v.to_str().ok())
2580        .map(|s| s.eq_ignore_ascii_case("true"))
2581        .unwrap_or(false)
2582}
2583
2584/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2585/// as an RFC3339 string and re-parsing through `chrono`. The string format
2586/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2587/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2588/// or the value is outside `chrono`'s supported range.
2589fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2590    let mut buf = Vec::new();
2591    ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2592        .ok()?;
2593    let s = std::str::from_utf8(&buf).ok()?;
2594    chrono::DateTime::parse_from_rfc3339(s)
2595        .ok()
2596        .map(|dt| dt.with_timezone(&chrono::Utc))
2597}
2598
2599/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2600/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2601fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2602    // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2603    // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2604    // unexpected happens — we never produce malformed strings, so this
2605    // branch is unreachable in practice.
2606    let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2607    Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2608}
2609
2610/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2611/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2612/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2613/// the field optional but always populates it on response.
2614fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2615    set.iter()
2616        .map(|(k, v)| Tag {
2617            key: Some(k.clone()),
2618            value: Some(v.clone()),
2619        })
2620        .collect()
2621}
2622
2623/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2624/// keys / values become empty strings (mirrors AWS, which rejects
2625/// `<Key/>` with InvalidTag at the parser layer; downstream
2626/// `TagSet::validate` then enforces our size limits).
2627fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2628    let pairs = tags
2629        .iter()
2630        .map(|t| {
2631            (
2632                t.key.clone().unwrap_or_default(),
2633                t.value.clone().unwrap_or_default(),
2634            )
2635        })
2636        .collect();
2637    crate::tagging::TagSet::from_pairs(pairs)
2638}
2639
2640/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2641/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2642/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2643pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2644    if total == 0 {
2645        return Err("cannot range-get zero-length object".into());
2646    }
2647    match range {
2648        s3s::dto::Range::Int { first, last } => {
2649            let start = *first;
2650            let end_inclusive = match last {
2651                Some(l) => (*l).min(total - 1),
2652                None => total - 1,
2653            };
2654            if start > end_inclusive || start >= total {
2655                return Err(format!(
2656                    "range bytes={start}-{:?} out of object size {total}",
2657                    last
2658                ));
2659            }
2660            Ok((start, end_inclusive + 1))
2661        }
2662        s3s::dto::Range::Suffix { length } => {
2663            let len = (*length).min(total);
2664            Ok((total - len, total))
2665        }
2666    }
2667}
2668
2669#[async_trait::async_trait]
2670impl<B: S3> S3 for S4Service<B> {
2671    // === 圧縮を挟む path (PUT) ===
2672    #[tracing::instrument(
2673        name = "s4.put_object",
2674        skip(self, req),
2675        fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2676    )]
2677    async fn put_object(
2678        &self,
2679        mut req: S3Request<PutObjectInput>,
2680    ) -> S3Result<S3Response<PutObjectOutput>> {
2681        let put_start = Instant::now();
2682        let put_bucket = req.input.bucket.clone();
2683        let put_key = req.input.key.clone();
2684        // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
2685        self.check_not_reserved_key(&put_key, ReservedKeyMode::Mutating)?;
2686        let access_preamble = self.access_log_preamble(&req);
2687        self.enforce_rate_limit(&req, &put_bucket)?;
2688        // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2689        // the IAM policy gate sees the request's tags via
2690        // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2691        // resolved from the Tagging manager (when wired) so
2692        // `s3:ExistingObjectTag/<key>` works on overwrite.
2693        let request_tags: Option<crate::tagging::TagSet> = req
2694            .input
2695            .tagging
2696            .as_deref()
2697            .map(crate::tagging::parse_tagging_header)
2698            .transpose()
2699            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2700        let existing_tags: Option<crate::tagging::TagSet> = self
2701            .tagging
2702            .as_ref()
2703            .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2704        self.enforce_policy_with_extra(
2705            &req,
2706            "s3:PutObject",
2707            &put_bucket,
2708            Some(&put_key),
2709            request_tags.as_ref(),
2710            existing_tags.as_ref(),
2711        )?;
2712        // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2713        // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2714        // bucket PUTs are exempt because they materialise a fresh
2715        // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2716        // locked version's bytes are untouched. The check mirrors the
2717        // delete path (Compliance never bypassable, Governance via the
2718        // bypass header, legal hold never).
2719        if let Some(mgr) = self.object_lock.as_ref()
2720            && let Some(state) = mgr.get(&put_bucket, &put_key)
2721        {
2722            let bucket_versioned_enabled = self
2723                .versioning
2724                .as_ref()
2725                .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2726                .unwrap_or(false);
2727            if !bucket_versioned_enabled {
2728                let bypass = parse_bypass_governance_header(&req.headers);
2729                let now = chrono::Utc::now();
2730                if !state.can_delete(now, bypass) {
2731                    crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2732                    return Err(S3Error::with_message(
2733                        S3ErrorCode::AccessDenied,
2734                        "Access Denied because object protected by object lock",
2735                    ));
2736                }
2737            }
2738        }
2739        // v0.5 #30: per-PUT explicit retention / legal hold (S3
2740        // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2741        // `x-amz-object-lock-legal-hold`). Captured before the body
2742        // moves into the backend; persisted into the manager only on
2743        // backend success below.
2744        let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2745            .input
2746            .object_lock_mode
2747            .as_ref()
2748            .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2749        let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2750            .input
2751            .object_lock_retain_until_date
2752            .as_ref()
2753            .and_then(timestamp_to_chrono_utc);
2754        let explicit_legal_hold_on: Option<bool> = req
2755            .input
2756            .object_lock_legal_hold_status
2757            .as_ref()
2758            .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2759        if let Some(blob) = req.input.body.take() {
2760            // v0.9 #106: parse client-supplied checksum headers
2761            // **before** awaiting any body bytes. A malformed
2762            // `Content-MD5` / `x-amz-checksum-*` value must surface
2763            // as `InvalidDigest` immediately so a slow / non-
2764            // delivering body cannot tie up the handler waiting on
2765            // bytes only to reject the request on a header-level
2766            // problem. The parsed `ClientChecksums` value is reused
2767            // by the streaming-framed branch below; the
2768            // bytes-buffered branch keeps its own
2769            // `verify_client_body_checksums` call which is idempotent
2770            // with this parse.
2771            let client_checksums = crate::streaming_checksum::ClientChecksums::from_request_fields(
2772                req.input.content_md5.as_deref(),
2773                req.input.checksum_crc32.as_deref(),
2774                req.input.checksum_crc32c.as_deref(),
2775                req.input.checksum_sha1.as_deref(),
2776                req.input.checksum_sha256.as_deref(),
2777                req.input.checksum_crc64nvme.as_deref(),
2778            )?;
2779            // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2780            // compress fast path、そうでなければ従来の collect-then-compress。
2781            let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2782                .await
2783                .map_err(internal("peek put sample"))?;
2784            let sample_len = sample.len().min(SAMPLE_BYTES);
2785            // v0.8 #56: pass the request's Content-Length (when present) so
2786            // the sampling dispatcher can promote large objects to a GPU
2787            // codec. Chunked transfers (no Content-Length) keep CPU.
2788            let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2789            let kind = self
2790                .dispatcher
2791                .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2792                .await;
2793
2794            // Passthrough buys nothing from S4F2 wrapping (no compression =
2795            // no per-chunk frame to skip past) and the +28-byte header
2796            // overhead breaks size-sensitive callers that expect a true
2797            // pass-through. So passthrough always uses the legacy raw-blob
2798            // path; only compressing codecs go through the framed path.
2799            //
2800            // v0.9 #106 — true streaming PUT checksum verify. The
2801            // streaming-framed path used to fail-open on client-supplied
2802            // whole-body checksums (`x-amz-checksum-{crc32, crc32c, sha1,
2803            // sha256, crc64nvme}` and `Content-MD5`): the v0.8.13 #127
2804            // attempt to "force buffered when any checksum header is
2805            // present" had to be reverted in v0.8.14 #129 because modern
2806            // AWS SDKs auto-attach `x-amz-checksum-crc32`, which made
2807            // every SDK PUT lose the streaming-framed path and therefore
2808            // its sidecar (range_get_falls_back_to_full_when_sidecar_etag_stale
2809            // + upload_part_copy_propagates_source_version_id failed on
2810            // CI). v0.9 #106 keeps the streaming-framed path and tees
2811            // each chunk into a multi-hasher (`streaming_checksum`
2812            // module) as it flows through the compressor. On EOF the
2813            // hashers are finalised and compared; a mismatch surfaces
2814            // as a synthetic `io::Error` carrying
2815            // `StreamingChecksumError` which we downcast back below and
2816            // map to a typed 400 BadDigest. Sidecar emission is
2817            // unaffected — the verifier sits **upstream** of
2818            // `streaming_compress_to_frames`, so on mismatch the call
2819            // returns Err and we never reach the backend write or
2820            // sidecar build, preserving the post-revert invariant.
2821            //
2822            // Scope: single-PUT cpu-zstd / passthrough only. Multipart
2823            // `upload_part` keeps its buffered per-part verify (the
2824            // part body is already in memory there for framing /
2825            // padding, so streaming verify wouldn't save anything).
2826            // GPU codecs (nvcomp-*) fall through to the buffered
2827            // branch below — they are bytes-buffered today and use the
2828            // existing `verify_client_body_checksums`.
2829            // (`client_checksums` was parsed before `peek_sample`
2830            // above so malformed values fail pre-stream.)
2831            //
2832            // v0.9 #106 trailer support: the chunked / SigV4-streaming
2833            // SDK case attaches the actual checksum value in the
2834            // request **trailers** (post-body). The `x-amz-trailer`
2835            // request header announces which algorithm(s) will follow;
2836            // we use it to decide which hashers to spin up at body
2837            // start so the digest is ready to compare once trailers
2838            // arrive. After the codec consumes the body we read
2839            // `req.trailing_headers` and run a deferred comparison
2840            // against the finalised digests via
2841            // `ComputedDigests::compare_b64` (see post-stream block
2842            // below). Without this, a bad trailer checksum on the
2843            // streaming-framed path would silently pass — same
2844            // fail-open shape this issue is closing, different
2845            // delivery mechanism.
2846            let trailer_hashers: crate::streaming_checksum::WhichHashers = req
2847                .headers
2848                .get("x-amz-trailer")
2849                .and_then(|v| v.to_str().ok())
2850                .map(crate::streaming_checksum::WhichHashers::from_trailer_header)
2851                .unwrap_or_default();
2852            let which_hashers = client_checksums.which_hashers().or(trailer_hashers);
2853            let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2854            let (compressed, manifest, is_framed) = if use_framed {
2855                // streaming fast path: input は memory に collect しない
2856                let chained = chain_sample_with_rest(sample, rest_stream);
2857                // v0.9 #106: tee the chained input through a multi-hasher
2858                // when ANY client checksum claim is present (header or
2859                // trailer). The wrapper is a no-op (and skipped
2860                // entirely) when neither side has work, so non-
2861                // checksummed PUTs keep their pre-#106 throughput.
2862                let (chained, digest_handle) = if which_hashers.any() {
2863                    let (b, h) = crate::streaming_checksum::tee_into_hashers_with_handle(
2864                        chained,
2865                        client_checksums.clone(),
2866                        which_hashers,
2867                    );
2868                    (b, Some(h))
2869                } else {
2870                    (chained, None)
2871                };
2872                debug!(
2873                    bucket = ?req.input.bucket,
2874                    key = ?req.input.key,
2875                    codec = kind.as_str(),
2876                    path = "streaming-framed",
2877                    client_checksum_verify = client_checksums.any(),
2878                    "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2879                );
2880                // v0.4 #16: pick the chunk size based on the request's
2881                // Content-Length when known, falling back to the 4 MiB
2882                // default for chunked transfers.
2883                let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2884                // v0.8.4 #73 M2: pass the request's Content-Length so
2885                // streaming_compress_to_frames can fail-fast on a mid-PUT
2886                // truncation (client disconnect after sending half the
2887                // body). `None` is the chunked-Transfer-Encoding case
2888                // where the upstream genuinely doesn't know the size and
2889                // the backend's framing layer is the only truncation
2890                // signal we have.
2891                let expected_input_size =
2892                    req.input.content_length.and_then(|n| u64::try_from(n).ok());
2893                let (body, manifest) = streaming_compress_to_frames(
2894                    chained,
2895                    Arc::clone(&self.registry),
2896                    kind,
2897                    chunk_size,
2898                    expected_input_size,
2899                )
2900                .await
2901                .map_err(|e| match e {
2902                    s4_codec::CodecError::TruncatedStream { expected, got } => {
2903                        // 400 IncompleteBody: client advertised N bytes
2904                        // but disconnected after `got`. Mirrors AWS S3's
2905                        // canonical error code for the same shape so SDK
2906                        // retries kick in instead of treating the PUT as
2907                        // a successful upload of a half-body.
2908                        S3Error::with_message(
2909                            S3ErrorCode::IncompleteBody,
2910                            format!("PUT body truncated: expected {expected} bytes, got {got}"),
2911                        )
2912                    }
2913                    // v0.8.15 M-4: 400
2914                    // `RequestBodyLengthMismatch` for over-length
2915                    // bodies. AWS S3 returns this when the declared
2916                    // `Content-Length` is smaller than the wire body;
2917                    // S4 used to silently accept the surplus bytes.
2918                    // `IncompleteBody` is the closest typed variant
2919                    // in the s3s enum — we widen the message so the
2920                    // SDK / curl side sees the shape unambiguously.
2921                    s4_codec::CodecError::OverlengthStream { expected, got } => {
2922                        let code = S3ErrorCode::from_bytes(b"RequestBodyLengthMismatch")
2923                            .unwrap_or(S3ErrorCode::IncompleteBody);
2924                        S3Error::with_message(
2925                            code,
2926                            format!(
2927                                "PUT body length mismatch: Content-Length declared {expected} \
2928                                 bytes, body carried at least {got}"
2929                            ),
2930                        )
2931                    }
2932                    // v0.9 #106: streaming checksum mismatch — the tee
2933                    // wrapper emitted a synthetic io::Error carrying
2934                    // StreamingChecksumError. Downcast and remap to
2935                    // BadDigest so the client sees the same response
2936                    // the buffered path would have produced.
2937                    s4_codec::CodecError::Io(ref io_err) => {
2938                        if let Some(alg) =
2939                            crate::streaming_checksum::extract_streaming_checksum_error(io_err)
2940                        {
2941                            let code = S3ErrorCode::from_bytes(b"BadDigest")
2942                                .unwrap_or(S3ErrorCode::InvalidArgument);
2943                            S3Error::with_message(
2944                                code,
2945                                format!("client-supplied {alg} did not match the received body"),
2946                            )
2947                        } else {
2948                            internal("streaming framed compress")(e)
2949                        }
2950                    }
2951                    other => internal("streaming framed compress")(other),
2952                })?;
2953                // v0.9 #106 trailer-deferred verify. Header claims
2954                // have already been compared eagerly inside the tee
2955                // at EOF (mismatch surfaces as `BadDigest` through
2956                // the `CodecError::Io` branch above). Now that the
2957                // body has been fully consumed, request trailers are
2958                // available — delegate to the shared trailer-verify
2959                // helper (also used by the buffered branch below,
2960                // see v0.9 #106-audit-R2 P2-INT-2).
2961                //
2962                // **Fail-closed when announced trailers are
2963                // missing**: if the client announced
2964                // `x-amz-trailer: x-amz-checksum-*` but did NOT
2965                // deliver the trailer value (or the trailers block
2966                // never arrived), the helper refuses the PUT with
2967                // `BadDigest`. Skipping the comparison in that case
2968                // would silently re-open the streaming fail-open
2969                // this issue closes — a client could declare an
2970                // integrity check and then omit the value to bypass
2971                // verification.
2972                if let Some(handle) = digest_handle.as_ref() {
2973                    let announced = req
2974                        .headers
2975                        .get("x-amz-trailer")
2976                        .and_then(|v| v.to_str().ok());
2977                    // If the tee never finalised (computed is None)
2978                    // the body was incomplete; the CodecError path
2979                    // would have already surfaced — defensive belt
2980                    // for any future refactor. We still need a
2981                    // ComputedDigests instance to feed the helper
2982                    // when trailers were announced, so synthesise
2983                    // an empty one and let `compare_b64` reject
2984                    // every claim as BadDigest (every algorithm
2985                    // slot is None).
2986                    let computed = handle
2987                        .lock()
2988                        .expect("digest handle lock poisoned")
2989                        .clone()
2990                        .unwrap_or_default();
2991                    verify_client_trailer_checksums(
2992                        announced,
2993                        req.trailing_headers.as_ref(),
2994                        &computed,
2995                    )?;
2996                }
2997                (body, manifest, true)
2998            } else {
2999                // GPU codec 等で streaming-aware でないものは bytes-buffered path
3000                // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
3001                let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
3002                    .await
3003                    .map_err(internal("collect put body (buffered path)"))?;
                // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
                // checksum algorithms against the received body on
                // the buffered path. The streaming-framed branch
                // above no longer redirects here when a checksum
                // header is present (the #127 MED-B redirect was
                // reverted in v0.8.14 #129); since v0.9 #106 it
                // verifies header claims itself through the tee'd
                // multi-hasher. This call is therefore the header-
                // checksum checkpoint only for bodies that take this
                // buffered path.
3010                verify_client_body_checksums(
3011                    &bytes,
3012                    req.input.content_md5.as_deref(),
3013                    req.input.checksum_crc32.as_deref(),
3014                    req.input.checksum_crc32c.as_deref(),
3015                    req.input.checksum_sha1.as_deref(),
3016                    req.input.checksum_sha256.as_deref(),
3017                    req.input.checksum_crc64nvme.as_deref(),
3018                )?;
3019                // v0.9 #106-audit-R2 P2-INT-2: SigV4-streaming trailer
3020                // checksums must verify on the buffered path too. Pre-fix
3021                // the streaming-framed branch above handled
3022                // `x-amz-trailer` while this branch silently dropped
3023                // it — a client could PUT through a GPU codec / non-
3024                // streaming dispatch and bypass trailer verification.
3025                // We have the full body in memory here, so a one-shot
3026                // `compute_digests` followed by the shared
3027                // `verify_client_trailer_checksums` helper closes the
3028                // gap. The hasher selector is derived from the same
3029                // `x-amz-trailer` header parser the streaming branch
3030                // uses (`WhichHashers::from_trailer_header`).
3031                if let Some(announced) = req
3032                    .headers
3033                    .get("x-amz-trailer")
3034                    .and_then(|v| v.to_str().ok())
3035                {
3036                    let which =
3037                        crate::streaming_checksum::WhichHashers::from_trailer_header(announced);
3038                    if which.any() {
3039                        let computed = crate::streaming_checksum::compute_digests(&bytes, which);
3040                        verify_client_trailer_checksums(
3041                            Some(announced),
3042                            req.trailing_headers.as_ref(),
3043                            &computed,
3044                        )?;
3045                    } else {
3046                        // Header announced only non-checksum trailers
3047                        // (e.g. `x-amz-trailer-signature`). The helper
3048                        // would return Ok in that case — invoke it
3049                        // anyway for symmetry with the streaming branch
3050                        // so a future change to the filter logic stays
3051                        // wired through both paths.
3052                        verify_client_trailer_checksums(
3053                            Some(announced),
3054                            req.trailing_headers.as_ref(),
3055                            &crate::streaming_checksum::ComputedDigests::default(),
3056                        )?;
3057                    }
3058                }
3059                debug!(
3060                    bucket = ?req.input.bucket,
3061                    key = ?req.input.key,
3062                    bytes = bytes.len(),
3063                    codec = kind.as_str(),
3064                    path = "buffered",
3065                    "S4 put_object: compressing (buffered, raw blob)"
3066                );
3067                // v0.8 #55: telemetry-returning compress so we can stamp
3068                // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
3069                // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
3070                // CPU codecs come back with `gpu_seconds = None` and the
3071                // stamp helper short-circuits — no extra cost on CPU path.
3072                let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
3073                stamp_gpu_compress_telemetry(&tel);
3074                let (body, m) = compress_res.map_err(internal("registry compress"))?;
3075                (body, m, false)
3076            };
3077
3078            write_manifest(&mut req.input.metadata, &manifest);
3079            if is_framed {
3080                // v0.2 #4: framed body であることを GET 側に伝える meta flag。
3081                req.input
3082                    .metadata
3083                    .get_or_insert_with(Default::default)
3084                    .insert(META_FRAMED.into(), "true".into());
3085            }
3086            // 重要: content_length を圧縮後サイズで更新する。
3087            // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
3088            // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
3089            req.input.content_length = Some(compressed.len() as i64);
3090            // body を書き換えたので、客側が送ってきた original body 用の
3091            // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
3092            // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
3093            // ChunkManifest.crc32c で担保している。
3094            req.input.checksum_algorithm = None;
3095            req.input.checksum_crc32 = None;
3096            req.input.checksum_crc32c = None;
3097            req.input.checksum_crc64nvme = None;
3098            req.input.checksum_sha1 = None;
3099            req.input.checksum_sha256 = None;
3100            req.input.content_md5 = None;
3101            let original_size = manifest.original_size;
3102            let compressed_size = manifest.compressed_size;
3103            let codec_label = manifest.codec.as_str();
3104            // (sidecar_index is built below, after the SSE-mode
3105            // extraction, so v0.8.12 HIGH-10 can short-circuit the
3106            // build when the on-disk bytes are about to be encrypted.)
3107            // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
3108            // Precedence:
3109            //   - SSE-C headers present → per-request customer key (S4E3)
3110            //   - server-managed keyring configured → active key (S4E2)
3111            //   - neither → no encryption (raw compressed body)
3112            // The `s4-encrypted: aes-256-gcm` metadata flag is set in
3113            // both encrypted modes; the on-disk frame magic distinguishes
3114            // S4E1 / S4E2 / S4E3 so GET picks the right decrypt path.
3115            // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
3116            // so the encryption headers are NOT forwarded to the
3117            // backend. S4 owns the encrypt-then-store contract; if we
3118            // leave the headers in place, real S3-compat backends
3119            // (MinIO / AWS) try to apply their own SSE on top and
3120            // either reject (MinIO requires HTTPS for SSE-C) or fail
3121            // (MinIO has no KMS configured). MemoryBackend ignored
3122            // these so mock tests passed.
3123            let sse_c_alg = req.input.sse_customer_algorithm.take();
3124            let sse_c_key = req.input.sse_customer_key.take();
3125            let sse_c_md5 = req.input.sse_customer_key_md5.take();
3126            let sse_header = req.input.server_side_encryption.take();
3127            let sse_kms_key = req.input.ssekms_key_id.take();
3128            let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3129            // v0.5 #28: SSE-KMS request? Resolves to None unless the
3130            // request asks for `aws:kms` AND a key id is available
3131            // (explicit header or gateway default). When set, we'll
3132            // generate a per-object DEK below.
3133            let kms_key_id = extract_kms_key_id(
3134                &sse_header,
3135                &sse_kms_key,
3136                self.kms_default_key_id.as_deref(),
3137            );
3138            // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
3139            // pre-encrypt `compressed` body, but the bytes the
3140            // backend stores when any SSE mode is active are
3141            // *post-encrypt* (different length, different layout).
3142            // A Range GET on an SSE-encrypted object would slice the
3143            // ciphertext at the stale offsets, hand the wrong bytes
3144            // to the frame parser, and 500. Suppress the sidecar
3145            // entirely when SSE is going to be applied below;
3146            // encrypted-object Range GET falls back to the buffered
3147            // path (decrypt full body → frame parse → slice), trading
3148            // partial-fetch performance for correctness.
3149            //
3150            // v0.9 #106 (encryption-aware sidecar): re-enable sidecar
3151            // emission for the **SSE-S4 chunked (S4E6) path only** —
3152            // S4E6 chunks are per-chunk independently sealed so the
3153            // GET path can compute encrypted byte ranges, partial-fetch
3154            // just the needed chunks, decrypt + frame-parse + slice.
3155            // The pre-encrypt `compressed` offsets in the sidecar are
3156            // still load-bearing (the GET path decrypts into the
3157            // pre-encrypt domain before frame-parsing), with the new
3158            // v3 SSE binding (`sse_v3`) stamped below once the
3159            // encrypt path runs and reveals the per-PUT salt /
3160            // chunk_count / key_id. SSE-KMS / SSE-C / S4E2 buffered
3161            // (`--sse-chunk-size 0`) keep the v0.8.12 #120 buffered
3162            // fallback (= sidecar suppressed) — multi-mode plumbing
3163            // is the v0.10+ roadmap.
3164            let will_encrypt =
3165                sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
3166            let sse_s4_chunked_path = sse_c_material.is_none()
3167                && kms_key_id.is_none()
3168                && self.sse_keyring.is_some()
3169                && self.sse_chunk_size > 0;
3170            let sidecar_index = if is_framed && (!will_encrypt || sse_s4_chunked_path) {
3171                s4_codec::index::build_index_from_body(&compressed).ok()
3172            } else {
3173                None
3174            };
3175            // v0.5 #32: in compliance-strict mode, every PUT must
3176            // declare SSE — either client-supplied (SSE-C), KMS, or by
3177            // virtue of a server-side keyring being configured (which
3178            // applies SSE-S4 to every PUT automatically). Requests that
3179            // would otherwise land as plain compressed bytes are
3180            // rejected with 400 InvalidRequest.
3181            if self.compliance_strict
3182                && sse_c_material.is_none()
3183                && kms_key_id.is_none()
3184                && self.sse_keyring.is_none()
3185                && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
3186            {
3187                return Err(S3Error::with_message(
3188                    S3ErrorCode::InvalidRequest,
3189                    "compliance-mode strict: PUT must include x-amz-server-side-encryption \
3190                     (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
3191                ));
3192            }
            // SSE-C and SSE-KMS are mutually exclusive on a single PUT
            // (AWS S3 returns 400 InvalidArgument), so the request is
            // rejected outright rather than letting one mode win.
3195            if sse_c_material.is_some() && kms_key_id.is_some() {
3196                return Err(S3Error::with_message(
3197                    S3ErrorCode::InvalidArgument,
3198                    "SSE-C and SSE-KMS cannot be used together on the same PUT",
3199                ));
3200            }
3201            // KMS path needs to call generate_dek().await before the
3202            // body_to_send branch; capture the result here.
3203            //
3204            // v0.8.1 #58: the plaintext DEK lives in three places
3205            // during one PUT:
3206            //
3207            //   1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
3208            //      — wiped when the binding `dek` falls out of scope at
3209            //      the end of this `if`-arm.
3210            //   2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
3211            //      — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
3212            //      the outer `kms_wrap` `Option` is dropped at the end
3213            //      of `put_object`.
3214            //   3. AES-GCM internal key state inside the `aes-gcm`
3215            //      crate during `encrypt_with_source` — out of scope
3216            //      for this fix; tracked separately in v0.8.2.
3217            let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
3218                if let Some(ref key_id) = kms_key_id {
3219                    let kms = self.kms.as_ref().ok_or_else(|| {
3220                    S3Error::with_message(
3221                        S3ErrorCode::InvalidRequest,
3222                        "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3223                    )
3224                })?;
3225                    // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
3226                    // works unchanged via `Deref<Target=Vec<u8>>`.
3227                    let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
3228                    if dek.len() != 32 {
3229                        return Err(S3Error::with_message(
3230                            S3ErrorCode::InternalError,
3231                            format!(
3232                                "KMS backend returned a DEK of {} bytes (expected 32)",
3233                                dek.len()
3234                            ),
3235                        ));
3236                    }
3237                    let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
3238                        zeroize::Zeroizing::new([0u8; 32]);
3239                    dek_arr.copy_from_slice(&dek);
3240                    // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
3241                    // end of this scope, wiping the heap allocation.
3242                    Some((dek_arr, wrapped))
3243                } else {
3244                    None
3245                };
3246            // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
3247            // alongside `s4-encrypted` so HEAD (which doesn't fetch the
3248            // body) can echo the correct `x-amz-server-side-encryption`
3249            // value. Without this, HEAD on an SSE-KMS object would not
3250            // echo `aws:kms` because the frame magic is only available
3251            // on the body (which HEAD doesn't read).
3252            let body_to_send = if let Some(ref m) = sse_c_material {
3253                let meta = req.input.metadata.get_or_insert_with(Default::default);
3254                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
3255                meta.insert("s4-sse-type".into(), "AES256".into());
3256                meta.insert(
3257                    "s4-sse-c-key-md5".into(),
3258                    base64::engine::general_purpose::STANDARD.encode(m.key_md5),
3259                );
3260                crate::sse::encrypt_with_source(
3261                    &compressed,
3262                    crate::sse::SseSource::CustomerKey {
3263                        key: &m.key,
3264                        key_md5: &m.key_md5,
3265                    },
3266                )
3267            } else if let Some((ref dek, ref wrapped)) = kms_wrap {
3268                let meta = req.input.metadata.get_or_insert_with(Default::default);
3269                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
3270                meta.insert("s4-sse-type".into(), "aws:kms".into());
3271                meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
3272                // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
3273                // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
3274                // `&T` here via `Deref<Target=T>`, so the binding picks
3275                // up the inner array reference without copying. The array
3276                // stays in the `Zeroizing` wrapper that owns it and gets
3277                // wiped when `kms_wrap` drops at the end of `put_object`.
3278                let dek_ref: &[u8; 32] = dek;
3279                crate::sse::encrypt_with_source(
3280                    &compressed,
3281                    crate::sse::SseSource::Kms {
3282                        dek: dek_ref,
3283                        wrapped,
3284                    },
3285                )
3286            } else if let Some(keyring) = self.sse_keyring.as_ref() {
3287                // SSE-S4 is server-driven transparent encryption; the
3288                // client didn't ask for SSE. We stamp `s4-encrypted`
3289                // (internal flag the GET path needs) but deliberately
3290                // do NOT stamp `s4-sse-type` — that lights up the HEAD
3291                // echo of `x-amz-server-side-encryption: AES256`,
3292                // which would falsely advertise AWS-style SSE-S3
3293                // semantics the operator didn't request.
3294                let meta = req.input.metadata.get_or_insert_with(Default::default);
3295                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
3296                // v0.8 #52: when `--sse-chunk-size > 0` is configured,
3297                // emit the chunked S4E5 frame so the matching GET can
3298                // stream-decrypt instead of buffering 5 GiB before
3299                // emitting a byte. Falls back to the buffered S4E2
3300                // frame at chunk_size=0 (default) so existing
3301                // deployments are bit-for-bit unchanged.
3302                if self.sse_chunk_size > 0 {
3303                    crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
3304                        .map_err(|e| {
3305                            S3Error::with_message(
3306                                S3ErrorCode::InternalError,
3307                                format!("SSE-S4 chunked encrypt failed: {e}"),
3308                            )
3309                        })?
3310                } else {
3311                    crate::sse::encrypt_v2(&compressed, keyring)
3312                }
3313            } else {
3314                compressed.clone()
3315            };
3316            // v0.9 #106: when the SSE-S4 chunked path ran (and only
3317            // that path — SSE-KMS / SSE-C / S4E2 buffered keep the
3318            // buffered fallback), parse the S4E6 header bytes back
3319            // out of `body_to_send` to recover the per-PUT salt /
3320            // key_id / chunk_count and stamp them onto the sidecar's
3321            // SSE binding. The salt isn't secret (it lives in the
3322            // encrypted body's plaintext header) so duplicating it
3323            // in the sidecar saves the GET path an extra HEAD/GET to
3324            // re-derive it. `parse_s4e6_header` reads the fixed-
3325            // layout fields only — any failure leaves `sse_binding`
3326            // as `None`, which falls through to the legacy buffered
3327            // fallback on GET (= safe degradation, not corruption).
3328            let sse_binding: Option<s4_codec::index::SseChunkBinding> = if sse_s4_chunked_path {
3329                match crate::sse::parse_s4e6_header(&body_to_send) {
3330                    Ok(hdr) => Some(s4_codec::index::SseChunkBinding {
3331                        enc_chunk_size: hdr.chunk_size,
3332                        enc_chunk_count: hdr.chunk_count,
3333                        enc_key_id: hdr.key_id,
3334                        enc_salt: *hdr.salt,
3335                        enc_plaintext_len: compressed.len() as u64,
3336                        // S4E6_HEADER_BYTES = 24 today; carried
3337                        // explicitly so a future bump (e.g. S4E7
3338                        // with a different fixed-header size) can't
3339                        // silently break v3 sidecar decode.
3340                        enc_header_bytes: crate::sse::S4E6_HEADER_BYTES as u32,
3341                    }),
3342                    Err(e) => {
3343                        tracing::warn!(
3344                            bucket = %put_bucket,
3345                            key = %put_key,
3346                            "S4 sidecar SSE-binding stamp failed (Range GET will fall back \
3347                             to buffered): {e}"
3348                        );
3349                        None
3350                    }
3351                }
3352            } else {
3353                None
3354            };
3355            // v0.6 #40: capture the about-to-be-sent body + metadata so
3356            // the replication dispatcher (run after the source PUT
3357            // succeeds) can hand the same backend bytes to the
3358            // destination bucket. `Bytes` clone is cheap (refcounted).
3359            let replication_body = body_to_send.clone();
3360            let replication_metadata = req.input.metadata.clone();
3361            // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
3362            // makes the body longer than the post-compression bytes
3363            // (header + nonce + tag overhead). The earlier
3364            // content_length stamp at compressed.len() is now stale, so
3365            // re-stamp from the actual bytes about to be sent or the
3366            // backend (real S3 / MinIO) rejects with
3367            // `StreamLengthMismatch`. MemoryBackend never validated
3368            // this, which is why mock-only tests passed.
3369            req.input.content_length = Some(body_to_send.len() as i64);
3370            req.input.body = Some(bytes_to_blob(body_to_send));
3371            // v0.5 #34: pre-allocate a version-id when the bucket is
3372            // Enabled, then redirect the backend storage key to the
3373            // shadow path so older versions survive newer PUTs.
3374            // Suspended / Unversioned buckets keep using the plain
3375            // `<key>` (S3 spec: Suspended overwrites the same backend
3376            // object). Pre-allocation (instead of recording after PUT)
3377            // ensures the shadow key + the response's
3378            // `x-amz-version-id` use the same vid.
3379            let pending_version: Option<crate::versioning::PutOutcome> = self
3380                .versioning
3381                .as_ref()
3382                .map(|mgr| mgr.state(&put_bucket))
3383                .map(|state| match state {
3384                    crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3385                        version_id: crate::versioning::VersioningManager::new_version_id(),
3386                        versioned_response: true,
3387                    },
3388                    crate::versioning::VersioningState::Suspended
3389                    | crate::versioning::VersioningState::Unversioned => {
3390                        crate::versioning::PutOutcome {
3391                            version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3392                            versioned_response: false,
3393                        }
3394                    }
3395                });
3396            if let Some(ref pv) = pending_version
3397                && pv.versioned_response
3398            {
3399                req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3400            }
3401            // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
3402            // the move into `req.input` is consumed by the backend call.
3403            // The sidecar's `source_compressed_size` is checked against
3404            // the live HEAD `Content-Length` on Range GET to detect a
3405            // backend-side mutation.
3406            let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
3407            let mut backend_resp = self.backend.put_object(req).await;
3408            // v0.9 #106 (Codex P2): on the SSE-S4 chunked PUT path,
3409            // if we *couldn't* recover the per-PUT salt / key_id /
3410            // chunk_count (= `sse_binding.is_none()`), we MUST NOT
3411            // emit any sidecar — the bytes on disk are S4E6-encrypted
3412            // and the offsets in `sidecar_index` are pre-encrypt. A
3413            // v2 sidecar (sans SSE binding) would skip the encryption-
3414            // aware GET fast-path AND skip the v0.8.12 #120 buffered
3415            // fallback (the GET path treats a present sidecar as
3416            // "use partial_range_get on the backend body"), so it
3417            // would slice ciphertext at plaintext offsets, hand wrong
3418            // bytes to the frame parser, and 500 (or worse, return
3419            // garbage that decodes by accident). Drop the sidecar so
3420            // the GET falls back to buffered = correct.
3421            let suppress_sidecar_for_failed_sse_binding =
3422                sse_s4_chunked_path && sse_binding.is_none();
3423            if let Some(mut idx) = sidecar_index
3424                && let Ok(ref resp) = backend_resp
3425                && idx.entries.len() > 1
3426                && !suppress_sidecar_for_failed_sse_binding
3427            {
3428                // 1 chunk しかない (small object) なら sidecar は意味がない (=
3429                // partial fetch しても full body と同じ範囲) ので省略。
3430                // Sidecar は user-visible key で書く (latest version の
3431                // partial fetch path 用)。Old versions の Range GET は今 task
3432                // の scope 外 (full read fallback でも意味的には正しい)。
3433                //
3434                // v0.8.4 #73 H-2: stamp the version-binding fields the
3435                // GET path needs to detect a stale / attacker-written
3436                // sidecar. ETag comes from the backend's PUT response —
3437                // when missing (some backends don't return an ETag) no
3438                // substitute identifier is synthesized: `source_etag`
3439                // stays `None`, on the expectation that the GET HEAD
3440                // sees the same missing ETag (None vs None) and treats
3441                // the pair as consistent.
3442                let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
3443                idx.source_etag = source_etag;
3444                idx.source_compressed_size = backend_object_size;
3445                // v0.9 #106: stamp the SSE chunked binding so the GET
3446                // path can run the encrypted Range partial-fetch
3447                // fast-path. `None` keeps the sidecar at v2 layout
3448                // (= existing behaviour for non-SSE-S4-chunked PUTs).
3449                idx.sse_v3 = sse_binding;
3450                self.write_sidecar(&put_bucket, &put_key, &idx).await;
3451            }
3452            // v0.5 #34: commit the new version into the manager only on
3453            // backend success. Use the pre-allocated vid so the response
3454            // header and the chain entry agree.
3455            if let (Some(mgr), Some(pv), Ok(resp)) = (
3456                self.versioning.as_ref(),
3457                pending_version.as_ref(),
3458                backend_resp.as_mut(),
3459            ) {
3460                let etag = resp
3461                    .output
3462                    .e_tag
3463                    .clone()
3464                    .map(ETag::into_value)
3465                    .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
3466                let now = chrono::Utc::now();
3467                mgr.commit_put_with_version(
3468                    &put_bucket,
3469                    &put_key,
3470                    crate::versioning::VersionEntry {
3471                        version_id: pv.version_id.clone(),
3472                        etag,
3473                        size: original_size,
3474                        is_delete_marker: false,
3475                        created_at: now,
3476                    },
3477                );
3478                if pv.versioned_response {
3479                    resp.output.version_id = Some(pv.version_id.clone());
3480                }
3481            }
3482            // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
3483            // so the client knows the server actually applied the
3484            // requested algorithm and which key fingerprint matched.
3485            if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
3486                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3487                resp.output.sse_customer_key_md5 =
3488                    Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3489            }
3490            // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
3491            // the backend returned (AWS KMS returns the ARN even when
3492            // the request used an alias).
3493            if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
3494                resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3495                    ServerSideEncryption::AWS_KMS,
3496                ));
3497                resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
3498            }
3499            // v0.5 #30: persist any per-PUT explicit retention / legal
3500            // hold the client supplied, then auto-apply the bucket
3501            // default (no-op when state is already populated). The
3502            // explicit fields take precedence — the bucket-default
3503            // helper bails out as soon as it sees any retention.
3504            if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3505                if explicit_lock_mode.is_some()
3506                    || explicit_retain_until.is_some()
3507                    || explicit_legal_hold_on.is_some()
3508                {
3509                    let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3510                    if let Some(m) = explicit_lock_mode {
3511                        state.mode = Some(m);
3512                    }
3513                    if let Some(u) = explicit_retain_until {
3514                        state.retain_until = Some(u);
3515                    }
3516                    if let Some(lh) = explicit_legal_hold_on {
3517                        state.legal_hold_on = lh;
3518                    }
3519                    mgr.set(&put_bucket, &put_key, state);
3520                }
3521                mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3522            }
3523            let _ = (original_size, compressed_size); // mute unused warnings
3524            let elapsed = put_start.elapsed();
3525            crate::metrics::record_put(
3526                codec_label,
3527                original_size,
3528                compressed_size,
3529                elapsed.as_secs_f64(),
3530                backend_resp.is_ok(),
3531            );
3532            // v0.4 #20: structured access-log entry (best-effort).
3533            self.record_access(
3534                access_preamble,
3535                "REST.PUT.OBJECT",
3536                &put_bucket,
3537                Some(&put_key),
3538                if backend_resp.is_ok() { 200 } else { 500 },
3539                compressed_size,
3540                original_size,
3541                elapsed.as_millis() as u64,
3542                backend_resp.as_ref().err().map(|e| e.code().as_str()),
3543            )
3544            .await;
3545            info!(
3546                op = "put_object",
3547                bucket = %put_bucket,
3548                key = %put_key,
3549                codec = codec_label,
3550                bytes_in = original_size,
3551                bytes_out = compressed_size,
3552                ratio = format!(
3553                    "{:.3}",
3554                    if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
3555                ),
3556                latency_ms = elapsed.as_millis() as u64,
3557                ok = backend_resp.is_ok(),
3558                "S4 put completed"
3559            );
3560            // v0.6 #35: fire bucket-notification destinations (best-effort,
3561            // detached). Skipped when no manager is attached or when the
3562            // bucket has no rule matching `s3:ObjectCreated:Put` for this
3563            // key.
3564            if backend_resp.is_ok()
3565                && let Some(mgr) = self.notifications.as_ref()
3566            {
3567                let dests = mgr.match_destinations(
3568                    &put_bucket,
3569                    &crate::notifications::EventType::ObjectCreatedPut,
3570                    &put_key,
3571                );
3572                if !dests.is_empty() {
3573                    let etag = backend_resp
3574                        .as_ref()
3575                        .ok()
3576                        .and_then(|r| r.output.e_tag.clone())
3577                        .map(ETag::into_value);
3578                    let version_id = pending_version
3579                        .as_ref()
3580                        .filter(|pv| pv.versioned_response)
3581                        .map(|pv| pv.version_id.clone());
3582                    tokio::spawn(crate::notifications::dispatch_event(
3583                        Arc::clone(mgr),
3584                        put_bucket.clone(),
3585                        put_key.clone(),
3586                        crate::notifications::EventType::ObjectCreatedPut,
3587                        Some(original_size),
3588                        etag,
3589                        version_id,
3590                        format!("S4-{}", uuid::Uuid::new_v4()),
3591                    ));
3592                }
3593            }
3594            // v0.6 #39: persist parsed `x-amz-tagging` tags into the
3595            // tagging manager on a successful PUT. AWS PutObject's
3596            // tagging is a full-replace operation (not a merge), so
3597            // any pre-existing entry for `(bucket, key)` is overwritten.
3598            if backend_resp.is_ok()
3599                && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3600            {
3601                mgr.put_object_tags(&put_bucket, &put_key, tags);
3602            }
3603            // v0.6 #40: cross-bucket replication fire-point. On
3604            // successful source PUT, consult the replication manager;
3605            // when an enabled rule matches, mark the source key
3606            // `Pending` and spawn a detached task that PUTs the same
3607            // backend bytes + metadata to the rule's destination
3608            // bucket. The dispatcher itself records `Completed` /
3609            // `Failed` and bumps the drop counter on retry-budget
3610            // exhaustion.
3611            self.spawn_replication_if_matched(
3612                &put_bucket,
3613                &put_key,
3614                &request_tags,
3615                &replication_body,
3616                &replication_metadata,
3617                backend_resp.is_ok(),
3618                pending_version.as_ref(),
3619            );
3620            return backend_resp;
3621        }
3622        // Body-less PUT (rare: zero-length object). Mirror the body-full
3623        // versioning hooks so list_object_versions / GET-by-version still see
3624        // empty-body objects in the chain.
3625        let pending_version: Option<crate::versioning::PutOutcome> = self
3626            .versioning
3627            .as_ref()
3628            .map(|mgr| mgr.state(&put_bucket))
3629            .map(|state| match state {
3630                crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3631                    version_id: crate::versioning::VersioningManager::new_version_id(),
3632                    versioned_response: true,
3633                },
3634                _ => crate::versioning::PutOutcome {
3635                    version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3636                    versioned_response: false,
3637                },
3638            });
3639        if let Some(ref pv) = pending_version
3640            && pv.versioned_response
3641        {
3642            req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3643        }
3644        let mut backend_resp = self.backend.put_object(req).await;
3645        if let (Some(mgr), Some(pv), Ok(resp)) = (
3646            self.versioning.as_ref(),
3647            pending_version.as_ref(),
3648            backend_resp.as_mut(),
3649        ) {
3650            let etag = resp
3651                .output
3652                .e_tag
3653                .clone()
3654                .map(ETag::into_value)
3655                .unwrap_or_default();
3656            let now = chrono::Utc::now();
3657            mgr.commit_put_with_version(
3658                &put_bucket,
3659                &put_key,
3660                crate::versioning::VersionEntry {
3661                    version_id: pv.version_id.clone(),
3662                    etag,
3663                    size: 0,
3664                    is_delete_marker: false,
3665                    created_at: now,
3666                },
3667            );
3668            if pv.versioned_response {
3669                resp.output.version_id = Some(pv.version_id.clone());
3670            }
3671        }
3672        // v0.5 #30: same explicit-then-default lock-state commit as the
3673        // body-bearing branch above, so a zero-length PUT also picks up
3674        // bucket-default retention.
3675        if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3676            if explicit_lock_mode.is_some()
3677                || explicit_retain_until.is_some()
3678                || explicit_legal_hold_on.is_some()
3679            {
3680                let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3681                if let Some(m) = explicit_lock_mode {
3682                    state.mode = Some(m);
3683                }
3684                if let Some(u) = explicit_retain_until {
3685                    state.retain_until = Some(u);
3686                }
3687                if let Some(lh) = explicit_legal_hold_on {
3688                    state.legal_hold_on = lh;
3689                }
3690                mgr.set(&put_bucket, &put_key, state);
3691            }
3692            mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3693        }
3694        // v0.6 #35: same notification fire-point as the body-bearing PUT
3695        // branch above (zero-length objects still match `ObjectCreated:Put`
3696        // rules per the AWS event taxonomy).
3697        if backend_resp.is_ok()
3698            && let Some(mgr) = self.notifications.as_ref()
3699        {
3700            let dests = mgr.match_destinations(
3701                &put_bucket,
3702                &crate::notifications::EventType::ObjectCreatedPut,
3703                &put_key,
3704            );
3705            if !dests.is_empty() {
3706                let etag = backend_resp
3707                    .as_ref()
3708                    .ok()
3709                    .and_then(|r| r.output.e_tag.clone())
3710                    .map(ETag::into_value);
3711                let version_id = pending_version
3712                    .as_ref()
3713                    .filter(|pv| pv.versioned_response)
3714                    .map(|pv| pv.version_id.clone());
3715                tokio::spawn(crate::notifications::dispatch_event(
3716                    Arc::clone(mgr),
3717                    put_bucket.clone(),
3718                    put_key.clone(),
3719                    crate::notifications::EventType::ObjectCreatedPut,
3720                    Some(0),
3721                    etag,
3722                    version_id,
3723                    format!("S4-{}", uuid::Uuid::new_v4()),
3724                ));
3725            }
3726        }
3727        // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3728        // (zero-length) PUT branch too — same shape as the body-bearing
3729        // branch above.
3730        if backend_resp.is_ok()
3731            && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3732        {
3733            mgr.put_object_tags(&put_bucket, &put_key, tags);
3734        }
3735        // v0.6 #40: cross-bucket replication for the zero-length PUT
3736        // branch — same shape as the body-bearing branch above.
3737        // v0.8.2 #61: pass `pending_version` so a versioned source's
3738        // destination receives the same shadow-key path.
3739        self.spawn_replication_if_matched(
3740            &put_bucket,
3741            &put_key,
3742            &request_tags,
3743            &bytes::Bytes::new(),
3744            &None,
3745            backend_resp.is_ok(),
3746            pending_version.as_ref(),
3747        );
3748        backend_resp
3749    }
3750
3751    // === 圧縮を解く path (GET) ===
3752    #[tracing::instrument(
3753        name = "s4.get_object",
3754        skip(self, req),
3755        fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3756    )]
3757    async fn get_object(
3758        &self,
3759        mut req: S3Request<GetObjectInput>,
3760    ) -> S3Result<S3Response<GetObjectOutput>> {
3761        let get_start = Instant::now();
3762        let get_bucket = req.input.bucket.clone();
3763        let get_key = req.input.key.clone();
3764        // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
3765        self.check_not_reserved_key(&get_key, ReservedKeyMode::Read)?;
3766        self.enforce_rate_limit(&req, &get_bucket)?;
3767        self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3768        // Range request の事前検出 (decompress 後 slice する path に使う)。
3769        let range_request = req.input.range.take();
3770        // v0.5 #27: pull SSE-C material from the input headers before
3771        // the request is moved into the backend. A header parse error
3772        // fails fast (no body fetch). The material is consumed below
3773        // when decrypting an S4E3-framed body; the SSE-C headers on
3774        // `req.input` are cleared so the backend doesn't see them.
3775        let sse_c_alg = req.input.sse_customer_algorithm.take();
3776        let sse_c_key = req.input.sse_customer_key.take();
3777        let sse_c_md5 = req.input.sse_customer_key_md5.take();
3778        let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3779
3780        // v0.5 #34: route the GET through the VersioningManager when
3781        // attached AND the bucket is in a versioning-aware state.
3782        // Resolves which version to fetch (explicit `?versionId=` query
3783        // param vs. chain latest), translates a delete-marker into 404
3784        // NoSuchKey, and rewrites the backend storage key to the shadow
3785        // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3786        // versions. `resolved_version_id` is stamped onto the response
3787        // so clients see a coherent `x-amz-version-id` header.
3788        //
3789        // When the bucket is Unversioned (or no manager attached), the
3790        // chain-resolution step is skipped and the request flows
3791        // through the existing single-key path unchanged.
3792        let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3793            Some(mgr)
3794                if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3795            {
3796                let req_vid = req.input.version_id.take();
3797                let entry = match req_vid.as_deref() {
3798                    Some(vid) => {
3799                        mgr.lookup_version(&get_bucket, &get_key, vid)
3800                            .ok_or_else(|| {
3801                                S3Error::with_message(
3802                                    S3ErrorCode::NoSuchVersion,
3803                                    format!("no such version: {vid}"),
3804                                )
3805                            })?
3806                    }
3807                    None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3808                        S3Error::with_message(
3809                            S3ErrorCode::NoSuchKey,
3810                            format!("no such key: {get_key}"),
3811                        )
3812                    })?,
3813                };
3814                if entry.is_delete_marker {
3815                    // S3 spec: GET without versionId on a
3816                    // delete-marker latest → 404 NoSuchKey + the
3817                    // response carries `x-amz-delete-marker: true`.
3818                    // GET with explicit versionId pointing at a delete
3819                    // marker → 405 MethodNotAllowed; we surface
3820                    // NoSuchKey here for both since s3s collapses them
3821                    // into the same not-found error path.
3822                    return Err(S3Error::with_message(
3823                        S3ErrorCode::NoSuchKey,
3824                        format!("delete marker is the current version of {get_key}"),
3825                    ));
3826                }
3827                if entry.version_id != crate::versioning::NULL_VERSION_ID {
3828                    req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3829                }
3830                Some(entry.version_id)
3831            }
3832            _ => None,
3833        };
3834
3835        // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3836        // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3837        // 必要 frame だけを backend に Range GET し帯域節約する。
3838        //
3839        // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3840        // verify the source object hasn't been overwritten / mutated since
3841        // the sidecar was stamped. The sidecar carries the backend ETag
3842        // captured at PUT time (`source_etag`); a HEAD against the current
3843        // backend object tells us the live ETag. If they disagree we treat
3844        // the sidecar as stale and fall through to the full-GET path —
3845        // returning the wrong frames for a Range request would surface as
3846        // a CRC mismatch deeper in the stack but would also potentially
3847        // disclose unrelated frames if a hostile operator wrote the
3848        // sidecar themselves. Fail-open to "full read" is the safe default.
3849        //
3850        // Legacy v1 sidecars (no `source_etag` populated) keep the old
3851        // best-effort behaviour so existing on-disk indexes don't suddenly
3852        // start missing the partial-fetch path.
3853        if let Some(ref r) = range_request
3854            && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3855            && self
3856                .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3857                .await
3858        {
3859            let total = index.total_original_size();
3860            let (start, end_exclusive) = match resolve_range(r, total) {
3861                Ok(v) => v,
3862                Err(e) => {
3863                    return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3864                }
3865            };
3866            if let Some(plan) = index.lookup_range(start, end_exclusive) {
3867                // v0.9 #106: v3 sidecar with an SSE chunked binding →
3868                // encrypted partial-fetch fast-path. SSE-S4 chunked
3869                // (S4E6) is the only scope-in encryption mode; for
3870                // every other case (v1 / v2 sidecar) we fall through
3871                // to the existing pre-encrypt `partial_range_get`.
3872                // SSE-KMS / SSE-C / S4E2 buffered never get a
3873                // sidecar emitted (see PUT path `sidecar_index`
3874                // condition), so they trivially take the existing
3875                // buffered fallback further down.
3876                //
3877                // Codex P2 (round 2): when the sidecar HAS an SSE
3878                // binding but `encrypted_lookup` returns `None` (=
3879                // stale / corrupted chunk geometry, or a Range that
3880                // falls outside the declared `enc_plaintext_len`),
3881                // we must NOT fall through to `partial_range_get`
3882                // — that would slice the S4E6 ciphertext at
3883                // pre-encrypt offsets and either 500 or return
3884                // garbage. Skip the fast-path entirely so the
3885                // buffered fallback below decrypts + frame-parses
3886                // correctly.
3887                if let Some(sse) = index.sse_v3.as_ref() {
3888                    if let Some(enc_plan) = index.encrypted_lookup(&plan) {
3889                        return self
3890                            .partial_range_get_encrypted(
3891                                &req,
3892                                plan,
3893                                enc_plan,
3894                                *sse,
3895                                start,
3896                                end_exclusive,
3897                                total,
3898                                get_start,
3899                            )
3900                            .await;
3901                    }
3902                    // Encrypted body + binding present but
3903                    // `encrypted_lookup` refused (= sidecar /
3904                    // body mismatch). Fall through to the buffered
3905                    // full-GET below — safer than slicing
3906                    // ciphertext with pre-encrypt offsets.
3907                    //
3908                    // Data-flow note: `req.input.range` was
3909                    // already `.take()`-ed into `range_request` at
3910                    // L3695, so the subsequent
3911                    // `self.backend.get_object(req)` carries no
3912                    // Range header (= full body fetch). The local
3913                    // `range_request` is then re-applied to the
3914                    // *decrypted + decompressed* plaintext by the
3915                    // buffered slice path further down. Without
3916                    // the `.take()` above, we'd have to clear it
3917                    // explicitly here or we'd slice ciphertext.
3918                } else {
3919                    return self
3920                        .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3921                        .await;
3922                }
3923            }
3924        }
3925        let mut resp = self.backend.get_object(req).await?;
3926        // v0.5 #34: stamp the resolved version-id so the client sees a
3927        // coherent `x-amz-version-id` header (only for chains owned by
3928        // the manager — Unversioned buckets / no-manager paths never
3929        // set this).
3930        if let Some(ref vid) = resolved_version_id {
3931            resp.output.version_id = Some(vid.clone());
3932        }
3933        let is_multipart = is_multipart_object(&resp.output.metadata);
3934        let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3935        // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3936        // multipart と同じ path に流す。
3937        let needs_frame_parse = is_multipart || is_framed_v2;
3938        let manifest_opt = extract_manifest(&resp.output.metadata);
3939
3940        if !needs_frame_parse && manifest_opt.is_none() {
3941            // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3942            debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3943            return Ok(resp);
3944        }
3945
3946        if let Some(blob) = resp.output.body.take() {
3947            // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3948            // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3949            // before any frame parse / streaming decompress. Encrypted
3950            // bodies are opaque to the codec; this also forces the
3951            // buffered path because AES-GCM needs the full body for tag
3952            // verify. SSE-C uses the per-request customer key, SSE-S4
3953            // falls back to the configured keyring.
3954            let blob = if is_sse_encrypted(&resp.output.metadata) {
3955                let body = collect_blob(blob, self.max_body_bytes)
3956                    .await
3957                    .map_err(internal("collect SSE-encrypted body"))?;
3958                // v0.5 #28: peek the frame magic to route the right
3959                // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3960                // through the KMS backend (async). S4E1/E2/E3 take
3961                // the sync path (keyring or customer key).
3962                //
3963                // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3964                // SSE-S4 frames take the *streaming* path — we hand
3965                // the response body a per-chunk verify-and-emit
3966                // Stream so the client sees chunk 0 plaintext after
3967                // one chunk-worth of AES-GCM verify (vs. waiting
3968                // for the whole body's tag), and the gateway no
3969                // longer needs to materialize the full plaintext
3970                // in memory before responding. SSE-C is out of
3971                // scope for the chunked path (chunked S4E3 is a
3972                // follow-up), so this branch requires the SSE-S4
3973                // keyring to be wired and `get_sse_c_material` to
3974                // be absent — otherwise we surface a clear
3975                // misconfiguration error instead of silently
3976                // falling through to the buffered chunked path.
3977                // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3978                // only correct when the decrypted body IS the user's
3979                // plaintext as-stored. If the object went through the
3980                // codec (compressed) or carries S4F2 frames, returning
3981                // the decrypt stream directly hands the client
3982                // compressed / framed bytes. Restrict the early-return
3983                // to codec=Passthrough + non-framed objects; everything
3984                // else falls through to the buffered path, which
3985                // decrypt-buffers S4E5/S4E6 via
3986                // `decrypt_chunked_buffered_default` and then runs the
3987                // existing decompress pipeline.
3988                let chunked_streaming_safe = !needs_frame_parse
3989                    && manifest_opt
3990                        .as_ref()
3991                        .map(|m| m.codec == CodecKind::Passthrough)
3992                        .unwrap_or(false);
3993                if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3994                    && get_sse_c_material.is_none()
3995                    && chunked_streaming_safe
3996                {
3997                    let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3998                        S3Error::with_message(
3999                            S3ErrorCode::InvalidRequest,
4000                            "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
4001                        )
4002                    })?;
4003                    let body_len = body.len() as u64;
4004                    let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
4005                    // Stream is `'static` (the keyring borrow is
4006                    // consumed up front; the cipher lives inside
4007                    // the stream state — see decrypt_chunked_stream
4008                    // doc), so we can move it straight into a
4009                    // StreamingBlob without lifetime gymnastics.
4010                    use futures::StreamExt;
4011                    let mapped = stream.map(|r| {
4012                        r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
4013                    });
4014                    use s3s::dto::StreamingBlob;
4015                    resp.output.body = Some(StreamingBlob::wrap(mapped));
4016                    // Plaintext content_length is unknown until all
4017                    // chunks have been verified; null it out so the
4018                    // ByteStream wrapper reports `unknown` to the
4019                    // HTTP layer (which then emits chunked transfer-
4020                    // encoding) rather than lying about the size.
4021                    resp.output.content_length = None;
4022                    // The backend's checksums + ETag describe the
4023                    // encrypted body (S4E5/S4E6 wire format), not
4024                    // the plaintext we're about to stream — clear them
4025                    // so the AWS SDK doesn't fail the GET with a
4026                    // ChecksumMismatch on a successful round-trip.
4027                    // Mirrors the streaming-zstd path at L1180-1185.
4028                    resp.output.checksum_crc32 = None;
4029                    resp.output.checksum_crc32c = None;
4030                    resp.output.checksum_crc64nvme = None;
4031                    resp.output.checksum_sha1 = None;
4032                    resp.output.checksum_sha256 = None;
4033                    resp.output.e_tag = None;
4034                    let elapsed = get_start.elapsed();
4035                    crate::metrics::record_get(
4036                        "sse-s4-chunked",
4037                        body_len,
4038                        body_len,
4039                        elapsed.as_secs_f64(),
4040                        true,
4041                    );
4042                    return Ok(resp);
4043                }
4044                let plain = match crate::sse::peek_magic(&body) {
4045                    Some("S4E4") => {
4046                        let kms = self.kms.as_ref().ok_or_else(|| {
4047                            S3Error::with_message(
4048                                S3ErrorCode::InvalidRequest,
4049                                "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4050                            )
4051                        })?;
4052                        let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
4053                        crate::sse::decrypt_with_kms(&body, kms_ref)
4054                            .await
4055                            .map_err(|e| match e {
4056                                crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
4057                                other => S3Error::with_message(
4058                                    S3ErrorCode::InternalError,
4059                                    format!("SSE-KMS decrypt failed: {other}"),
4060                                ),
4061                            })?
4062                    }
4063                    _ => {
4064                        if let Some(ref m) = get_sse_c_material {
4065                            crate::sse::decrypt(
4066                                &body,
4067                                crate::sse::SseSource::CustomerKey {
4068                                    key: &m.key,
4069                                    key_md5: &m.key_md5,
4070                                },
4071                            )
4072                            .map_err(sse_c_error_to_s3)?
4073                        } else {
4074                            let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
4075                                S3Error::with_message(
4076                                    S3ErrorCode::InvalidRequest,
4077                                    "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
4078                                )
4079                            })?;
4080                            crate::sse::decrypt(&body, keyring).map_err(|e| {
4081                                S3Error::with_message(
4082                                    S3ErrorCode::InternalError,
4083                                    format!("SSE-S4 decrypt failed: {e}"),
4084                                )
4085                            })?
4086                        }
4087                    }
4088                };
4089                // v0.5 #28: parse out the on-disk wrapped DEK's key id
4090                // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
4091                if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
4092                    && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
4093                {
4094                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
4095                        ServerSideEncryption::AWS_KMS,
4096                    ));
4097                    resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
4098                }
4099                bytes_to_blob(plain)
4100            } else if let Some(ref m) = get_sse_c_material {
4101                // Client sent SSE-C headers for an unencrypted object —
4102                // mirror AWS S3's 400 InvalidRequest.
4103                let _ = m;
4104                return Err(sse_c_error_to_s3(
4105                    crate::sse::SseError::CustomerKeyUnexpected,
4106                ));
4107            } else {
4108                blob
4109            };
4110            // v0.5 #27: SSE-C echo on success — algorithm + key MD5
4111            // tell the client that the supplied key was the one used.
4112            if let Some(ref m) = get_sse_c_material {
4113                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
4114                resp.output.sse_customer_key_md5 =
4115                    Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
4116            }
4117            // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
4118            // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
4119            // codec が streaming-aware なら body を chunk-by-chunk で decompress して
4120            // 即座に client に流す。
4121            //
4122            // ただし Range request 時は streaming できない (slice するため total bytes
4123            // が必要) → buffered path に fall through。
4124            if range_request.is_none()
4125                && !needs_frame_parse
4126                && let Some(ref m) = manifest_opt
4127                && supports_streaming_decompress(m.codec)
4128                && m.codec == CodecKind::CpuZstd
4129            {
4130                // v0.8.4 #73 H-1: wrap the decompressor output in a
4131                // rolling-CRC32C verifier so a tampered ciphertext (or a
4132                // backend-side corruption that the zstd decoder happens
4133                // to "successfully" decode into wrong bytes) surfaces as
4134                // a streaming error tail at EOF instead of silently
4135                // delivering corrupt plaintext to the client. The wrap
4136                // is a pure pass-through during the body — no extra
4137                // buffering, TTFB unaffected — and the integrity
4138                // decision lands at the last chunk.
4139                let decompressed_blob = cpu_zstd_decompress_stream(blob);
4140                let verified_reader = Crc32cVerifyingReader::new(
4141                    blob_to_async_read(decompressed_blob),
4142                    m.crc32c,
4143                    m.original_size,
4144                );
4145                let verified_blob = async_read_to_blob(verified_reader);
4146                resp.output.content_length = Some(m.original_size as i64);
4147                resp.output.checksum_crc32 = None;
4148                resp.output.checksum_crc32c = None;
4149                resp.output.checksum_crc64nvme = None;
4150                resp.output.checksum_sha1 = None;
4151                resp.output.checksum_sha256 = None;
4152                resp.output.e_tag = None;
4153                resp.output.body = Some(verified_blob);
4154                let elapsed = get_start.elapsed();
4155                crate::metrics::record_get(
4156                    m.codec.as_str(),
4157                    m.compressed_size,
4158                    m.original_size,
4159                    elapsed.as_secs_f64(),
4160                    true,
4161                );
4162                info!(
4163                    op = "get_object",
4164                    bucket = %get_bucket,
4165                    key = %get_key,
4166                    codec = m.codec.as_str(),
4167                    bytes_in = m.compressed_size,
4168                    bytes_out = m.original_size,
4169                    path = "streaming",
4170                    setup_latency_ms = elapsed.as_millis() as u64,
4171                    "S4 get started (streaming)"
4172                );
4173                return Ok(resp);
4174            }
4175            // Passthrough: そのまま流す (Range なしの場合のみ streaming)
4176            if range_request.is_none()
4177                && !needs_frame_parse
4178                && let Some(ref m) = manifest_opt
4179                && m.codec == CodecKind::Passthrough
4180            {
4181                resp.output.content_length = Some(m.original_size as i64);
4182                resp.output.checksum_crc32 = None;
4183                resp.output.checksum_crc32c = None;
4184                resp.output.checksum_crc64nvme = None;
4185                resp.output.checksum_sha1 = None;
4186                resp.output.checksum_sha256 = None;
4187                resp.output.e_tag = None;
4188                resp.output.body = Some(blob);
4189                debug!("S4 get_object: passthrough streaming");
4190                return Ok(resp);
4191            }
4192
4193            // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
4194            let bytes = collect_blob(blob, self.max_body_bytes)
4195                .await
4196                .map_err(internal("collect get body"))?;
4197
4198            let decompressed = if needs_frame_parse {
4199                // multipart objects と framed-v2 single-PUT objects は同じ
4200                // S4F2 frame 列なので decompress_multipart で統一処理
4201                self.decompress_multipart(bytes).await?
4202            } else {
4203                let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
4204                self.registry
4205                    .decompress(bytes, manifest)
4206                    .await
4207                    .map_err(internal("registry decompress"))?
4208            };
4209
4210            // Range request があれば slice。なければ full body を返す。
4211            let total_size = decompressed.len() as u64;
4212            let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
4213                let (start, end) = resolve_range(r, total_size)
4214                    .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
4215                let sliced = decompressed.slice(start as usize..end as usize);
4216                resp.output.content_range = Some(format!(
4217                    "bytes {start}-{}/{total_size}",
4218                    end.saturating_sub(1)
4219                ));
4220                (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
4221            } else {
4222                (decompressed, None)
4223            };
4224            // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
4225            // 圧縮 size のままだと downstream が body を途中で切ってしまう)
4226            resp.output.content_length = Some(final_bytes.len() as i64);
4227            // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
4228            // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
4229            // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
4230            // (manifest 内 / frame 内) で integrity を保証する設計にする。
4231            resp.output.checksum_crc32 = None;
4232            resp.output.checksum_crc32c = None;
4233            resp.output.checksum_crc64nvme = None;
4234            resp.output.checksum_sha1 = None;
4235            resp.output.checksum_sha256 = None;
4236            resp.output.e_tag = None;
4237            let returned_size = final_bytes.len() as u64;
4238            let codec_label = manifest_opt
4239                .as_ref()
4240                .map(|m| m.codec.as_str())
4241                .unwrap_or("multipart");
4242            resp.output.body = Some(bytes_to_blob(final_bytes));
4243            if let Some(status) = status_override {
4244                resp.status = Some(status);
4245            }
4246            let elapsed = get_start.elapsed();
4247            crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
4248            info!(
4249                op = "get_object",
4250                bucket = %get_bucket,
4251                key = %get_key,
4252                codec = codec_label,
4253                bytes_out = returned_size,
4254                total_object_size = total_size,
4255                range = range_request.is_some(),
4256                path = "buffered",
4257                latency_ms = elapsed.as_millis() as u64,
4258                "S4 get completed (buffered)"
4259            );
4260        }
4261        // v0.6 #40: echo the recorded `x-amz-replication-status` so
4262        // consumers can poll progress (PENDING / COMPLETED / FAILED).
4263        if let Some(mgr) = self.replication.as_ref()
4264            && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
4265        {
4266            resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
4267                status.as_aws_str().to_owned(),
4268            ));
4269        }
4270        Ok(resp)
4271    }
4272
4273    // === passthrough delegations ===
4274    async fn head_bucket(
4275        &self,
4276        req: S3Request<HeadBucketInput>,
4277    ) -> S3Result<S3Response<HeadBucketOutput>> {
4278        self.backend.head_bucket(req).await
4279    }
4280    async fn list_buckets(
4281        &self,
4282        req: S3Request<ListBucketsInput>,
4283    ) -> S3Result<S3Response<ListBucketsOutput>> {
4284        self.backend.list_buckets(req).await
4285    }
4286    async fn create_bucket(
4287        &self,
4288        req: S3Request<CreateBucketInput>,
4289    ) -> S3Result<S3Response<CreateBucketOutput>> {
4290        self.backend.create_bucket(req).await
4291    }
4292    async fn delete_bucket(
4293        &self,
4294        req: S3Request<DeleteBucketInput>,
4295    ) -> S3Result<S3Response<DeleteBucketOutput>> {
4296        self.backend.delete_bucket(req).await
4297    }
4298    async fn head_object(
4299        &self,
4300        req: S3Request<HeadObjectInput>,
4301    ) -> S3Result<S3Response<HeadObjectOutput>> {
4302        // v0.6 #40: capture bucket/key before req is consumed so the
4303        // replication-status echo can look the entry up.
4304        let head_bucket = req.input.bucket.clone();
4305        let head_key = req.input.key.clone();
4306        // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
4307        self.check_not_reserved_key(&head_key, ReservedKeyMode::Read)?;
4308        let mut resp = self.backend.head_object(req).await?;
4309        if let Some(manifest) = extract_manifest(&resp.output.metadata) {
4310            // 客側には decompress 後の意味のある content_length / checksum を返す。
4311            // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
4312            // (S4 は manifest 内の crc32c で integrity を担保する)。
4313            resp.output.content_length = Some(manifest.original_size as i64);
4314            resp.output.checksum_crc32 = None;
4315            resp.output.checksum_crc32c = None;
4316            resp.output.checksum_crc64nvme = None;
4317            resp.output.checksum_sha1 = None;
4318            resp.output.checksum_sha256 = None;
4319            resp.output.e_tag = None;
4320        }
4321        // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
4322        // / FAILED) so consumers can poll progress without a GET.
4323        if let Some(mgr) = self.replication.as_ref()
4324            && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
4325        {
4326            resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
4327                status.as_aws_str().to_owned(),
4328            ));
4329        }
4330        // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
4331        // and pipelines see the same posture they got on PUT. The PUT
4332        // path stamps `s4-sse-type` metadata for exactly this — HEAD
4333        // doesn't fetch the body, so it can't peek frame magic.
4334        if let Some(meta) = resp.output.metadata.as_ref()
4335            && let Some(sse_type) = meta.get("s4-sse-type")
4336        {
4337            {
4338                match sse_type.as_str() {
4339                    "aws:kms" => {
4340                        resp.output.server_side_encryption = Some(
4341                            ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
4342                        );
4343                        if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
4344                            resp.output.ssekms_key_id = Some(key_id.clone());
4345                        }
4346                    }
4347                    _ => {
4348                        resp.output.server_side_encryption = Some(
4349                            ServerSideEncryption::from_static(ServerSideEncryption::AES256),
4350                        );
4351                        if let Some(md5) = meta.get("s4-sse-c-key-md5") {
4352                            resp.output.sse_customer_algorithm =
4353                                Some(crate::sse::SSE_C_ALGORITHM.into());
4354                            resp.output.sse_customer_key_md5 = Some(md5.clone());
4355                        }
4356                    }
4357                }
4358            }
4359        }
4360        Ok(resp)
4361    }
4362    async fn delete_object(
4363        &self,
4364        mut req: S3Request<DeleteObjectInput>,
4365    ) -> S3Result<S3Response<DeleteObjectOutput>> {
4366        let bucket = req.input.bucket.clone();
4367        let key = req.input.key.clone();
4368        // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
4369        // The S4 internal sidecar cleanup path
4370        // (`write_sidecar` and friends) talks to
4371        // `self.backend.delete_object(...)` directly, NOT through
4372        // this trait method, so the guard doesn't break
4373        // legitimate sidecar cleanup.
4374        self.check_not_reserved_key(&key, ReservedKeyMode::Mutating)?;
4375        self.enforce_rate_limit(&req, &bucket)?;
4376        self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
4377        // v0.6 #42: MFA Delete enforcement. When the bucket has
4378        // MFA-Delete = Enabled, every DELETE / DELETE-version /
4379        // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
4380        // 6-digit TOTP). Runs *before* the WORM / versioning routers so
4381        // a missing token is denied for free regardless of which delete
4382        // path the request would otherwise take.
4383        if let Some(mgr) = self.mfa_delete.as_ref()
4384            && mgr.is_enabled(&bucket)
4385        {
4386            let header = req.input.mfa.as_deref();
4387            if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
4388                crate::metrics::record_mfa_delete_denial(&bucket);
4389                return Err(mfa_error_to_s3(e));
4390            }
4391        }
4392        // v0.5 #30: refuse the delete while a WORM lock is in effect.
4393        // Compliance can never be bypassed; Governance can be overridden
4394        // via `x-amz-bypass-governance-retention: true`; legal hold
4395        // never. The check happens before the versioning router so a
4396        // locked object can't be soft-deleted (delete-marker push) on an
4397        // Enabled bucket either — S3 spec says lock applies to all
4398        // delete forms.
4399        if let Some(mgr) = self.object_lock.as_ref()
4400            && let Some(state) = mgr.get(&bucket, &key)
4401        {
4402            let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
4403            // v0.8.12 HIGH-7 fix: the bypass header alone used to be
4404            // enough to override Governance retention. AWS spec
4405            // requires the caller hold `s3:BypassGovernanceRetention`
4406            // for the target ARN; without that, the header is
4407            // silently ignored (not an error — it lines up with how
4408            // AWS' canonical behaviour treats unprivileged callers).
4409            let bypass_allowed = if bypass_header {
4410                self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
4411                    .is_ok()
4412            } else {
4413                false
4414            };
4415            let now = chrono::Utc::now();
4416            if !state.can_delete(now, bypass_allowed) {
4417                crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
4418                return Err(S3Error::with_message(
4419                    S3ErrorCode::AccessDenied,
4420                    "Access Denied because object protected by object lock",
4421                ));
4422            }
4423        }
4424        // v0.5 #34: route DELETE through the VersioningManager when the
4425        // bucket is in a versioning-aware state.
4426        //
4427        // - Enabled bucket, no version_id → push a delete marker into
4428        //   the chain. NO backend object is touched (older versions
4429        //   stay reachable via specific-version GET).
4430        // - Enabled / Suspended bucket, with version_id → physical
4431        //   delete. Backend bytes at the shadow key (or `<key>` for
4432        //   `null`) are removed; chain entry is dropped. If the deleted
4433        //   entry was a delete marker, no backend bytes exist for it
4434        //   (record-only).
4435        // - Suspended bucket, no version_id → push a "null" delete
4436        //   marker (S3 spec); backend bytes at `<key>` are physically
4437        //   removed (same as legacy).
4438        // - Unversioned bucket → fall through to legacy passthrough.
4439        if let Some(mgr) = self.versioning.as_ref() {
4440            let state = mgr.state(&bucket);
4441            if state != crate::versioning::VersioningState::Unversioned {
4442                let req_vid = req.input.version_id.take();
4443                if let Some(vid) = req_vid {
4444                    // Specific-version DELETE: touch backend bytes only
4445                    // when the entry was a real version (not a delete
4446                    // marker, which has no backend bytes).
4447                    let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
4448                    let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
4449                        key.clone()
4450                    } else {
4451                        versioned_shadow_key(&key, &vid)
4452                    };
4453                    let was_real_version = outcome
4454                        .as_ref()
4455                        .map(|o| !o.is_delete_marker)
4456                        .unwrap_or(false);
4457                    if was_real_version {
4458                        // Best-effort backend cleanup; missing bytes
4459                        // are not an error (e.g. shadow key already
4460                        // GC'd).
4461                        let backend_input = DeleteObjectInput {
4462                            bucket: bucket.clone(),
4463                            key: backend_target,
4464                            ..Default::default()
4465                        };
4466                        let backend_req = S3Request {
4467                            input: backend_input,
4468                            method: http::Method::DELETE,
4469                            uri: req.uri.clone(),
4470                            headers: req.headers.clone(),
4471                            extensions: http::Extensions::new(),
4472                            credentials: req.credentials.clone(),
4473                            region: req.region.clone(),
4474                            service: req.service.clone(),
4475                            trailing_headers: None,
4476                        };
4477                        let _ = self.backend.delete_object(backend_req).await;
4478                    }
4479                    let mut output = DeleteObjectOutput {
4480                        version_id: Some(vid.clone()),
4481                        ..Default::default()
4482                    };
4483                    if let Some(o) = outcome.as_ref()
4484                        && o.is_delete_marker
4485                    {
4486                        output.delete_marker = Some(true);
4487                    }
4488                    // v0.6 #35: specific-version DELETE always counts as
4489                    // a hard `ObjectRemoved:Delete` event (the chain
4490                    // entry, marker or not, is gone after this call).
4491                    self.fire_delete_notification(
4492                        &bucket,
4493                        &key,
4494                        crate::notifications::EventType::ObjectRemovedDelete,
4495                        Some(vid.clone()),
4496                    );
4497                    return Ok(S3Response::new(output));
4498                }
4499                // No version_id: record a delete marker (state-aware).
4500                let outcome = mgr.record_delete(&bucket, &key);
4501                if state == crate::versioning::VersioningState::Suspended {
4502                    // Suspended buckets also evict the prior `<key>`
4503                    // bytes (the previous null version is gone too).
4504                    let backend_input = DeleteObjectInput {
4505                        bucket: bucket.clone(),
4506                        key: key.clone(),
4507                        ..Default::default()
4508                    };
4509                    let backend_req = S3Request {
4510                        input: backend_input,
4511                        method: http::Method::DELETE,
4512                        uri: req.uri.clone(),
4513                        headers: req.headers.clone(),
4514                        extensions: http::Extensions::new(),
4515                        credentials: req.credentials.clone(),
4516                        region: req.region.clone(),
4517                        service: req.service.clone(),
4518                        trailing_headers: None,
4519                    };
4520                    let _ = self.backend.delete_object(backend_req).await;
4521                }
4522                let output = DeleteObjectOutput {
4523                    delete_marker: Some(true),
4524                    version_id: outcome.version_id.clone(),
4525                    ..Default::default()
4526                };
4527                // v0.6 #35: versioned bucket DELETE without a version-id
4528                // creates a delete marker — the dedicated AWS event
4529                // taxonomy entry. Suspended-state buckets also push a
4530                // (null) marker, so the same event fires there.
4531                self.fire_delete_notification(
4532                    &bucket,
4533                    &key,
4534                    crate::notifications::EventType::ObjectRemovedDeleteMarker,
4535                    outcome.version_id,
4536                );
4537                return Ok(S3Response::new(output));
4538            }
4539        }
4540        // Legacy / Unversioned path: physical delete on the backend +
4541        // best-effort sidecar cleanup (mirrors v0.4 behaviour).
4542        let resp = self.backend.delete_object(req).await?;
4543        // v0.5 #30: drop any per-object lock state once the delete has
4544        // succeeded so the freed key can be re-armed by a future PUT
4545        // under the bucket default. Reaching here implies the lock had
4546        // already passed `can_delete` above, so this is purely cleanup.
4547        if let Some(mgr) = self.object_lock.as_ref() {
4548            mgr.clear(&bucket, &key);
4549        }
4550        // v0.6 #39: drop any object-level tag set on physical delete —
4551        // the freed key starts a fresh tag history if a future PUT
4552        // re-creates it. (Versioned-delete branches above return early
4553        // and do NOT touch tags, mirroring AWS where tag state is
4554        // attached to the logical key, not the version chain.)
4555        if let Some(mgr) = self.tagging.as_ref() {
4556            mgr.delete_object_tags(&bucket, &key);
4557        }
4558        let sidecar = sidecar_key(&key);
4559        // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
4560        // can't be encoded into a request URI — the primary delete
4561        // already succeeded and a stale sidecar is harmless (Range GET
4562        // re-validates the underlying object on next read).
4563        if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
4564            let sidecar_input = DeleteObjectInput {
4565                bucket: bucket.clone(),
4566                key: sidecar,
4567                ..Default::default()
4568            };
4569            let sidecar_req = S3Request {
4570                input: sidecar_input,
4571                method: http::Method::DELETE,
4572                uri,
4573                headers: http::HeaderMap::new(),
4574                extensions: http::Extensions::new(),
4575                credentials: None,
4576                region: None,
4577                service: None,
4578                trailing_headers: None,
4579            };
4580            let _ = self.backend.delete_object(sidecar_req).await;
4581        }
4582        // v0.6 #35: legacy unversioned-bucket hard delete fires the
4583        // canonical `ObjectRemoved:Delete` event.
4584        self.fire_delete_notification(
4585            &bucket,
4586            &key,
4587            crate::notifications::EventType::ObjectRemovedDelete,
4588            None,
4589        );
4590        Ok(resp)
4591    }
4592    async fn delete_objects(
4593        &self,
4594        req: S3Request<DeleteObjectsInput>,
4595    ) -> S3Result<S3Response<DeleteObjectsOutput>> {
4596        // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
4597        // when MFA-Delete is on the bucket, a missing / invalid token
4598        // fails the entire DeleteObjects request, not per-object).
4599        if let Some(mgr) = self.mfa_delete.as_ref()
4600            && mgr.is_enabled(&req.input.bucket)
4601        {
4602            let header = req.input.mfa.as_deref();
4603            if let Err(e) =
4604                crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
4605            {
4606                crate::metrics::record_mfa_delete_denial(&req.input.bucket);
4607                return Err(mfa_error_to_s3(e));
4608            }
4609        }
4610        // v0.8.11 CRIT-3 fix: route every entry through the gated
4611        // per-object `delete_object` path so Object Lock, IAM policy,
4612        // versioning, tagging, sidecar cleanup and notification fan-
4613        // out all fire for batch DELETE. The previous
4614        // `self.backend.delete_objects(req).await` straight-through
4615        // bypassed every gate, so a `legal_hold=on` key listed inside
4616        // a DeleteObjects XML was happily removed.
4617        //
4618        // S3 spec note: DeleteObjects is "best-effort per object" —
4619        // a failure on one key surfaces as an `Errors` entry without
4620        // aborting the rest of the batch. Quiet-mode suppresses the
4621        // `Deleted` list (errors are still reported). We honour both.
4622        let bucket = req.input.bucket.clone();
4623        let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
4624        let mfa_header = req.input.mfa.clone();
4625        let quiet = req.input.delete.quiet.unwrap_or(false);
4626        let mut deleted: Vec<DeletedObject> = Vec::new();
4627        let mut errors: Vec<s3s::dto::Error> = Vec::new();
4628        for ident in req.input.delete.objects.iter() {
4629            let key = ident.key.clone();
4630            let version_id = ident.version_id.clone();
4631            let per_input = DeleteObjectInput {
4632                bucket: bucket.clone(),
4633                key: key.clone(),
4634                version_id: version_id.clone(),
4635                bypass_governance_retention: Some(bypass_governance),
4636                mfa: mfa_header.clone(),
4637                ..Default::default()
4638            };
4639            let per_uri = match safe_object_uri(&bucket, &key) {
4640                Ok(u) => u,
4641                Err(_) => {
4642                    errors.push(s3s::dto::Error {
4643                        code: Some("InvalidArgument".to_owned()),
4644                        key: Some(key),
4645                        message: Some("object key is not URI-encodable".to_owned()),
4646                        version_id,
4647                    });
4648                    continue;
4649                }
4650            };
4651            let per_req = S3Request {
4652                input: per_input,
4653                method: http::Method::DELETE,
4654                uri: per_uri,
4655                headers: req.headers.clone(),
4656                extensions: http::Extensions::new(),
4657                credentials: req.credentials.clone(),
4658                region: req.region.clone(),
4659                service: req.service.clone(),
4660                trailing_headers: None,
4661            };
4662            match self.delete_object(per_req).await {
4663                Ok(resp) => {
4664                    let out = resp.output;
4665                    // DeleteObjectOutput doesn't surface a separate
4666                    // `delete_marker_version_id`; the marker's version
4667                    // id is whatever `version_id` carries (when the
4668                    // versioning manager pushed a delete-marker, that
4669                    // field already holds the marker's vid).
4670                    let vid = out.version_id.clone().or(version_id);
4671                    deleted.push(DeletedObject {
4672                        key: Some(key),
4673                        version_id: vid.clone(),
4674                        delete_marker: out.delete_marker,
4675                        delete_marker_version_id: vid,
4676                    });
4677                }
4678                Err(e) => {
4679                    let code_str = e.code().as_str().to_owned();
4680                    let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
4681                    errors.push(s3s::dto::Error {
4682                        code: Some(code_str),
4683                        key: Some(key),
4684                        message: Some(msg),
4685                        version_id,
4686                    });
4687                }
4688            }
4689        }
4690        let output = DeleteObjectsOutput {
4691            deleted: if quiet || deleted.is_empty() {
4692                None
4693            } else {
4694                Some(deleted)
4695            },
4696            errors: if errors.is_empty() {
4697                None
4698            } else {
4699                Some(errors)
4700            },
4701            ..Default::default()
4702        };
4703        Ok(S3Response::new(output))
4704    }
4705    async fn copy_object(
4706        &self,
4707        mut req: S3Request<CopyObjectInput>,
4708    ) -> S3Result<S3Response<CopyObjectOutput>> {
4709        // copy is conceptually "GetObject src + PutObject dst" — enforce both.
4710        let dst_bucket = req.input.bucket.clone();
4711        let dst_key = req.input.key.clone();
4712        // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
4713        self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
4714        self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
4715        if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4716            // v0.8.17 G-2: source `<key>.s4index` would let
4717            // CopyObject expose the raw sidecar (frame layout +
4718            // source ETag) into a writable destination, bypassing
4719            // the F-13 GET reject. Same guard, Read mode (returns
4720            // NoSuchKey to match listing semantics).
4721            self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
4722            self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
4723        }
4724        // S4-aware copy: source object に s4-* metadata がある場合、それを
4725        // destination に確実に preserve する。
4726        //
4727        // - MetadataDirective::COPY (default): backend が source metadata を
4728        //   そのまま copy するので S4 metadata も自動で渡る。介入不要
4729        // - MetadataDirective::REPLACE: 客が指定した metadata で source を
4730        //   上書き → s4-* metadata が消えると destination は decompress 不能に
4731        //   なる (silent corruption)。S4 が source metadata を HEAD で取得し、
4732        //   s4-* fields を input.metadata に強制 merge する
4733        let needs_merge = req
4734            .input
4735            .metadata_directive
4736            .as_ref()
4737            .map(|d| d.as_str() == MetadataDirective::REPLACE)
4738            .unwrap_or(false);
4739        if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4740            // v0.8.16 F-8: strip the client-supplied `s4-*` keys
4741            // *unconditionally* — the v0.8.15 M-2 fix only ran the
4742            // strip inside the `if let Ok(head) = ...` block, so a
4743            // backend HEAD failure (transient 5xx, NoSuchKey on a
4744            // racing delete) left attacker-injected `s4-*` /
4745            // `S4-*` metadata intact on the destination. Now we
4746            // strip first, then re-populate from the source HEAD
4747            // when available — HEAD failure simply means the
4748            // destination loses the codec markers (correct: a
4749            // CopyObject without the source's codec metadata
4750            // produces an unreadable object, but doesn't allow
4751            // injection).
4752            let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4753            dest_meta.retain(|k, _| !k.to_ascii_lowercase().starts_with("s4-"));
4754            let head_input = HeadObjectInput {
4755                bucket: bucket.to_string(),
4756                key: key.to_string(),
4757                ..Default::default()
4758            };
4759            let head_req = S3Request {
4760                input: head_input,
4761                method: req.method.clone(),
4762                uri: req.uri.clone(),
4763                headers: req.headers.clone(),
4764                extensions: http::Extensions::new(),
4765                credentials: req.credentials.clone(),
4766                region: req.region.clone(),
4767                service: req.service.clone(),
4768                trailing_headers: None,
4769            };
4770            if let Ok(head) = self.backend.head_object(head_req).await
4771                && let Some(src_meta) = head.output.metadata.as_ref()
4772            {
4773                let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4774                for key in [
4775                    META_CODEC,
4776                    META_ORIGINAL_SIZE,
4777                    META_COMPRESSED_SIZE,
4778                    META_CRC32C,
4779                    META_MULTIPART,
4780                    META_FRAMED,
4781                ] {
4782                    if let Some(v) = src_meta.get(key) {
4783                        dest_meta.insert(key.to_string(), v.clone());
4784                    }
4785                }
4786                // SSE markers are equally reserved — propagate any
4787                // source flags so a copy of an encrypted object stays
4788                // marked as encrypted at the destination.
4789                for sse_key in [
4790                    "s4-encrypted",
4791                    "s4-sse-type",
4792                    "s4-sse-c-key-md5",
4793                    "s4-sse-kms-key-id",
4794                ] {
4795                    if let Some(v) = src_meta.get(sse_key) {
4796                        dest_meta.insert(sse_key.to_string(), v.clone());
4797                    }
4798                }
4799                debug!(
4800                    src_bucket = %bucket,
4801                    src_key = %key,
4802                    "S4 copy_object: replaced client s4-* metadata with source values across REPLACE directive (v0.8.15 M-2)"
4803                );
4804            }
4805        }
4806        self.backend.copy_object(req).await
4807    }
4808    async fn list_objects(
4809        &self,
4810        req: S3Request<ListObjectsInput>,
4811    ) -> S3Result<S3Response<ListObjectsOutput>> {
4812        self.enforce_rate_limit(&req, &req.input.bucket)?;
4813        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4814        let mut resp = self.backend.list_objects(req).await?;
4815        // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4816        // — v0.5 #34) を顧客から隠す。
4817        if let Some(contents) = resp.output.contents.as_mut() {
4818            contents.retain(|o| {
4819                o.key
4820                    .as_ref()
4821                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4822                    .unwrap_or(true)
4823            });
4824        }
4825        Ok(resp)
4826    }
4827    async fn list_objects_v2(
4828        &self,
4829        req: S3Request<ListObjectsV2Input>,
4830    ) -> S3Result<S3Response<ListObjectsV2Output>> {
4831        self.enforce_rate_limit(&req, &req.input.bucket)?;
4832        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4833        let mut resp = self.backend.list_objects_v2(req).await?;
4834        if let Some(contents) = resp.output.contents.as_mut() {
4835            let before = contents.len();
4836            contents.retain(|o| {
4837                o.key
4838                    .as_ref()
4839                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4840                    .unwrap_or(true)
4841            });
4842            // key_count も補正 (S3 spec compliance)
4843            if let Some(kc) = resp.output.key_count.as_mut() {
4844                *kc -= (before - contents.len()) as i32;
4845            }
4846        }
4847        Ok(resp)
4848    }
4849    /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4850    /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4851    /// attached AND the bucket is in a versioning-aware state, build
4852    /// the `Versions` / `DeleteMarkers` arrays directly from the
4853    /// in-memory chain (paginated + ordered the S3 way: key asc,
4854    /// version newest-first inside each key). Otherwise fall back to
4855    /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4856    async fn list_object_versions(
4857        &self,
4858        req: S3Request<ListObjectVersionsInput>,
4859    ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4860        self.enforce_rate_limit(&req, &req.input.bucket)?;
4861        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4862        // v0.5 #34: VersioningManager-owned path.
4863        if let Some(mgr) = self.versioning.as_ref()
4864            && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4865        {
4866            let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4867            let page = mgr.list_versions(
4868                &req.input.bucket,
4869                req.input.prefix.as_deref(),
4870                req.input.key_marker.as_deref(),
4871                req.input.version_id_marker.as_deref(),
4872                max_keys,
4873            );
4874            let versions: Vec<ObjectVersion> = page
4875                .versions
4876                .into_iter()
4877                .map(|e| ObjectVersion {
4878                    key: Some(e.key),
4879                    version_id: Some(e.version_id),
4880                    is_latest: Some(e.is_latest),
4881                    e_tag: Some(ETag::Strong(e.etag)),
4882                    size: Some(e.size as i64),
4883                    last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4884                    ..Default::default()
4885                })
4886                .collect();
4887            let delete_markers: Vec<DeleteMarkerEntry> = page
4888                .delete_markers
4889                .into_iter()
4890                .map(|e| DeleteMarkerEntry {
4891                    key: Some(e.key),
4892                    version_id: Some(e.version_id),
4893                    is_latest: Some(e.is_latest),
4894                    last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4895                    ..Default::default()
4896                })
4897                .collect();
4898            let output = ListObjectVersionsOutput {
4899                name: Some(req.input.bucket.clone()),
4900                prefix: req.input.prefix.clone(),
4901                key_marker: req.input.key_marker.clone(),
4902                version_id_marker: req.input.version_id_marker.clone(),
4903                max_keys: req.input.max_keys,
4904                versions: if versions.is_empty() {
4905                    None
4906                } else {
4907                    Some(versions)
4908                },
4909                delete_markers: if delete_markers.is_empty() {
4910                    None
4911                } else {
4912                    Some(delete_markers)
4913                },
4914                is_truncated: Some(page.is_truncated),
4915                next_key_marker: page.next_key_marker,
4916                next_version_id_marker: page.next_version_id_marker,
4917                ..Default::default()
4918            };
4919            return Ok(S3Response::new(output));
4920        }
4921        // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4922        let mut resp = self.backend.list_object_versions(req).await?;
4923        if let Some(versions) = resp.output.versions.as_mut() {
4924            versions.retain(|v| {
4925                v.key
4926                    .as_ref()
4927                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4928                    .unwrap_or(true)
4929            });
4930        }
4931        if let Some(markers) = resp.output.delete_markers.as_mut() {
4932            markers.retain(|m| {
4933                m.key
4934                    .as_ref()
4935                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4936                    .unwrap_or(true)
4937            });
4938        }
4939        Ok(resp)
4940    }
4941
4942    async fn create_multipart_upload(
4943        &self,
4944        mut req: S3Request<CreateMultipartUploadInput>,
4945    ) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
4946        // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
4947        // the destination is conceptually about to host a new object,
4948        // matching what `put_object` enforces L2078. Without this, a
4949        // bucket policy denying `s3:PutObject` was bypassable simply
4950        // by switching the client to the multipart wire path.
4951        let mp_bucket = req.input.bucket.clone();
4952        let mp_key = req.input.key.clone();
4953        // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
4954        self.check_not_reserved_key(&mp_key, ReservedKeyMode::Mutating)?;
4955        self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
4956        self.enforce_rate_limit(&req, &mp_bucket)?;
4957        // Multipart object は per-part 圧縮 + frame 形式で書く。GET 時に
4958        // frame parse を起動するため、object metadata に flag を立てる。
4959        // codec は dispatcher の default kind を採用 (per-part 別 codec は Phase 2)。
4960        let codec_kind = self.registry.default_kind();
4961        let meta = req.input.metadata.get_or_insert_with(Default::default);
4962        meta.insert(META_MULTIPART.into(), "true".into());
4963        meta.insert(META_CODEC.into(), codec_kind.as_str().into());
4964        // v0.8 #54 BUG-10 fix: take() the SSE request fields off
4965        // `req.input` so they are NOT forwarded to the backend on
4966        // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
4967        // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
4968        // SSE-KMS with "KMS not configured" when the headers reach it.
4969        // S4 owns the encrypt-then-store contract; we capture the
4970        // recipe in `multipart_state` here and apply it on Complete.
4971        let sse_c_alg = req.input.sse_customer_algorithm.take();
4972        let sse_c_key = req.input.sse_customer_key.take();
4973        let sse_c_md5 = req.input.sse_customer_key_md5.take();
4974        let sse_header = req.input.server_side_encryption.take();
4975        let sse_kms_key = req.input.ssekms_key_id.take();
4976        // Strip the encryption-context too — leaving it would make
4977        // MinIO try to validate it against a non-existent KMS key.
4978        let _ = req.input.ssekms_encryption_context.take();
4979        let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
4980        let kms_key_id = extract_kms_key_id(
4981            &sse_header,
4982            &sse_kms_key,
4983            self.kms_default_key_id.as_deref(),
4984        );
4985        // SSE-C / SSE-KMS exclusivity (mirrors put_object L1870).
4986        if sse_c_material.is_some() && kms_key_id.is_some() {
4987            return Err(S3Error::with_message(
4988                S3ErrorCode::InvalidArgument,
4989                "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
4990            ));
4991        }
4992        let sse_mode = if let Some(ref m) = sse_c_material {
4993            // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
4994            // 32-byte key in `Zeroizing` so abandoned uploads (or
4995            // normal Complete/Abort) wipe the key bytes on drop. The
4996            // `key_md5` is the public fingerprint and stays as a
4997            // bare `[u8; 16]`.
4998            crate::multipart_state::MultipartSseMode::SseC {
4999                key: zeroize::Zeroizing::new(m.key),
5000                key_md5: m.key_md5,
5001            }
5002        } else if let Some(ref kid) = kms_key_id {
5003            // KMS pre-flight: fail at Create rather than at Complete if
5004            // the gateway has no KMS backend wired (mirrors the
5005            // put_object L1879 check).
5006            if self.kms.is_none() {
5007                return Err(S3Error::with_message(
5008                    S3ErrorCode::InvalidRequest,
5009                    "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
5010                ));
5011            }
5012            crate::multipart_state::MultipartSseMode::SseKms {
5013                key_id: kid.clone(),
5014            }
5015        } else if self.sse_keyring.is_some() {
5016            // SSE-S4: server-driven transparent encryption. Activates
5017            // whenever the gateway has a keyring configured AND the
5018            // client didn't pick a different SSE mode.
5019            crate::multipart_state::MultipartSseMode::SseS4
5020        } else {
5021            crate::multipart_state::MultipartSseMode::None
5022        };
5023        // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
5024        // single-PUT path does this on PutObject; the multipart path
5025        // captures it now and commits via TagManager on Complete.
5026        let request_tags: Option<crate::tagging::TagSet> = req
5027            .input
5028            .tagging
5029            .as_deref()
5030            .map(crate::tagging::parse_tagging_header)
5031            .transpose()
5032            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5033        // Strip the `Tagging` field off the input so the backend
5034        // doesn't try to apply it (no-op on MinIO but keeps the wire
5035        // clean).
5036        let _ = req.input.tagging.take();
5037        // Object Lock recipe (BUG-7 — captured here, applied on Complete).
5038        let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
5039            .input
5040            .object_lock_mode
5041            .as_ref()
5042            .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
5043        let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
5044            .input
5045            .object_lock_retain_until_date
5046            .as_ref()
5047            .and_then(timestamp_to_chrono_utc);
5048        let explicit_legal_hold_on: bool = req
5049            .input
5050            .object_lock_legal_hold_status
5051            .as_ref()
5052            .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
5053            .unwrap_or(false);
5054        let bucket = req.input.bucket.clone();
5055        let key = req.input.key.clone();
5056        debug!(
5057            bucket = %bucket,
5058            key = %key,
5059            codec = codec_kind.as_str(),
5060            sse = ?sse_mode,
5061            "S4 create_multipart_upload: marking object for per-part compression"
5062        );
5063        let mut resp = self.backend.create_multipart_upload(req).await?;
5064        // Stash the per-upload context only after the backend handed
5065        // us an upload_id (failed Creates leave nothing in the store).
5066        if let Some(upload_id) = resp.output.upload_id.as_ref() {
5067            self.multipart_state.put(
5068                upload_id,
5069                crate::multipart_state::MultipartUploadContext {
5070                    bucket,
5071                    key,
5072                    sse: sse_mode.clone(),
5073                    tags: request_tags,
5074                    object_lock_mode: explicit_lock_mode,
5075                    object_lock_retain_until: explicit_retain_until,
5076                    object_lock_legal_hold: explicit_legal_hold_on,
5077                },
5078            );
5079        }
5080        // SSE-C / SSE-KMS response echo (mirrors put_object L2036-L2050).
5081        match &sse_mode {
5082            crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
5083                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
5084                resp.output.sse_customer_key_md5 =
5085                    Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
5086            }
5087            crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5088                resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5089                    ServerSideEncryption::AWS_KMS,
5090                ));
5091                resp.output.ssekms_key_id = Some(key_id.clone());
5092            }
5093            _ => {}
5094        }
5095        Ok(resp)
5096    }
5097
5098    async fn upload_part(
5099        &self,
5100        mut req: S3Request<UploadPartInput>,
5101    ) -> S3Result<S3Response<UploadPartOutput>> {
5102        // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
5103        // `put_object` / `create_multipart_upload`. Even though
5104        // Create already passed the gate, a bucket policy that
5105        // *revokes* `s3:PutObject` mid-flight should stop further
5106        // parts (e.g. legal hold drops, retention shortened).
5107        let part_bucket = req.input.bucket.clone();
5108        let part_key = req.input.key.clone();
5109        self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
5110        self.enforce_rate_limit(&req, &part_bucket)?;
5111        // 各 part を圧縮して frame header 付きで forward。GET 時に
5112        // `decompress_multipart` が frame iter で順に解凍する。
5113        // **per-part codec dispatch**: dispatcher が body 先頭 sample から
5114        // codec を選ぶので、parquet 風の mixed-content multipart で part ごとに
5115        // 最適 codec を使える (整数列 part → Bitcomp、text 列 part → zstd 等)。
5116        //
5117        // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
5118        // context captured by `create_multipart_upload` and (a) strip
5119        // any SSE-C request headers off `req.input` so the backend
5120        // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
5121        // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
5122        // observe that an upload context exists for `upload_id`. The
5123        // actual encrypt happens once at `complete_multipart_upload`
5124        // time on the assembled body (the per-part-encrypt approach
5125        // would require a matching multi-segment decrypt path on GET;
5126        // encrypting the whole assembled body keeps the GET path's
5127        // `is_sse_encrypted` branch in get_object L2429 working
5128        // unchanged).
5129        let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
5130        // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
5131        // The AWS S3 spec requires the same SSE-C key headers on
5132        // every UploadPart and rejects mismatches with 400. Prior to
5133        // #62 we silently stripped the headers (BUG-10 fix) without
5134        // validating them, allowing a client to send part 1 under
5135        // key-A and part 2 under key-B; both got stored, then
5136        // re-encrypted with key-A on Complete — the client thinks
5137        // part 2 is under key-B but a GET with key-B would in fact
5138        // hit the part-1 ciphertext that was actually encrypted with
5139        // key-A. That would either decrypt successfully (silent
5140        // corruption: client lost track of which key encrypts what)
5141        // or fail in a confusing way. Validate the per-part headers
5142        // now and reject with 400 InvalidArgument on mismatch /
5143        // omission / partial supply, matching real-S3 behaviour.
5144        if let Some(ref ctx) = sse_ctx {
5145            if let crate::multipart_state::MultipartSseMode::SseC {
5146                key_md5: ctx_md5, ..
5147            } = &ctx.sse
5148            {
5149                let alg = req.input.sse_customer_algorithm.take();
5150                let key_b64 = req.input.sse_customer_key.take();
5151                let md5_b64 = req.input.sse_customer_key_md5.take();
5152                match (alg, key_b64, md5_b64) {
5153                    (Some(a), Some(k), Some(m)) => {
5154                        // Parse + validate; if the per-part headers
5155                        // are themselves malformed (algorithm not
5156                        // AES256, MD5 mismatch, key not 32 bytes)
5157                        // surface the same 400 the single-PUT path
5158                        // would. Then compare the parsed MD5 to the
5159                        // upload-context's MD5; mismatch is a
5160                        // different-key UploadPart and must reject.
5161                        let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
5162                            .map_err(sse_c_error_to_s3)?;
5163                        if part_material.key_md5 != *ctx_md5 {
5164                            return Err(S3Error::with_message(
5165                                S3ErrorCode::InvalidArgument,
5166                                "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
5167                            ));
5168                        }
5169                        // OK — same key as Create. Headers are
5170                        // already taken off `req.input` so the
5171                        // backend never sees them.
5172                    }
5173                    (None, None, None) => {
5174                        // AWS S3 spec: SSE-C headers MUST be replayed
5175                        // on every UploadPart of an SSE-C multipart.
5176                        // Real-S3 returns 400 InvalidRequest in this
5177                        // case; mirror that.
5178                        return Err(S3Error::with_message(
5179                            S3ErrorCode::InvalidRequest,
5180                            "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
5181                        ));
5182                    }
5183                    _ => {
5184                        // Partial header set (e.g. algorithm + key
5185                        // but no MD5) — same handling as the
5186                        // single-PUT `extract_sse_c_material` helper.
5187                        return Err(S3Error::with_message(
5188                            S3ErrorCode::InvalidRequest,
5189                            "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
5190                        ));
5191                    }
5192                }
5193            } else {
5194                // CreateMultipartUpload was non-SSE-C (None / SseS4 /
5195                // SseKms). A part that arrives carrying SSE-C headers
5196                // is either a confused client or an attempt to
5197                // smuggle SSE-C around the gateway-internal SSE
5198                // recipe. Reject with 400 InvalidRequest rather than
5199                // silently strip — the strip would let the client
5200                // believe the part was encrypted under their key
5201                // when in fact the upload's encryption recipe is
5202                // whatever the Create captured.
5203                if req.input.sse_customer_algorithm.is_some()
5204                    || req.input.sse_customer_key.is_some()
5205                    || req.input.sse_customer_key_md5.is_some()
5206                {
5207                    return Err(S3Error::with_message(
5208                        S3ErrorCode::InvalidRequest,
5209                        "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
5210                    ));
5211                }
5212            }
5213        } else {
5214            // No upload context registered (gateway crashed between
5215            // Create and Part, or pre-#62 abandoned-upload restore).
5216            // We can't check key consistency in this case — strip
5217            // the headers and let the request through unchanged so
5218            // the backend's `NoSuchUpload` reply (or whatever it
5219            // chooses to do) flows back to the client.
5220            let _ = req.input.sse_customer_algorithm.take();
5221            let _ = req.input.sse_customer_key.take();
5222            let _ = req.input.sse_customer_key_md5.take();
5223        }
5224        let _sse_ctx = sse_ctx;
5225        if let Some(blob) = req.input.body.take() {
5226            let bytes = collect_blob(blob, self.max_body_bytes)
5227                .await
5228                .map_err(internal("collect upload_part body"))?;
5229            // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
5230            // checksum algorithms against the received part body.
5231            verify_client_body_checksums(
5232                &bytes,
5233                req.input.content_md5.as_deref(),
5234                req.input.checksum_crc32.as_deref(),
5235                req.input.checksum_crc32c.as_deref(),
5236                req.input.checksum_sha1.as_deref(),
5237                req.input.checksum_sha256.as_deref(),
5238                req.input.checksum_crc64nvme.as_deref(),
5239            )?;
5240            let sample_len = bytes.len().min(SAMPLE_BYTES);
5241            // v0.8 #56: full part body is already in memory here; use its
5242            // length as the size hint so the dispatcher can promote to GPU
5243            // if it's big enough.
5244            let codec_kind = self
5245                .dispatcher
5246                .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
5247                .await;
5248            let original_size = bytes.len() as u64;
5249            // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
5250            let (compress_res, tel) = self
5251                .registry
5252                .compress_with_telemetry(bytes, codec_kind)
5253                .await;
5254            stamp_gpu_compress_telemetry(&tel);
5255            let (compressed, manifest) =
5256                compress_res.map_err(internal("registry compress part"))?;
5257            let header = FrameHeader {
5258                codec: codec_kind,
5259                original_size,
5260                compressed_size: compressed.len() as u64,
5261                crc32c: manifest.crc32c,
5262            };
5263            let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
5264            write_frame(&mut framed, header, &compressed);
5265            // v0.2 #5: heuristic-based padding skip for likely-final parts.
5266            //
5267            // AWS SDK / aws-cli / boto3 always send the final (and only the
5268            // final) part below the configured part_size. So if the raw user
5269            // part is already smaller than S3's 5 MiB multipart minimum, this
5270            // is overwhelmingly likely to be the final part — and the final
5271            // part is exempt from S3's size constraint. Skipping padding here
5272            // saves up to ~5 MiB per object on highly compressible workloads.
5273            //
5274            // If a misbehaving client sends a tiny **non-final** part, S3
5275            // itself rejects with EntityTooSmall at CompleteMultipartUpload —
5276            // identical outcome to a vanilla S3 PUT, just earlier than
5277            // padding-then-complete would catch it.
5278            let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
5279            if !likely_final {
5280                pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
5281            }
5282            let framed_bytes = framed.freeze();
5283            let new_len = framed_bytes.len() as i64;
5284            // 同じ wire 互換問題が multipart にもある (content-length / checksum)
5285            req.input.content_length = Some(new_len);
5286            req.input.checksum_algorithm = None;
5287            req.input.checksum_crc32 = None;
5288            req.input.checksum_crc32c = None;
5289            req.input.checksum_crc64nvme = None;
5290            req.input.checksum_sha1 = None;
5291            req.input.checksum_sha256 = None;
5292            req.input.content_md5 = None;
5293            req.input.body = Some(bytes_to_blob(framed_bytes));
5294            debug!(
5295                part_number = ?req.input.part_number,
5296                upload_id = ?req.input.upload_id,
5297                original_size,
5298                framed_size = new_len,
5299                "S4 upload_part: framed compressed payload"
5300            );
5301        }
5302        self.backend.upload_part(req).await
5303    }
5304    async fn complete_multipart_upload(
5305        &self,
5306        mut req: S3Request<CompleteMultipartUploadInput>,
5307    ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
5308        let bucket = req.input.bucket.clone();
5309        let key = req.input.key.clone();
5310        let upload_id = req.input.upload_id.clone();
5311        // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
5312        // commit point for the multipart-assembled object).
5313        self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
5314        self.enforce_rate_limit(&req, &bucket)?;
5315        // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
5316        // at Complete time. Without this an attacker with PutObject
5317        // permission could `CreateMultipartUpload` against a key
5318        // that's currently under retention / legal hold and silently
5319        // overwrite it on Complete (the single-PUT path runs the
5320        // same check at L2007). Compliance retention is never
5321        // bypassable; Governance only with explicit IAM permission
5322        // (HIGH-7 gate below).
5323        if let Some(mgr) = self.object_lock.as_ref()
5324            && let Some(state) = mgr.get(&bucket, &key)
5325        {
5326            // CompleteMultipartUpload doesn't carry the bypass header
5327            // (the s3s DTO matches AWS' wire schema). A locked key
5328            // therefore cannot be overwritten by Complete regardless
5329            // of caller permission — operators who need to break a
5330            // Governance lock do it via PutObjectRetention before
5331            // calling Complete.
5332            let now = chrono::Utc::now();
5333            if !state.can_delete(now, false) {
5334                crate::metrics::record_policy_denial("s3:PutObject", &bucket);
5335                return Err(S3Error::with_message(
5336                    S3ErrorCode::AccessDenied,
5337                    "Access Denied because target key is protected by object lock",
5338                ));
5339            }
5340        }
5341        // v0.8.1 #59: serialise concurrent Complete invocations on the
5342        // same `(bucket, key)`. The race window the lock closes is the
5343        // GET-assembled-body → encrypt → PUT-encrypted-body triple
5344        // below (BUG-5 fix); without serialisation, two Completes for
5345        // different `upload_id` but the same logical key could each
5346        // read the other's plaintext assembled body and overwrite the
5347        // peer's encrypted result. The guard is held to function exit
5348        // (drop on `Ok` / `Err`), covering version-id mint, object-
5349        // lock apply, tagging persist, and replication enqueue too.
5350        let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
5351        let _completion_guard = completion_lock.lock().await;
5352        // v0.8 #54 — fetch the per-upload context captured on Create.
5353        // `None` means an abandoned / unknown upload_id (gateway
5354        // crashed between Create and Complete, or pre-v0.8 state
5355        // restore); we still let the backend do its thing for
5356        // transparency, but we can't apply any SSE / version / lock /
5357        // tag / replication post-processing because we never captured
5358        // the recipe.
5359        let ctx = self.multipart_state.get(upload_id.as_str());
5360        // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
5361        // — some clients (boto3 / aws-sdk-cpp older versions) replay
5362        // the SSE-C triple on Complete too, and MinIO will choke if
5363        // they reach the backend.
5364        let _ = req.input.sse_customer_algorithm.take();
5365        let _ = req.input.sse_customer_key.take();
5366        let _ = req.input.sse_customer_key_md5.take();
5367        let mut resp = self.backend.complete_multipart_upload(req).await?;
5368        // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
5369        // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
5370        // partial fetch path が利用可能になる (Range request の帯域節約)。
5371        // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
5372        // できれば爆速になるので 1 回の cost は payback される
5373        //
5374        // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
5375        // the SSE encrypt re-PUT + versioning shadow-key rewrite +
5376        // replication source-bytes capture, so we GET once and reuse
5377        // the bytes for every post-processing step.
5378        let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
5379            let get_input = GetObjectInput {
5380                bucket: bucket.clone(),
5381                key: key.clone(),
5382                ..Default::default()
5383            };
5384            let get_req = S3Request {
5385                input: get_input,
5386                method: http::Method::GET,
5387                uri,
5388                headers: http::HeaderMap::new(),
5389                extensions: http::Extensions::new(),
5390                credentials: None,
5391                region: None,
5392                service: None,
5393                trailing_headers: None,
5394            };
5395            match self.backend.get_object(get_req).await {
5396                Ok(get_resp) => match get_resp.output.body {
5397                    Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
5398                    None => None,
5399                },
5400                Err(e) => {
5401                    // v0.8.4 #71 (C-1 audit fix): a silent
5402                    // `Err(_) => None` here is a SSE plaintext
5403                    // leak. The post-processing block below only
5404                    // runs the SSE re-encrypt branch when
5405                    // `assembled_body.is_some()`, so swallowing a
5406                    // backend error skipped the encrypt step and
5407                    // left the multipart object on disk as
5408                    // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
5409                    // configured buckets. Same root-cause family
5410                    // as v0.8 BUG-5; this branch closes the
5411                    // remaining read-side window.
5412                    //
5413                    // We distinguish two cases:
5414                    //  - `NoSuchKey`: the object is genuinely
5415                    //    missing post-Complete. This is rare and
5416                    //    typically races with a concurrent
5417                    //    DeleteObject; there is nothing to re-
5418                    //    encrypt and no SSE markers to honour, so
5419                    //    falling through to the legacy
5420                    //    `assembled_body = None` path is safe.
5421                    //  - everything else (5xx, network, auth,
5422                    //    etc.): we must FAIL the Complete so the
5423                    //    client can retry. Returning Ok with
5424                    //    `assembled_body = None` would silently
5425                    //    skip the SSE re-encrypt and leave the
5426                    //    backend bytes plaintext.
5427                    if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
5428                        tracing::warn!(
5429                            bucket = %bucket,
5430                            key = %key,
5431                            "multipart Complete: backend GET returned NoSuchKey; \
5432                             skipping post-processing (object likely raced with DeleteObject)"
5433                        );
5434                        None
5435                    } else {
5436                        tracing::error!(
5437                            bucket = %bucket,
5438                            key = %key,
5439                            error = %e,
5440                            "multipart Complete: backend GET failed; failing the Complete \
5441                             so the client retries (silent fall-through would skip SSE \
5442                             re-encrypt and store plaintext)"
5443                        );
5444                        return Err(internal("multipart Complete: backend body fetch failed")(e));
5445                    }
5446                }
5447            }
5448        } else {
5449            None
5450        };
5451        // Sidecar build (existing behaviour, gated on assembled body).
5452        //
5453        // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
5454        // going to SSE-encrypt the assembled body before re-PUT (the
5455        // single-PUT path applies the same suppression at L2271).
5456        // Stale offsets into the pre-encrypt body would break Range
5457        // GET on the encrypted on-disk bytes. `ctx.sse != None`
5458        // covers all three SSE modes captured at Create time.
5459        let mp_will_encrypt = ctx
5460            .as_ref()
5461            .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
5462            .unwrap_or(false);
5463        // v0.8.16 F-7: versioned multipart writes the assembled body
5464        // under `versioned_shadow_key(&key, vid)` *after* this
5465        // sidecar block, then deletes the original `<key>`. Stamping
5466        // the sidecar against the to-be-deleted `<key>` (which is
5467        // what H-g did) leaves an orphan `<key>.s4index` whose
5468        // source-ETag binding can never match the live shadow body
5469        // — the Range GET fast-path's stale-sidecar check then
5470        // falls through to a full read on every request, silently
5471        // disabling partial fetch. Skip the sidecar build entirely
5472        // for versioned buckets; a follow-up issue tracks writing
5473        // the sidecar under the shadow key with the shadow's ETag.
5474        let mp_skip_sidecar_for_versioning = self
5475            .versioning
5476            .as_ref()
5477            .map(|mgr| mgr.state(&bucket))
5478            .map(|state| state == crate::versioning::VersioningState::Enabled)
5479            .unwrap_or(false);
5480        if let Some(ref body) = assembled_body
5481            && !mp_will_encrypt
5482            && !mp_skip_sidecar_for_versioning
5483            && let Ok(mut index) = build_index_from_body(body)
5484        {
5485            // v0.8.15 H-g: stamp the source-ETag / source-compressed-size
5486            // binding on the multipart sidecar. The single-PUT path
5487            // does this at L2519-L2521 via the backend's PUT response,
5488            // but Complete returns its own ETag (an opaque manifest
5489            // hash) so we have to HEAD the freshly-completed object
5490            // to pick up what backend actually wrote, then bind the
5491            // sidecar to those values. Without the binding, a
5492            // subsequent backend-side mutation (lifecycle rewrite,
5493            // out-of-band CopyObject) wouldn't trip the staleness
5494            // check on the next Range GET — the GET would happily
5495            // slice the new bytes at the old sidecar offsets, with
5496            // silent data corruption.
5497            if let Ok(uri) = safe_object_uri(&bucket, &key) {
5498                let head_req = S3Request {
5499                    input: HeadObjectInput {
5500                        bucket: bucket.clone(),
5501                        key: key.clone(),
5502                        ..Default::default()
5503                    },
5504                    method: http::Method::HEAD,
5505                    uri,
5506                    headers: http::HeaderMap::new(),
5507                    extensions: http::Extensions::new(),
5508                    credentials: None,
5509                    region: None,
5510                    service: None,
5511                    trailing_headers: None,
5512                };
5513                if let Ok(head) = self.backend.head_object(head_req).await {
5514                    index.source_etag = head.output.e_tag.as_ref().map(|t| t.value().to_string());
5515                    index.source_compressed_size = head
5516                        .output
5517                        .content_length
5518                        .and_then(|n| u64::try_from(n).ok());
5519                }
5520                // HEAD failure is non-fatal — the sidecar still works
5521                // as a v1-style best-effort fast path; the Range GET
5522                // simply falls back to a full read on any consistency
5523                // signal.
5524            }
5525            self.write_sidecar(&bucket, &key, &index).await;
5526        }
5527        // From here on, post-processing depends on the context —
5528        // short-circuit when the upload had no captured recipe
5529        // (legacy / crashed-Create / pre-v0.8 state restore).
5530        if let Some(ctx) = ctx {
5531            // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
5532            // is versioning-Enabled. The single-PUT path does this in
5533            // `put_object` ~L1968; multipart was the missing branch.
5534            // We mint here (post-Complete, before any re-PUT) so the
5535            // same vid threads into both the shadow-key rewrite and
5536            // the VersionEntry the manager records.
5537            let pending_version: Option<crate::versioning::PutOutcome> = self
5538                .versioning
5539                .as_ref()
5540                .map(|mgr| mgr.state(&bucket))
5541                .map(|state| match state {
5542                    crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
5543                        version_id: crate::versioning::VersioningManager::new_version_id(),
5544                        versioned_response: true,
5545                    },
5546                    crate::versioning::VersioningState::Suspended
5547                    | crate::versioning::VersioningState::Unversioned => {
5548                        crate::versioning::PutOutcome {
5549                            version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
5550                            versioned_response: false,
5551                        }
5552                    }
5553                });
5554            // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
5555            // and re-PUT it to the backend so the on-disk bytes are
5556            // SSE-encrypted. The single-PUT path does this body-by-
5557            // body inside `put_object` (L1907-L1942); for multipart,
5558            // encrypt-per-part would require a multi-segment decrypt
5559            // path on GET — we instead do a single encrypt over the
5560            // assembled framed body so the existing GET decrypt
5561            // branch (`is_sse_encrypted` → `decrypt(body, source)` →
5562            // FrameIter) handles it unchanged.
5563            //
5564            // The cost is one extra round-trip per Complete for SSE-
5565            // enabled multipart (already-paid for the sidecar build).
5566            // For single-instance gateways pointing at a co-located
5567            // backend this is negligible; cross-region operators
5568            // would benefit from per-part encrypt + multi-segment
5569            // decrypt as a follow-up.
5570            let needs_re_put = matches!(
5571                ctx.sse,
5572                crate::multipart_state::MultipartSseMode::SseS4
5573                    | crate::multipart_state::MultipartSseMode::SseC { .. }
5574                    | crate::multipart_state::MultipartSseMode::SseKms { .. }
5575            ) || pending_version
5576                .as_ref()
5577                .map(|pv| pv.versioned_response)
5578                .unwrap_or(false);
5579            // v0.8.11 CRIT-2 fix: seed the replication body with the
5580            // pre-encrypt assembled bytes, but overwrite it with the
5581            // post-encrypt `new_body` once the re-PUT branch lands.
5582            // The previous "snapshot in advance" pattern shipped the
5583            // *plaintext* framed body to the destination bucket even
5584            // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
5585            // the destination would then fail to decrypt (or, worse,
5586            // succeed in handing out plaintext that the source had
5587            // promised was encrypted at rest). When `needs_re_put`
5588            // is false (no SSE, no versioning), the backend still
5589            // holds the original plaintext-framed bytes, and the
5590            // seed value is what the destination should receive.
5591            let mut replication_body = assembled_body.clone();
5592            let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
5593            if needs_re_put && let Some(body) = assembled_body {
5594                // v0.8.1 #58: same Zeroizing pattern as put_object's
5595                // single-PUT KMS branch — DEK plaintext lives in
5596                // `Zeroizing<[u8; 32]>` for the lifetime of this
5597                // Complete handler, then is wiped on drop.
5598                let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
5599                    if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
5600                    {
5601                        let kms = self.kms.as_ref().ok_or_else(|| {
5602                        S3Error::with_message(
5603                            S3ErrorCode::InvalidRequest,
5604                            "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
5605                        )
5606                    })?;
5607                        let (dek, wrapped) =
5608                            kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
5609                        if dek.len() != 32 {
5610                            return Err(S3Error::with_message(
5611                                S3ErrorCode::InternalError,
5612                                format!(
5613                                    "KMS backend returned a DEK of {} bytes (expected 32)",
5614                                    dek.len()
5615                                ),
5616                            ));
5617                        }
5618                        let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
5619                            zeroize::Zeroizing::new([0u8; 32]);
5620                        dek_arr.copy_from_slice(&dek);
5621                        // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
5622                        Some((dek_arr, wrapped))
5623                    } else {
5624                        None
5625                    };
5626                // Build the new metadata map: re-fetch via HEAD so
5627                // the multipart / codec markers the backend stamped
5628                // on Create flow through unchanged, then layer the
5629                // SSE markers on top.
5630                let head_req = S3Request {
5631                    input: HeadObjectInput {
5632                        bucket: bucket.clone(),
5633                        key: key.clone(),
5634                        ..Default::default()
5635                    },
5636                    method: http::Method::HEAD,
5637                    uri: safe_object_uri(&bucket, &key)?,
5638                    headers: http::HeaderMap::new(),
5639                    extensions: http::Extensions::new(),
5640                    credentials: None,
5641                    region: None,
5642                    service: None,
5643                    trailing_headers: None,
5644                };
5645                let mut new_metadata: std::collections::HashMap<String, String> =
5646                    match self.backend.head_object(head_req).await {
5647                        Ok(h) => h.output.metadata.unwrap_or_default(),
5648                        Err(_) => std::collections::HashMap::new(),
5649                    };
5650                let new_body = match &ctx.sse {
5651                    crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
5652                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5653                        new_metadata.insert("s4-sse-type".into(), "AES256".into());
5654                        new_metadata.insert(
5655                            "s4-sse-c-key-md5".into(),
5656                            base64::engine::general_purpose::STANDARD.encode(key_md5),
5657                        );
5658                        // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
5659                        // auto-deref through one explicit binding so
5660                        // `SseSource::CustomerKey` gets the `&[u8; 32]`
5661                        // it expects (mirrors the SSE-KMS DEK shape
5662                        // a few lines down).
5663                        let key_ref: &[u8; 32] = key;
5664                        crate::sse::encrypt_with_source(
5665                            &body,
5666                            crate::sse::SseSource::CustomerKey {
5667                                key: key_ref,
5668                                key_md5,
5669                            },
5670                        )
5671                    }
5672                    crate::multipart_state::MultipartSseMode::SseKms { .. } => {
5673                        let (dek, wrapped) = kms_wrap
5674                            .as_ref()
5675                            .expect("SseKms branch implies kms_wrap is Some");
5676                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5677                        new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
5678                        new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
5679                        // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
5680                        // to `&[u8; 32]` (same shape as the put_object
5681                        // single-PUT branch).
5682                        let dek_ref: &[u8; 32] = dek;
5683                        crate::sse::encrypt_with_source(
5684                            &body,
5685                            crate::sse::SseSource::Kms {
5686                                dek: dek_ref,
5687                                wrapped,
5688                            },
5689                        )
5690                    }
5691                    crate::multipart_state::MultipartSseMode::SseS4 => {
5692                        let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
5693                            S3Error::with_message(
5694                                S3ErrorCode::InternalError,
5695                                "SSE-S4 captured at Create but keyring missing at Complete",
5696                            )
5697                        })?;
5698                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5699                        // SSE-S4 deliberately omits `s4-sse-type` so
5700                        // HEAD doesn't falsely advertise AWS-style
5701                        // SSE-S3 (matches the put_object L1929-L1939
5702                        // comment).
5703                        // v0.8 #52: same chunk_size dispatch as the
5704                        // single-PUT branch — multipart Complete
5705                        // re-encrypts the assembled body, so honoring
5706                        // the chunked path here is required to keep
5707                        // GET streaming on multipart-uploaded objects.
5708                        if self.sse_chunk_size > 0 {
5709                            crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
5710                                .map_err(|e| {
5711                                    S3Error::with_message(
5712                                        S3ErrorCode::InternalError,
5713                                        format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
5714                                    )
5715                                })?
5716                        } else {
5717                            crate::sse::encrypt_v2(&body, keyring)
5718                        }
5719                    }
5720                    crate::multipart_state::MultipartSseMode::None => body.clone(),
5721                };
5722                // v0.8 #54 BUG-6 fix: write the re-PUT under the
5723                // shadow key so the version chain doesn't overwrite
5724                // the previous version on a versioned bucket. The
5725                // original (unshadowed) key was assembled by the
5726                // backend on Complete; we delete it after the shadow
5727                // PUT lands.
5728                let put_target_key = if let Some(pv) = pending_version.as_ref() {
5729                    if pv.versioned_response {
5730                        versioned_shadow_key(&key, &pv.version_id)
5731                    } else {
5732                        key.clone()
5733                    }
5734                } else {
5735                    key.clone()
5736                };
5737                let new_body_len = new_body.len() as i64;
5738                let put_req = S3Request {
5739                    input: PutObjectInput {
5740                        bucket: bucket.clone(),
5741                        key: put_target_key.clone(),
5742                        body: Some(bytes_to_blob(new_body.clone())),
5743                        metadata: Some(new_metadata.clone()),
5744                        content_length: Some(new_body_len),
5745                        ..Default::default()
5746                    },
5747                    method: http::Method::PUT,
5748                    uri: safe_object_uri(&bucket, &put_target_key)?,
5749                    headers: http::HeaderMap::new(),
5750                    extensions: http::Extensions::new(),
5751                    credentials: None,
5752                    region: None,
5753                    service: None,
5754                    trailing_headers: None,
5755                };
5756                self.backend.put_object(put_req).await?;
5757                // v0.8.11 CRIT-2 fix: refresh the replication snapshot
5758                // with the bytes that were actually persisted to the
5759                // backend (post-SSE-encrypt for SSE modes; identical to
5760                // `body` for `MultipartSseMode::None` + versioning-only
5761                // re-PUT). The destination then sees the same on-disk
5762                // shape the source does, and a destination GET decrypts
5763                // correctly when SSE is on.
5764                replication_body = Some(new_body.clone());
5765                // If we rewrote the storage key (versioning shadow),
5766                // we must drop the original (unshadowed) Complete-
5767                // assembled bytes so subsequent listings don't see a
5768                // duplicate.
5769                if put_target_key != key {
5770                    let del_req = S3Request {
5771                        input: DeleteObjectInput {
5772                            bucket: bucket.clone(),
5773                            key: key.clone(),
5774                            ..Default::default()
5775                        },
5776                        method: http::Method::DELETE,
5777                        uri: safe_object_uri(&bucket, &key)?,
5778                        headers: http::HeaderMap::new(),
5779                        extensions: http::Extensions::new(),
5780                        credentials: None,
5781                        region: None,
5782                        service: None,
5783                        trailing_headers: None,
5784                    };
5785                    let _ = self.backend.delete_object(del_req).await;
5786                }
5787                applied_metadata = Some(new_metadata);
5788            }
5789            // v0.8 #54 BUG-6 commit: register the new version with
5790            // the VersioningManager so list_object_versions /
5791            // GET ?versionId= see it.
5792            if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
5793                let etag = resp
5794                    .output
5795                    .e_tag
5796                    .clone()
5797                    .map(ETag::into_value)
5798                    .unwrap_or_default();
5799                let now = chrono::Utc::now();
5800                mgr.commit_put_with_version(
5801                    &bucket,
5802                    &key,
5803                    crate::versioning::VersionEntry {
5804                        version_id: pv.version_id.clone(),
5805                        etag,
5806                        size: replication_body
5807                            .as_ref()
5808                            .map(|b| b.len() as u64)
5809                            .unwrap_or(0),
5810                        is_delete_marker: false,
5811                        created_at: now,
5812                    },
5813                );
5814                if pv.versioned_response {
5815                    resp.output.version_id = Some(pv.version_id.clone());
5816                }
5817            }
5818            // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
5819            // recipe + auto-apply the bucket default. Mirrors the
5820            // put_object L2057-L2074 block.
5821            if let Some(mgr) = self.object_lock.as_ref() {
5822                if ctx.object_lock_mode.is_some()
5823                    || ctx.object_lock_retain_until.is_some()
5824                    || ctx.object_lock_legal_hold
5825                {
5826                    let mut state = mgr.get(&bucket, &key).unwrap_or_default();
5827                    if let Some(m) = ctx.object_lock_mode {
5828                        state.mode = Some(m);
5829                    }
5830                    if let Some(u) = ctx.object_lock_retain_until {
5831                        state.retain_until = Some(u);
5832                    }
5833                    if ctx.object_lock_legal_hold {
5834                        state.legal_hold_on = true;
5835                    }
5836                    mgr.set(&bucket, &key, state);
5837                }
5838                mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5839            }
5840            // v0.8 #54 BUG-9 fix: persist the captured tags via the
5841            // TagManager so GetObjectTagging returns them.
5842            if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5843                mgr.put_object_tags(&bucket, &key, tags.clone());
5844            }
5845            // SSE-C / SSE-KMS response echo. The
5846            // CompleteMultipartUploadOutput only exposes
5847            // `server_side_encryption` + `ssekms_key_id` (no
5848            // sse_customer_* — those round-tripped on Create / parts).
5849            match &ctx.sse {
5850                crate::multipart_state::MultipartSseMode::SseC { .. } => {
5851                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5852                        ServerSideEncryption::AES256,
5853                    ));
5854                }
5855                crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5856                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5857                        ServerSideEncryption::AWS_KMS,
5858                    ));
5859                    resp.output.ssekms_key_id = Some(key_id.clone());
5860                }
5861                _ => {}
5862            }
5863            // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5864            // like put_object L2165 does. We hand the dispatcher the
5865            // assembled body bytes (post-encrypt where applicable, so
5866            // the destination ends up byte-identical to the source's
5867            // on-disk shape) plus the metadata that was actually
5868            // committed.
5869            let replication_body_bytes = replication_body.unwrap_or_default();
5870            // v0.8.2 #61: thread the multipart-Complete `pending_version`
5871            // through so a versioning-Enabled source's destination
5872            // receives the same shadow-key path (mirror of the
5873            // single-PUT branch above).
5874            self.spawn_replication_if_matched(
5875                &bucket,
5876                &key,
5877                &ctx.tags,
5878                &replication_body_bytes,
5879                &applied_metadata,
5880                true,
5881                pending_version.as_ref(),
5882            );
5883            self.multipart_state.remove(upload_id.as_str());
5884        }
5885        // v0.8.1 #59 janitor: best-effort sweep of stale completion
5886        // locks while we are still on the critical path of a single
5887        // Complete (so steady-state workloads of unique keys don't
5888        // accumulate `DashMap` entries). The sweep only retires
5889        // entries whose `Arc::strong_count == 1`, so any other in-
5890        // flight Complete on a different key keeps its lock alive.
5891        // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5892        // alive across this call; it's reaped on the next Complete or
5893        // the next caller-driven prune.
5894        self.multipart_state.prune_completion_locks();
5895        Ok(resp)
5896    }
5897    async fn abort_multipart_upload(
5898        &self,
5899        req: S3Request<AbortMultipartUploadInput>,
5900    ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5901        // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5902        // — the AWS-spec action verb for this operation. Without the
5903        // gate, anyone who could guess an upload_id could throw away
5904        // someone else's in-flight multipart upload.
5905        let abort_bucket = req.input.bucket.clone();
5906        let abort_key = req.input.key.clone();
5907        self.enforce_policy(
5908            &req,
5909            "s3:AbortMultipartUpload",
5910            &abort_bucket,
5911            Some(&abort_key),
5912        )?;
5913        // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5914        // set) promptly so an aborted upload doesn't leak the
5915        // customer's key into a long-running gateway's RSS.
5916        //
5917        // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5918        // FIRST, then drop in-process state ONLY on success. The
5919        // previous order ("remove → call backend") meant a transient
5920        // backend abort failure (5xx, network) wiped the SSE-C key
5921        // bytes locally while leaving the parts on the backend, so a
5922        // client retry would have to re-validate the SSE-C key against
5923        // a context the gateway no longer has — and the retried abort
5924        // would still hit the unaborted backend parts. Calling the
5925        // backend first lets the failure propagate to the client with
5926        // state intact for a clean retry; only on success do we wipe
5927        // the local state.
5928        let upload_id = req.input.upload_id.as_str().to_owned();
5929        let resp = self.backend.abort_multipart_upload(req).await?;
5930        self.multipart_state.remove(&upload_id);
5931        Ok(resp)
5932    }
5933    async fn list_multipart_uploads(
5934        &self,
5935        req: S3Request<ListMultipartUploadsInput>,
5936    ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5937        self.backend.list_multipart_uploads(req).await
5938    }
5939    async fn list_parts(
5940        &self,
5941        req: S3Request<ListPartsInput>,
5942    ) -> S3Result<S3Response<ListPartsOutput>> {
5943        self.backend.list_parts(req).await
5944    }
5945
5946    // =========================================================================
5947    // Phase 2 — pure passthrough delegations。S4 はこれらに対して圧縮 hook を
5948    // 持たないので、backend (= AWS S3) の動作と完全に同一。
5949    //
5950    // 既知の制限事項:
5951    // - copy_object / upload_part_copy: source object が S4-compressed の場合、
5952    //   backend が bytes を copy するだけなので metadata (s4-codec etc) も一緒に
    //   copied される (AWS S3 default = MetadataDirective COPY)。GET は manifest
5954    //   経由で正しく decompress できる。MetadataDirective REPLACE で上書き
5955    //   されると圧縮 metadata が消えて壊れる — 顧客側の運用で注意
5956    // - list_object_versions: versioning enabled bucket では各 version も S4
5957    //   metadata を維持する。古い version も S4 経由で正しく GET できる。
5958    // =========================================================================
5959
5960    // ---- Object ACL / tagging / attributes ----
5961    async fn get_object_acl(
5962        &self,
5963        req: S3Request<GetObjectAclInput>,
5964    ) -> S3Result<S3Response<GetObjectAclOutput>> {
5965        // v0.8.17 G-2: reserved-name guard. Without it a hostile
5966        // client can `GetObjectAcl(<key>.s4index)` to confirm the
5967        // sidecar exists, an information leak the F-13 GET reject
5968        // closed for the same object.
5969        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5970        self.backend.get_object_acl(req).await
5971    }
5972    async fn put_object_acl(
5973        &self,
5974        req: S3Request<PutObjectAclInput>,
5975    ) -> S3Result<S3Response<PutObjectAclOutput>> {
5976        // v0.8.17 G-2: reserved-name guard. `put-object-acl
5977        // --acl public-read` against `<key>.s4index` would grant
5978        // external read access to the internal sidecar, bypassing
5979        // the F-13 GET reject via the backend's public-URL path.
5980        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5981        self.backend.put_object_acl(req).await
5982    }
5983    // v0.6 #39: object tagging — when a `TagManager` is attached the
5984    // configuration / per-(bucket, key) state lives in the manager and
5985    // these handlers serve directly from it; when no manager is
5986    // attached they fall back to the backend (legacy passthrough so
5987    // v0.5 deployments are unaffected).
5988    async fn get_object_tagging(
5989        &self,
5990        req: S3Request<GetObjectTaggingInput>,
5991    ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5992        // v0.8.17 G-2: reserved-name guard.
5993        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5994        let Some(mgr) = self.tagging.as_ref() else {
5995            return self.backend.get_object_tagging(req).await;
5996        };
5997        let tags = mgr
5998            .get_object_tags(&req.input.bucket, &req.input.key)
5999            .unwrap_or_default();
6000        Ok(S3Response::new(GetObjectTaggingOutput {
6001            tag_set: tagset_to_aws(&tags),
6002            ..Default::default()
6003        }))
6004    }
6005    async fn put_object_tagging(
6006        &self,
6007        req: S3Request<PutObjectTaggingInput>,
6008    ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
6009        // v0.8.17 G-2: reserved-name guard.
6010        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
6011        let Some(mgr) = self.tagging.as_ref() else {
6012            return self.backend.put_object_tagging(req).await;
6013        };
6014        let bucket = req.input.bucket.clone();
6015        let key = req.input.key.clone();
6016        let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6017            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6018        // v0.6 #39: gate via IAM policy with both the request tags
6019        // (`s3:RequestObjectTag/<key>`) and any existing tags on the
6020        // target object (`s3:ExistingObjectTag/<key>`).
6021        let existing = mgr.get_object_tags(&bucket, &key);
6022        self.enforce_policy_with_extra(
6023            &req,
6024            "s3:PutObjectTagging",
6025            &bucket,
6026            Some(&key),
6027            Some(&parsed),
6028            existing.as_ref(),
6029        )?;
6030        mgr.put_object_tags(&bucket, &key, parsed);
6031        Ok(S3Response::new(PutObjectTaggingOutput::default()))
6032    }
6033    async fn delete_object_tagging(
6034        &self,
6035        req: S3Request<DeleteObjectTaggingInput>,
6036    ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
6037        // v0.8.17 G-2: reserved-name guard.
6038        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
6039        let Some(mgr) = self.tagging.as_ref() else {
6040            return self.backend.delete_object_tagging(req).await;
6041        };
6042        let bucket = req.input.bucket.clone();
6043        let key = req.input.key.clone();
6044        let existing = mgr.get_object_tags(&bucket, &key);
6045        self.enforce_policy_with_extra(
6046            &req,
6047            "s3:DeleteObjectTagging",
6048            &bucket,
6049            Some(&key),
6050            None,
6051            existing.as_ref(),
6052        )?;
6053        mgr.delete_object_tags(&bucket, &key);
6054        Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
6055    }
6056    async fn get_object_attributes(
6057        &self,
6058        req: S3Request<GetObjectAttributesInput>,
6059    ) -> S3Result<S3Response<GetObjectAttributesOutput>> {
6060        // v0.8.17 G-2: reserved-name guard. Attributes leak the
6061        // sidecar's size + ETag, same shape as F-13's GET concern.
6062        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
6063        self.backend.get_object_attributes(req).await
6064    }
6065    async fn restore_object(
6066        &self,
6067        req: S3Request<RestoreObjectInput>,
6068    ) -> S3Result<S3Response<RestoreObjectOutput>> {
6069        // v0.8.17 G-2: reserved-name guard.
6070        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
6071        self.backend.restore_object(req).await
6072    }
6073    async fn upload_part_copy(
6074        &self,
6075        req: S3Request<UploadPartCopyInput>,
6076    ) -> S3Result<S3Response<UploadPartCopyOutput>> {
6077        // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
6078        // destination PUT + source GET.
6079        let dst_bucket = req.input.bucket.clone();
6080        let dst_key = req.input.key.clone();
6081        // v0.8.17 G-2: reserved-name guard on both destination
6082        // and source. Mirrors what `copy_object` enforces.
6083        self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
6084        if let CopySource::Bucket { key, .. } = &req.input.copy_source {
6085            self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
6086        }
6087        self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
6088        if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
6089            self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
6090        }
6091        self.enforce_rate_limit(&req, &dst_bucket)?;
6092        // v0.2 #6: byte-range aware copy when the source is S4-framed.
6093        //
6094        // For a framed source (multipart upload OR single-PUT framed-v2),
6095        // a naive byte-range passthrough would copy compressed bytes that
6096        // don't align with S4 frame boundaries — silently corrupting the
6097        // result. Instead we GET the source through S4 (which handles
6098        // decompression + Range), re-compress + re-frame as a new part,
6099        // and forward as upload_part. For non-framed sources (S4-untouched
6100        // raw objects), passthrough is correct and we keep the original
6101        // (cheaper) code path.
6102        // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
6103        // copy-source header. Without this, a versioned source bucket
6104        // copy that pins a specific old version would silently fall
6105        // back to "latest", assembling wrong bytes into the destination
6106        // multipart object (silent data corruption).
6107        let CopySource::Bucket {
6108            bucket: src_bucket,
6109            key: src_key,
6110            version_id: src_version_id,
6111        } = &req.input.copy_source
6112        else {
6113            return self.backend.upload_part_copy(req).await;
6114        };
6115        let src_bucket = src_bucket.to_string();
6116        let src_key = src_key.to_string();
6117        let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);
6118
6119        // Probe metadata to decide whether the source needs S4-aware copy.
6120        let head_input = HeadObjectInput {
6121            bucket: src_bucket.clone(),
6122            key: src_key.clone(),
6123            version_id: src_version_id.clone(),
6124            ..Default::default()
6125        };
6126        let head_req = S3Request {
6127            input: head_input,
6128            method: http::Method::HEAD,
6129            uri: req.uri.clone(),
6130            headers: req.headers.clone(),
6131            extensions: http::Extensions::new(),
6132            credentials: req.credentials.clone(),
6133            region: req.region.clone(),
6134            service: req.service.clone(),
6135            trailing_headers: None,
6136        };
6137        let needs_s4_copy = match self.backend.head_object(head_req).await {
6138            Ok(h) => {
6139                is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
6140            }
6141            Err(_) => false,
6142        };
6143        if !needs_s4_copy {
6144            return self.backend.upload_part_copy(req).await;
6145        }
6146
6147        // Resolve the optional source byte range to pass to GET.
6148        let source_range = req
6149            .input
6150            .copy_source_range
6151            .as_ref()
6152            .map(|r| parse_copy_source_range(r))
6153            .transpose()
6154            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
6155
6156        // GET source via S4 (handles decompression + sidecar partial fetch
6157        // when range is present). The result is the requested user-visible
6158        // byte range, fully decompressed. version_id is propagated so
6159        // pinned-version copies fetch the exact version requested.
6160        let mut get_input = GetObjectInput {
6161            bucket: src_bucket.clone(),
6162            key: src_key.clone(),
6163            version_id: src_version_id.clone(),
6164            ..Default::default()
6165        };
6166        get_input.range = source_range;
6167        let get_req = S3Request {
6168            input: get_input,
6169            method: http::Method::GET,
6170            uri: req.uri.clone(),
6171            headers: req.headers.clone(),
6172            extensions: http::Extensions::new(),
6173            credentials: req.credentials.clone(),
6174            region: req.region.clone(),
6175            service: req.service.clone(),
6176            trailing_headers: None,
6177        };
6178        let get_resp = self.get_object(get_req).await?;
6179        let blob = get_resp.output.body.ok_or_else(|| {
6180            S3Error::with_message(
6181                S3ErrorCode::InternalError,
6182                "upload_part_copy: empty body from source GET",
6183            )
6184        })?;
6185        let bytes = collect_blob(blob, self.max_body_bytes)
6186            .await
6187            .map_err(internal("collect upload_part_copy source body"))?;
6188
6189        // Compress + frame as a fresh part (mirrors upload_part path).
6190        let sample_len = bytes.len().min(SAMPLE_BYTES);
6191        // v0.8 #56: same size-hint promotion as the upload_part path.
6192        let codec_kind = self
6193            .dispatcher
6194            .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
6195            .await;
6196        let original_size = bytes.len() as u64;
6197        // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
6198        let (compress_res, tel) = self
6199            .registry
6200            .compress_with_telemetry(bytes, codec_kind)
6201            .await;
6202        stamp_gpu_compress_telemetry(&tel);
6203        let (compressed, manifest) =
6204            compress_res.map_err(internal("registry compress upload_part_copy"))?;
6205        let header = FrameHeader {
6206            codec: codec_kind,
6207            original_size,
6208            compressed_size: compressed.len() as u64,
6209            crc32c: manifest.crc32c,
6210        };
6211        let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
6212        write_frame(&mut framed, header, &compressed);
6213        let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
6214        if !likely_final {
6215            pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
6216        }
6217        let framed_bytes = framed.freeze();
6218        let framed_len = framed_bytes.len() as i64;
6219
6220        // Forward as upload_part to the destination multipart upload.
6221        let part_input = UploadPartInput {
6222            bucket: req.input.bucket.clone(),
6223            key: req.input.key.clone(),
6224            part_number: req.input.part_number,
6225            upload_id: req.input.upload_id.clone(),
6226            body: Some(bytes_to_blob(framed_bytes)),
6227            content_length: Some(framed_len),
6228            ..Default::default()
6229        };
6230        let part_req = S3Request {
6231            input: part_input,
6232            method: http::Method::PUT,
6233            uri: req.uri.clone(),
6234            headers: req.headers.clone(),
6235            extensions: http::Extensions::new(),
6236            credentials: req.credentials.clone(),
6237            region: req.region.clone(),
6238            service: req.service.clone(),
6239            trailing_headers: None,
6240        };
6241        let upload_resp = self.backend.upload_part(part_req).await?;
6242
6243        let copy_output = UploadPartCopyOutput {
6244            copy_part_result: Some(CopyPartResult {
6245                e_tag: upload_resp.output.e_tag.clone(),
6246                ..Default::default()
6247            }),
6248            ..Default::default()
6249        };
6250        Ok(S3Response::new(copy_output))
6251    }
6252
6253    // ---- Object lock / retention / legal hold (v0.5 #30) ----
6254    //
6255    // When an `ObjectLockManager` is attached the configuration / per-object
6256    // state lives in the manager and these handlers serve directly from it;
6257    // when no manager is attached they fall back to the backend (legacy
6258    // passthrough so v0.4 deployments are unaffected).
6259    async fn get_object_lock_configuration(
6260        &self,
6261        req: S3Request<GetObjectLockConfigurationInput>,
6262    ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
6263        self.enforce_policy(
6264            &req,
6265            "s3:GetBucketObjectLockConfiguration",
6266            &req.input.bucket,
6267            None,
6268        )?;
6269        if let Some(mgr) = self.object_lock.as_ref() {
6270            let cfg = mgr
6271                .bucket_default(&req.input.bucket)
6272                .map(|d| ObjectLockConfiguration {
6273                    object_lock_enabled: Some(ObjectLockEnabled::from_static(
6274                        ObjectLockEnabled::ENABLED,
6275                    )),
6276                    rule: Some(ObjectLockRule {
6277                        default_retention: Some(DefaultRetention {
6278                            days: Some(d.retention_days as i32),
6279                            mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
6280                                crate::object_lock::LockMode::Governance => {
6281                                    ObjectLockRetentionMode::GOVERNANCE
6282                                }
6283                                crate::object_lock::LockMode::Compliance => {
6284                                    ObjectLockRetentionMode::COMPLIANCE
6285                                }
6286                            })),
6287                            years: None,
6288                        }),
6289                    }),
6290                });
6291            let output = GetObjectLockConfigurationOutput {
6292                object_lock_configuration: cfg,
6293            };
6294            return Ok(S3Response::new(output));
6295        }
6296        self.backend.get_object_lock_configuration(req).await
6297    }
6298    async fn put_object_lock_configuration(
6299        &self,
6300        req: S3Request<PutObjectLockConfigurationInput>,
6301    ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
6302        self.enforce_policy(
6303            &req,
6304            "s3:PutBucketObjectLockConfiguration",
6305            &req.input.bucket,
6306            None,
6307        )?;
6308        if let Some(mgr) = self.object_lock.as_ref() {
6309            let bucket = req.input.bucket.clone();
6310            if let Some(cfg) = req.input.object_lock_configuration.as_ref()
6311                && let Some(rule) = cfg.rule.as_ref()
6312                && let Some(d) = rule.default_retention.as_ref()
6313            {
6314                let mode = d
6315                    .mode
6316                    .as_ref()
6317                    .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
6318                    .ok_or_else(|| {
6319                        S3Error::with_message(
6320                            S3ErrorCode::InvalidRequest,
6321                            "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
6322                        )
6323                    })?;
6324                // S3 spec: exactly one of Days / Years (we accept Days
6325                // outright and convert Years → Days for storage; Years
6326                // is just a UX shorthand on the wire).
6327                let days: u32 = match (d.days, d.years) {
6328                    (Some(d), None) if d > 0 => d as u32,
6329                    (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
6330                    _ => {
6331                        return Err(S3Error::with_message(
6332                            S3ErrorCode::InvalidRequest,
6333                            "Object Lock default retention requires exactly one of Days or Years (positive integer)",
6334                        ));
6335                    }
6336                };
6337                mgr.set_bucket_default(
6338                    &bucket,
6339                    crate::object_lock::BucketObjectLockDefault {
6340                        mode,
6341                        retention_days: days,
6342                    },
6343                );
6344            }
6345            return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
6346        }
6347        self.backend.put_object_lock_configuration(req).await
6348    }
6349    async fn get_object_legal_hold(
6350        &self,
6351        req: S3Request<GetObjectLegalHoldInput>,
6352    ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
6353        let key = req.input.key.clone();
6354        self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
6355        if let Some(mgr) = self.object_lock.as_ref() {
6356            let on = mgr
6357                .get(&req.input.bucket, &req.input.key)
6358                .map(|s| s.legal_hold_on)
6359                .unwrap_or(false);
6360            let status = ObjectLockLegalHoldStatus::from_static(if on {
6361                ObjectLockLegalHoldStatus::ON
6362            } else {
6363                ObjectLockLegalHoldStatus::OFF
6364            });
6365            let output = GetObjectLegalHoldOutput {
6366                legal_hold: Some(ObjectLockLegalHold {
6367                    status: Some(status),
6368                }),
6369            };
6370            return Ok(S3Response::new(output));
6371        }
6372        self.backend.get_object_legal_hold(req).await
6373    }
6374    async fn put_object_legal_hold(
6375        &self,
6376        req: S3Request<PutObjectLegalHoldInput>,
6377    ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
6378        let key = req.input.key.clone();
6379        self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
6380        if let Some(mgr) = self.object_lock.as_ref() {
6381            let on = req
6382                .input
6383                .legal_hold
6384                .as_ref()
6385                .and_then(|h| h.status.as_ref())
6386                .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
6387                .unwrap_or(false);
6388            mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
6389            return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
6390        }
6391        self.backend.put_object_legal_hold(req).await
6392    }
6393    async fn get_object_retention(
6394        &self,
6395        req: S3Request<GetObjectRetentionInput>,
6396    ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
6397        let key = req.input.key.clone();
6398        self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
6399        if let Some(mgr) = self.object_lock.as_ref() {
6400            let retention = mgr
6401                .get(&req.input.bucket, &req.input.key)
6402                .filter(|s| s.mode.is_some() || s.retain_until.is_some())
6403                .map(|s| {
6404                    let mode = s.mode.map(|m| {
6405                        ObjectLockRetentionMode::from_static(match m {
6406                            crate::object_lock::LockMode::Governance => {
6407                                ObjectLockRetentionMode::GOVERNANCE
6408                            }
6409                            crate::object_lock::LockMode::Compliance => {
6410                                ObjectLockRetentionMode::COMPLIANCE
6411                            }
6412                        })
6413                    });
6414                    let until = s.retain_until.map(chrono_utc_to_timestamp);
6415                    ObjectLockRetention {
6416                        mode,
6417                        retain_until_date: until,
6418                    }
6419                });
6420            let output = GetObjectRetentionOutput { retention };
6421            return Ok(S3Response::new(output));
6422        }
6423        self.backend.get_object_retention(req).await
6424    }
6425    async fn put_object_retention(
6426        &self,
6427        req: S3Request<PutObjectRetentionInput>,
6428    ) -> S3Result<S3Response<PutObjectRetentionOutput>> {
6429        let key = req.input.key.clone();
6430        self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
6431        if let Some(mgr) = self.object_lock.as_ref() {
6432            let bucket = req.input.bucket.clone();
6433            let key = req.input.key.clone();
6434            // v0.8.12 HIGH-7 fix: the bypass header gates Governance
6435            // shortening only when the caller has the matching IAM
6436            // action explicitly allowed; otherwise it's silently
6437            // dropped to `false` and the "shortening Governance
6438            // requires bypass" branch below rejects.
6439            let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
6440            let bypass = if bypass_header {
6441                self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
6442                    .is_ok()
6443            } else {
6444                false
6445            };
6446            let retention = req.input.retention.as_ref().ok_or_else(|| {
6447                S3Error::with_message(
6448                    S3ErrorCode::InvalidRequest,
6449                    "PutObjectRetention requires a Retention element",
6450                )
6451            })?;
6452            let new_mode = retention
6453                .mode
6454                .as_ref()
6455                .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
6456            let new_until = retention
6457                .retain_until_date
6458                .as_ref()
6459                .map(timestamp_to_chrono_utc)
6460                .unwrap_or(None);
6461            let now = chrono::Utc::now();
6462            let existing = mgr.get(&bucket, &key).unwrap_or_default();
6463            // S3 immutability rules:
6464            //   - Compliance is one-way: once set, mode cannot move to
6465            //     Governance, and retain-until cannot be shortened.
6466            //   - Governance can be lengthened freely; shortened only
6467            //     with bypass=true.
6468            if let Some(existing_mode) = existing.mode
6469                && existing_mode == crate::object_lock::LockMode::Compliance
6470                && existing.is_locked(now)
6471            {
6472                if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
6473                    return Err(S3Error::with_message(
6474                        S3ErrorCode::AccessDenied,
6475                        "Cannot downgrade Compliance retention to Governance while lock is active",
6476                    ));
6477                }
6478                if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
6479                    && next < prev
6480                {
6481                    return Err(S3Error::with_message(
6482                        S3ErrorCode::AccessDenied,
6483                        "Cannot shorten Compliance retention while lock is active",
6484                    ));
6485                }
6486            }
6487            if let Some(existing_mode) = existing.mode
6488                && existing_mode == crate::object_lock::LockMode::Governance
6489                && existing.is_locked(now)
6490                && !bypass
6491                && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
6492                && next < prev
6493            {
6494                return Err(S3Error::with_message(
6495                    S3ErrorCode::AccessDenied,
6496                    "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
6497                ));
6498            }
6499            let mut state = existing;
6500            if new_mode.is_some() {
6501                state.mode = new_mode;
6502            }
6503            if new_until.is_some() {
6504                state.retain_until = new_until;
6505            }
6506            mgr.set(&bucket, &key, state);
6507            return Ok(S3Response::new(PutObjectRetentionOutput::default()));
6508        }
6509        self.backend.put_object_retention(req).await
6510    }
6511
6512    // ---- Versioning ----
6513    // list_object_versions is implemented above in the compression-hook
6514    // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
6515    // VersioningManager is attached (v0.5 #34), serves chains directly
6516    // from the in-memory index.
6517    async fn get_bucket_versioning(
6518        &self,
6519        req: S3Request<GetBucketVersioningInput>,
6520    ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
6521        // v0.5 #34: when a VersioningManager is attached, the bucket's
6522        // versioning state lives in the manager (= S4-server's
6523        // authoritative source). Pass-through hits the backend only
6524        // when no manager is configured (legacy v0.4 behaviour).
6525        if let Some(mgr) = self.versioning.as_ref() {
6526            let output = match mgr.state(&req.input.bucket).as_aws_status() {
6527                Some(s) => GetBucketVersioningOutput {
6528                    status: Some(BucketVersioningStatus::from(s.to_owned())),
6529                    ..Default::default()
6530                },
6531                None => GetBucketVersioningOutput::default(),
6532            };
6533            return Ok(S3Response::new(output));
6534        }
6535        self.backend.get_bucket_versioning(req).await
6536    }
6537    async fn put_bucket_versioning(
6538        &self,
6539        req: S3Request<PutBucketVersioningInput>,
6540    ) -> S3Result<S3Response<PutBucketVersioningOutput>> {
6541        // v0.6 #42: MFA gating on the `PutBucketVersioning` request
6542        // itself. S3 spec: when the request body carries an
6543        // `MfaDelete` element (either `Enabled` or `Disabled`), the
6544        // request must include a valid `x-amz-mfa` token — both for
6545        // the *first* enable (so the operator can't quietly side-step
6546        // the gate by never enabling it) and for any subsequent
6547        // change (so a leaked credential alone can't disable MFA
6548        // Delete to bypass it on subsequent DELETEs). Requests that
6549        // omit the `MfaDelete` element entirely (i.e. they flip only
6550        // `Status`) skip this gate, matching AWS.
6551        if let Some(mgr) = self.mfa_delete.as_ref()
6552            && let Some(target_enabled) = req
6553                .input
6554                .versioning_configuration
6555                .mfa_delete
6556                .as_ref()
6557                .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
6558        {
6559            let bucket = req.input.bucket.clone();
6560            let header = req.input.mfa.as_deref();
6561            let secret = mgr.lookup_secret(&bucket);
6562            let verified = match (header, secret.as_ref()) {
6563                (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
6564                    Ok((serial, code)) => {
6565                        serial == s.serial
6566                            && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
6567                    }
6568                    Err(_) => false,
6569                },
6570                _ => false,
6571            };
6572            if !verified {
6573                crate::metrics::record_mfa_delete_denial(&bucket);
6574                let err = if header.is_none() {
6575                    crate::mfa::MfaError::Missing
6576                } else {
6577                    crate::mfa::MfaError::InvalidCode
6578                };
6579                return Err(mfa_error_to_s3(err));
6580            }
6581            mgr.set_bucket_state(&bucket, target_enabled);
6582        }
6583        // v0.5 #34: stash the new state in the manager, then forward to
6584        // the backend so any downstream that *also* tracks state
6585        // (e.g. a real S3 backend) stays in sync. Manager-attached but
6586        // backend rejection is treated as a soft-fail (state is still
6587        // owned by the manager).
6588        if let Some(mgr) = self.versioning.as_ref() {
6589            let new_state = match req
6590                .input
6591                .versioning_configuration
6592                .status
6593                .as_ref()
6594                .map(|s| s.as_str())
6595            {
6596                Some(s) if s.eq_ignore_ascii_case("Enabled") => {
6597                    crate::versioning::VersioningState::Enabled
6598                }
6599                Some(s) if s.eq_ignore_ascii_case("Suspended") => {
6600                    crate::versioning::VersioningState::Suspended
6601                }
6602                _ => crate::versioning::VersioningState::Unversioned,
6603            };
6604            mgr.set_state(&req.input.bucket, new_state);
6605            return Ok(S3Response::new(PutBucketVersioningOutput::default()));
6606        }
6607        self.backend.put_bucket_versioning(req).await
6608    }
6609
6610    // ---- Bucket location ----
6611    async fn get_bucket_location(
6612        &self,
6613        req: S3Request<GetBucketLocationInput>,
6614    ) -> S3Result<S3Response<GetBucketLocationOutput>> {
6615        self.backend.get_bucket_location(req).await
6616    }
6617
6618    // ---- Bucket policy ----
6619    async fn get_bucket_policy(
6620        &self,
6621        req: S3Request<GetBucketPolicyInput>,
6622    ) -> S3Result<S3Response<GetBucketPolicyOutput>> {
6623        self.backend.get_bucket_policy(req).await
6624    }
6625    async fn put_bucket_policy(
6626        &self,
6627        req: S3Request<PutBucketPolicyInput>,
6628    ) -> S3Result<S3Response<PutBucketPolicyOutput>> {
6629        self.backend.put_bucket_policy(req).await
6630    }
6631    async fn delete_bucket_policy(
6632        &self,
6633        req: S3Request<DeleteBucketPolicyInput>,
6634    ) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
6635        self.backend.delete_bucket_policy(req).await
6636    }
6637    async fn get_bucket_policy_status(
6638        &self,
6639        req: S3Request<GetBucketPolicyStatusInput>,
6640    ) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
6641        self.backend.get_bucket_policy_status(req).await
6642    }
6643
6644    // ---- Bucket ACL ----
6645    async fn get_bucket_acl(
6646        &self,
6647        req: S3Request<GetBucketAclInput>,
6648    ) -> S3Result<S3Response<GetBucketAclOutput>> {
6649        self.backend.get_bucket_acl(req).await
6650    }
6651    async fn put_bucket_acl(
6652        &self,
6653        req: S3Request<PutBucketAclInput>,
6654    ) -> S3Result<S3Response<PutBucketAclOutput>> {
6655        self.backend.put_bucket_acl(req).await
6656    }
6657
6658    // ---- Bucket CORS (v0.6 #38) ----
6659    async fn get_bucket_cors(
6660        &self,
6661        req: S3Request<GetBucketCorsInput>,
6662    ) -> S3Result<S3Response<GetBucketCorsOutput>> {
6663        if let Some(mgr) = self.cors.as_ref() {
6664            let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6665                S3Error::with_message(
6666                    S3ErrorCode::NoSuchCORSConfiguration,
6667                    "The CORS configuration does not exist".to_string(),
6668                )
6669            })?;
6670            let rules: Vec<CORSRule> = cfg
6671                .rules
6672                .into_iter()
6673                .map(|r| CORSRule {
6674                    allowed_headers: if r.allowed_headers.is_empty() {
6675                        None
6676                    } else {
6677                        Some(r.allowed_headers)
6678                    },
6679                    allowed_methods: r.allowed_methods,
6680                    allowed_origins: r.allowed_origins,
6681                    expose_headers: if r.expose_headers.is_empty() {
6682                        None
6683                    } else {
6684                        Some(r.expose_headers)
6685                    },
6686                    id: r.id,
6687                    max_age_seconds: r.max_age_seconds.map(|s| s as i32),
6688                })
6689                .collect();
6690            return Ok(S3Response::new(GetBucketCorsOutput {
6691                cors_rules: Some(rules),
6692            }));
6693        }
6694        self.backend.get_bucket_cors(req).await
6695    }
6696    async fn put_bucket_cors(
6697        &self,
6698        req: S3Request<PutBucketCorsInput>,
6699    ) -> S3Result<S3Response<PutBucketCorsOutput>> {
6700        if let Some(mgr) = self.cors.as_ref() {
6701            let cfg = crate::cors::CorsConfig {
6702                rules: req
6703                    .input
6704                    .cors_configuration
6705                    .cors_rules
6706                    .into_iter()
6707                    .map(|r| crate::cors::CorsRule {
6708                        allowed_origins: r.allowed_origins,
6709                        allowed_methods: r.allowed_methods,
6710                        allowed_headers: r.allowed_headers.unwrap_or_default(),
6711                        expose_headers: r.expose_headers.unwrap_or_default(),
6712                        max_age_seconds: r
6713                            .max_age_seconds
6714                            .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
6715                        id: r.id,
6716                    })
6717                    .collect(),
6718            };
6719            // v0.8.15 M-3: AWS S3 rejects `AllowedMethods` outside
6720            // the canonical {GET,PUT,POST,DELETE,HEAD} set (including
6721            // the `*` wildcard). Validate at PutBucketCors time so
6722            // operators see the misconfiguration in the API response
6723            // instead of having silently-broken preflights at the
6724            // browser later.
6725            if let Err(e) = crate::cors::CorsManager::validate(&cfg) {
6726                return Err(S3Error::with_message(
6727                    S3ErrorCode::InvalidArgument,
6728                    e.to_string(),
6729                ));
6730            }
6731            mgr.put(&req.input.bucket, cfg);
6732            return Ok(S3Response::new(PutBucketCorsOutput::default()));
6733        }
6734        self.backend.put_bucket_cors(req).await
6735    }
6736    async fn delete_bucket_cors(
6737        &self,
6738        req: S3Request<DeleteBucketCorsInput>,
6739    ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
6740        if let Some(mgr) = self.cors.as_ref() {
6741            mgr.delete(&req.input.bucket);
6742            return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
6743        }
6744        self.backend.delete_bucket_cors(req).await
6745    }
6746
6747    // ---- Bucket lifecycle (v0.6 #37) ----
6748    async fn get_bucket_lifecycle_configuration(
6749        &self,
6750        req: S3Request<GetBucketLifecycleConfigurationInput>,
6751    ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
6752        if let Some(mgr) = self.lifecycle.as_ref() {
6753            let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6754                S3Error::with_message(
6755                    S3ErrorCode::NoSuchLifecycleConfiguration,
6756                    "The lifecycle configuration does not exist".to_string(),
6757                )
6758            })?;
6759            let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
6760            return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
6761                rules: Some(rules),
6762                transition_default_minimum_object_size: None,
6763            }));
6764        }
6765        self.backend.get_bucket_lifecycle_configuration(req).await
6766    }
6767    async fn put_bucket_lifecycle_configuration(
6768        &self,
6769        req: S3Request<PutBucketLifecycleConfigurationInput>,
6770    ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
6771        if let Some(mgr) = self.lifecycle.as_ref() {
6772            let bucket = req.input.bucket.clone();
6773            let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
6774            let cfg = dto_lifecycle_to_internal(&dto_cfg);
6775            mgr.put(&bucket, cfg);
6776            return Ok(S3Response::new(
6777                PutBucketLifecycleConfigurationOutput::default(),
6778            ));
6779        }
6780        self.backend.put_bucket_lifecycle_configuration(req).await
6781    }
6782    async fn delete_bucket_lifecycle(
6783        &self,
6784        req: S3Request<DeleteBucketLifecycleInput>,
6785    ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
6786        if let Some(mgr) = self.lifecycle.as_ref() {
6787            mgr.delete(&req.input.bucket);
6788            return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
6789        }
6790        self.backend.delete_bucket_lifecycle(req).await
6791    }
6792
6793    // ---- Bucket tagging (v0.6 #39) ----
6794    async fn get_bucket_tagging(
6795        &self,
6796        req: S3Request<GetBucketTaggingInput>,
6797    ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
6798        let Some(mgr) = self.tagging.as_ref() else {
6799            return self.backend.get_bucket_tagging(req).await;
6800        };
6801        let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
6802        Ok(S3Response::new(GetBucketTaggingOutput {
6803            tag_set: tagset_to_aws(&tags),
6804        }))
6805    }
6806    async fn put_bucket_tagging(
6807        &self,
6808        req: S3Request<PutBucketTaggingInput>,
6809    ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
6810        let Some(mgr) = self.tagging.as_ref() else {
6811            return self.backend.put_bucket_tagging(req).await;
6812        };
6813        let bucket = req.input.bucket.clone();
6814        let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6815            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6816        self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6817        mgr.put_bucket_tags(&bucket, parsed);
6818        Ok(S3Response::new(PutBucketTaggingOutput::default()))
6819    }
6820    async fn delete_bucket_tagging(
6821        &self,
6822        req: S3Request<DeleteBucketTaggingInput>,
6823    ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
6824        let Some(mgr) = self.tagging.as_ref() else {
6825            return self.backend.delete_bucket_tagging(req).await;
6826        };
6827        let bucket = req.input.bucket.clone();
6828        self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6829        mgr.delete_bucket_tags(&bucket);
6830        Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
6831    }
6832
6833    // ---- Bucket encryption ----
    /// Forwards `GetBucketEncryption` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn get_bucket_encryption(
        &self,
        req: S3Request<GetBucketEncryptionInput>,
    ) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
        self.backend.get_bucket_encryption(req).await
    }
    /// Forwards `PutBucketEncryption` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn put_bucket_encryption(
        &self,
        req: S3Request<PutBucketEncryptionInput>,
    ) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
        self.backend.put_bucket_encryption(req).await
    }
    /// Forwards `DeleteBucketEncryption` unchanged to the backend; this
    /// handler adds no processing of its own.
    async fn delete_bucket_encryption(
        &self,
        req: S3Request<DeleteBucketEncryptionInput>,
    ) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
        self.backend.delete_bucket_encryption(req).await
    }
6852
6853    // ---- Bucket logging ----
    /// Forwards `GetBucketLogging` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn get_bucket_logging(
        &self,
        req: S3Request<GetBucketLoggingInput>,
    ) -> S3Result<S3Response<GetBucketLoggingOutput>> {
        self.backend.get_bucket_logging(req).await
    }
    /// Forwards `PutBucketLogging` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn put_bucket_logging(
        &self,
        req: S3Request<PutBucketLoggingInput>,
    ) -> S3Result<S3Response<PutBucketLoggingOutput>> {
        self.backend.put_bucket_logging(req).await
    }
6866
6867    // ---- Bucket notification (v0.6 #35) ----
6868    //
6869    // When a `NotificationManager` is attached, S4 itself owns per-bucket
6870    // notification configurations and the PUT / GET handlers route through
6871    // the manager. The wire DTO's queue / topic configurations map onto
6872    // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6873    // EventBridge configurations are accepted on PUT but silently dropped
6874    // (out of scope for v0.6 #35). When no manager is attached the legacy
6875    // backend-passthrough behaviour applies.
6876    async fn get_bucket_notification_configuration(
6877        &self,
6878        req: S3Request<GetBucketNotificationConfigurationInput>,
6879    ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6880        if let Some(mgr) = self.notifications.as_ref() {
6881            let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6882            let dto = notif_to_dto(&cfg);
6883            return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6884                event_bridge_configuration: dto.event_bridge_configuration,
6885                lambda_function_configurations: dto.lambda_function_configurations,
6886                queue_configurations: dto.queue_configurations,
6887                topic_configurations: dto.topic_configurations,
6888            }));
6889        }
6890        self.backend
6891            .get_bucket_notification_configuration(req)
6892            .await
6893    }
6894    async fn put_bucket_notification_configuration(
6895        &self,
6896        req: S3Request<PutBucketNotificationConfigurationInput>,
6897    ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6898        if let Some(mgr) = self.notifications.as_ref() {
6899            let cfg = notif_from_dto(&req.input.notification_configuration);
6900            mgr.put(&req.input.bucket, cfg);
6901            return Ok(S3Response::new(
6902                PutBucketNotificationConfigurationOutput::default(),
6903            ));
6904        }
6905        self.backend
6906            .put_bucket_notification_configuration(req)
6907            .await
6908    }
6909
6910    // ---- Bucket request payment ----
    /// Forwards `GetBucketRequestPayment` unchanged to the backend; this
    /// handler adds no processing of its own.
    async fn get_bucket_request_payment(
        &self,
        req: S3Request<GetBucketRequestPaymentInput>,
    ) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
        self.backend.get_bucket_request_payment(req).await
    }
    /// Forwards `PutBucketRequestPayment` unchanged to the backend; this
    /// handler adds no processing of its own.
    async fn put_bucket_request_payment(
        &self,
        req: S3Request<PutBucketRequestPaymentInput>,
    ) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
        self.backend.put_bucket_request_payment(req).await
    }
6923
6924    // ---- Bucket website ----
    /// Forwards `GetBucketWebsite` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn get_bucket_website(
        &self,
        req: S3Request<GetBucketWebsiteInput>,
    ) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
        self.backend.get_bucket_website(req).await
    }
    /// Forwards `PutBucketWebsite` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn put_bucket_website(
        &self,
        req: S3Request<PutBucketWebsiteInput>,
    ) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
        self.backend.put_bucket_website(req).await
    }
    /// Forwards `DeleteBucketWebsite` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn delete_bucket_website(
        &self,
        req: S3Request<DeleteBucketWebsiteInput>,
    ) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
        self.backend.delete_bucket_website(req).await
    }
6943
6944    // ---- Bucket replication (v0.6 #40) ----
6945    async fn get_bucket_replication(
6946        &self,
6947        req: S3Request<GetBucketReplicationInput>,
6948    ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6949        if let Some(mgr) = self.replication.as_ref() {
6950            return match mgr.get(&req.input.bucket) {
6951                Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6952                    replication_configuration: Some(replication_to_dto(&cfg)),
6953                })),
6954                None => Err(S3Error::with_message(
6955                    S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6956                    format!(
6957                        "no replication configuration on bucket {}",
6958                        req.input.bucket
6959                    ),
6960                )),
6961            };
6962        }
6963        self.backend.get_bucket_replication(req).await
6964    }
6965    async fn put_bucket_replication(
6966        &self,
6967        req: S3Request<PutBucketReplicationInput>,
6968    ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6969        if let Some(mgr) = self.replication.as_ref() {
6970            let cfg = replication_from_dto(&req.input.replication_configuration);
6971            mgr.put(&req.input.bucket, cfg);
6972            return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6973        }
6974        self.backend.put_bucket_replication(req).await
6975    }
6976    async fn delete_bucket_replication(
6977        &self,
6978        req: S3Request<DeleteBucketReplicationInput>,
6979    ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6980        if let Some(mgr) = self.replication.as_ref() {
6981            mgr.delete(&req.input.bucket);
6982            return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6983        }
6984        self.backend.delete_bucket_replication(req).await
6985    }
6986
6987    // ---- Bucket accelerate ----
    /// Forwards `GetBucketAccelerateConfiguration` unchanged to the backend;
    /// this handler adds no processing of its own.
    async fn get_bucket_accelerate_configuration(
        &self,
        req: S3Request<GetBucketAccelerateConfigurationInput>,
    ) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
        self.backend.get_bucket_accelerate_configuration(req).await
    }
    /// Forwards `PutBucketAccelerateConfiguration` unchanged to the backend;
    /// this handler adds no processing of its own.
    async fn put_bucket_accelerate_configuration(
        &self,
        req: S3Request<PutBucketAccelerateConfigurationInput>,
    ) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
        self.backend.put_bucket_accelerate_configuration(req).await
    }
7000
7001    // ---- Bucket ownership controls ----
    /// Forwards `GetBucketOwnershipControls` unchanged to the backend; this
    /// handler adds no processing of its own.
    async fn get_bucket_ownership_controls(
        &self,
        req: S3Request<GetBucketOwnershipControlsInput>,
    ) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
        self.backend.get_bucket_ownership_controls(req).await
    }
    /// Forwards `PutBucketOwnershipControls` unchanged to the backend; this
    /// handler adds no processing of its own.
    async fn put_bucket_ownership_controls(
        &self,
        req: S3Request<PutBucketOwnershipControlsInput>,
    ) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
        self.backend.put_bucket_ownership_controls(req).await
    }
    /// Forwards `DeleteBucketOwnershipControls` unchanged to the backend;
    /// this handler adds no processing of its own.
    async fn delete_bucket_ownership_controls(
        &self,
        req: S3Request<DeleteBucketOwnershipControlsInput>,
    ) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
        self.backend.delete_bucket_ownership_controls(req).await
    }
7020
7021    // ---- Public access block ----
    /// Forwards `GetPublicAccessBlock` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn get_public_access_block(
        &self,
        req: S3Request<GetPublicAccessBlockInput>,
    ) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
        self.backend.get_public_access_block(req).await
    }
    /// Forwards `PutPublicAccessBlock` unchanged to the backend; this handler
    /// adds no processing of its own.
    async fn put_public_access_block(
        &self,
        req: S3Request<PutPublicAccessBlockInput>,
    ) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
        self.backend.put_public_access_block(req).await
    }
    /// Forwards `DeletePublicAccessBlock` unchanged to the backend; this
    /// handler adds no processing of its own.
    async fn delete_public_access_block(
        &self,
        req: S3Request<DeletePublicAccessBlockInput>,
    ) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
        self.backend.delete_public_access_block(req).await
    }
7040
7041    // ====================================================================
7042    // v0.6 #41: S3 Select — server-side SQL filter on object body.
7043    //
7044    // Fetch the object via the regular `get_object` path (so SSE-C /
7045    // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
7046    // run a small SQL subset (CSV + JSON Lines, equality / inequality /
7047    // LIKE / AND / OR / NOT) over the in-memory body, and stream the
7048    // matched rows back as AWS event-stream `Records` + `Stats` + `End`
7049    // frames.
7050    //
7051    // Limitations (deliberate, documented):
7052    //   - Parquet input is rejected with NotImplemented.
7053    //   - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
7054    //     parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
7055    //     domain-specific `InvalidSqlExpression` code).
7056    //   - The body is fully buffered before SQL evaluation (S3 Select
7057    //     streaming-during-evaluation is v0.7 scope).
7058    //   - GPU-accelerated WHERE evaluation is stubbed out (always None).
7059    async fn select_object_content(
7060        &self,
7061        req: S3Request<SelectObjectContentInput>,
7062    ) -> S3Result<S3Response<SelectObjectContentOutput>> {
7063        use crate::select::{
7064            EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
7065            run_select_jsonlines,
7066        };
7067
7068        let select_bucket = req.input.bucket.clone();
7069        let select_key = req.input.key.clone();
7070        self.enforce_rate_limit(&req, &select_bucket)?;
7071        self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
7072
7073        let request = req.input.request;
7074        let sql = request.expression.clone();
7075        if request.expression_type.as_str() != "SQL" {
7076            return Err(S3Error::with_message(
7077                S3ErrorCode::InvalidExpressionType,
7078                format!(
7079                    "ExpressionType must be SQL, got: {}",
7080                    request.expression_type.as_str()
7081                ),
7082            ));
7083        }
7084
7085        let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
7086            SelectInputFormat::JsonLines
7087        } else if let Some(csv) = request.input_serialization.csv.as_ref() {
7088            let has_header = csv
7089                .file_header_info
7090                .as_ref()
7091                .map(|h| {
7092                    let s = h.as_str();
7093                    s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
7094                })
7095                .unwrap_or(false);
7096            let delim = csv
7097                .field_delimiter
7098                .as_deref()
7099                .and_then(|s| s.chars().next())
7100                .unwrap_or(',');
7101            SelectInputFormat::Csv {
7102                has_header,
7103                delimiter: delim,
7104            }
7105        } else if request.input_serialization.parquet.is_some() {
7106            return Err(S3Error::with_message(
7107                S3ErrorCode::NotImplemented,
7108                "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
7109            ));
7110        } else {
7111            return Err(S3Error::with_message(
7112                S3ErrorCode::InvalidRequest,
7113                "InputSerialization requires exactly one of CSV / JSON / Parquet",
7114            ));
7115        };
7116        if let Some(ct) = request.input_serialization.compression_type.as_ref()
7117            && !ct.as_str().eq_ignore_ascii_case("NONE")
7118        {
7119            return Err(S3Error::with_message(
7120                S3ErrorCode::NotImplemented,
7121                format!(
7122                    "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
7123                    ct.as_str()
7124                ),
7125            ));
7126        }
7127
7128        let output_format = if request.output_serialization.json.is_some() {
7129            SelectOutputFormat::Json
7130        } else if request.output_serialization.csv.is_some() {
7131            SelectOutputFormat::Csv
7132        } else {
7133            return Err(S3Error::with_message(
7134                S3ErrorCode::InvalidRequest,
7135                "OutputSerialization requires exactly one of CSV / JSON",
7136            ));
7137        };
7138
7139        let get_input = GetObjectInput {
7140            bucket: select_bucket.clone(),
7141            key: select_key.clone(),
7142            sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
7143            sse_customer_key: req.input.sse_customer_key.clone(),
7144            sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
7145            ..Default::default()
7146        };
7147        let get_req = S3Request {
7148            input: get_input,
7149            method: http::Method::GET,
7150            uri: format!("/{}/{}", select_bucket, select_key)
7151                .parse()
7152                .map_err(|e| {
7153                    S3Error::with_message(
7154                        S3ErrorCode::InternalError,
7155                        format!("constructing inner GET URI: {e}"),
7156                    )
7157                })?,
7158            headers: http::HeaderMap::new(),
7159            extensions: http::Extensions::new(),
7160            credentials: req.credentials.clone(),
7161            region: req.region.clone(),
7162            service: req.service.clone(),
7163            trailing_headers: None,
7164        };
7165        let mut get_resp = self.get_object(get_req).await?;
7166        let blob = get_resp.output.body.take().ok_or_else(|| {
7167            S3Error::with_message(
7168                S3ErrorCode::InternalError,
7169                "Select: object body was empty after GET",
7170            )
7171        })?;
7172        let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
7173            .await
7174            .map_err(internal("collect Select body"))?;
7175        let scanned = body_bytes.len() as u64;
7176
7177        let matched_payload = match input_format {
7178            SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
7179                .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
7180            SelectInputFormat::Csv { .. } => {
7181                run_select_csv(&sql, &body_bytes, input_format, output_format)
7182                    .map_err(|e| select_error_to_s3(e, "CSV"))?
7183            }
7184        };
7185
7186        let returned = matched_payload.len() as u64;
7187        let processed = scanned;
7188        let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
7189        if !matched_payload.is_empty() {
7190            events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
7191                payload: Some(bytes::Bytes::from(matched_payload)),
7192            })));
7193        }
7194        events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
7195            details: Some(Stats {
7196                bytes_scanned: Some(scanned as i64),
7197                bytes_processed: Some(processed as i64),
7198                bytes_returned: Some(returned as i64),
7199            }),
7200        })));
7201        events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
7202        // Touch EventStreamWriter so the public API stays linked into the
7203        // build (the actual wire framing is delegated to s3s).
7204        let _writer = EventStreamWriter::new();
7205
7206        let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
7207        let output = SelectObjectContentOutput {
7208            payload: Some(stream),
7209        };
7210        Ok(S3Response::new(output))
7211    }
7212
7213    // ---- Bucket Inventory configuration (v0.6 #36) ----
7214    //
7215    // When an `InventoryManager` is attached, S4-server owns the
7216    // configuration store and these handlers no longer pass through to
7217    // the backend. The mapping between the s3s-typed
7218    // `InventoryConfiguration` and the inventory module's internal
7219    // `InventoryConfig` is intentionally lossy: only the fields S4
7220    // actually uses for periodic CSV emission survive the round trip
7221    // (id, source bucket, destination bucket / prefix, format, included
    // versions, schedule frequency). Optional fields, encryption, and
    // filter prefixes are accepted on PUT but not stored; GET re-renders
    // them in the AWS-default shape (`None`) so the client still receives
    // a well-formed `InventoryConfiguration`.
7226    async fn put_bucket_inventory_configuration(
7227        &self,
7228        req: S3Request<PutBucketInventoryConfigurationInput>,
7229    ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
7230        if let Some(mgr) = self.inventory.as_ref() {
7231            let cfg = inv_from_dto(
7232                &req.input.bucket,
7233                &req.input.id,
7234                &req.input.inventory_configuration,
7235            );
7236            mgr.put(cfg);
7237            return Ok(S3Response::new(
7238                PutBucketInventoryConfigurationOutput::default(),
7239            ));
7240        }
7241        self.backend.put_bucket_inventory_configuration(req).await
7242    }
7243
7244    async fn get_bucket_inventory_configuration(
7245        &self,
7246        req: S3Request<GetBucketInventoryConfigurationInput>,
7247    ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
7248        if let Some(mgr) = self.inventory.as_ref() {
7249            let cfg = mgr.get(&req.input.bucket, &req.input.id);
7250            if let Some(cfg) = cfg {
7251                let out = GetBucketInventoryConfigurationOutput {
7252                    inventory_configuration: Some(inv_to_dto(&cfg)),
7253                };
7254                return Ok(S3Response::new(out));
7255            }
7256            // AWS returns `NoSuchConfiguration` (404) when the id has no
7257            // matching inventory configuration on the bucket. The
7258            // generated `S3ErrorCode` enum doesn't expose a typed variant
7259            // for this code, so we round-trip through `from_bytes` which
7260            // wraps unknown codes as `Custom(...)` (= the AWS-canonical
7261            // error-code string survives into the XML response envelope).
7262            let code =
7263                S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
7264            return Err(S3Error::with_message(
7265                code,
7266                format!(
7267                    "no inventory configuration with id={} on bucket={}",
7268                    req.input.id, req.input.bucket
7269                ),
7270            ));
7271        }
7272        self.backend.get_bucket_inventory_configuration(req).await
7273    }
7274
7275    async fn list_bucket_inventory_configurations(
7276        &self,
7277        req: S3Request<ListBucketInventoryConfigurationsInput>,
7278    ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
7279        if let Some(mgr) = self.inventory.as_ref() {
7280            let list = mgr.list_for_bucket(&req.input.bucket);
7281            let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
7282            let out = ListBucketInventoryConfigurationsOutput {
7283                continuation_token: req.input.continuation_token.clone(),
7284                inventory_configuration_list: if dto_list.is_empty() {
7285                    None
7286                } else {
7287                    Some(dto_list)
7288                },
7289                is_truncated: Some(false),
7290                next_continuation_token: None,
7291            };
7292            return Ok(S3Response::new(out));
7293        }
7294        self.backend.list_bucket_inventory_configurations(req).await
7295    }
7296
7297    async fn delete_bucket_inventory_configuration(
7298        &self,
7299        req: S3Request<DeleteBucketInventoryConfigurationInput>,
7300    ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
7301        if let Some(mgr) = self.inventory.as_ref() {
7302            mgr.delete(&req.input.bucket, &req.input.id);
7303            return Ok(S3Response::new(
7304                DeleteBucketInventoryConfigurationOutput::default(),
7305            ));
7306        }
7307        self.backend
7308            .delete_bucket_inventory_configuration(req)
7309            .await
7310    }
7311}
7312
7313// ---------------------------------------------------------------------------
7314// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
7315// surface) and our internal `crate::inventory::InventoryConfig`. Only the
7316// fields S4 actually uses for CSV emission survive the round trip; the
7317// missing fields (filter prefix, optional fields, encryption) are dropped on
7318// PUT and re-rendered as the AWS-default shape on GET so the client sees a
7319// well-formed `InventoryConfiguration`.
7320// ---------------------------------------------------------------------------
7321
7322fn inv_from_dto(
7323    bucket: &str,
7324    id: &str,
7325    dto: &InventoryConfiguration,
7326) -> crate::inventory::InventoryConfig {
7327    let frequency_hours = match dto.schedule.frequency.as_str() {
7328        "Weekly" => 24 * 7,
7329        // Daily is the default; anything S4 doesn't recognise (incl.
7330        // empty, which is the s3s-default) maps to Daily so the
7331        // operator's PUT doesn't silently turn into a no-op cadence.
7332        _ => 24,
7333    };
7334    // Parquet/ORC are not supported (issue #36 scope); we still accept
7335    // the PUT so callers don't fail-loud, but we record CSV and rely on
7336    // the operator catching the discrepancy on GET.
7337    let format = crate::inventory::InventoryFormat::Csv;
7338    crate::inventory::InventoryConfig {
7339        id: id.to_owned(),
7340        bucket: bucket.to_owned(),
7341        destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
7342        destination_prefix: dto
7343            .destination
7344            .s3_bucket_destination
7345            .prefix
7346            .clone()
7347            .unwrap_or_default(),
7348        frequency_hours,
7349        format,
7350        included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
7351            dto.included_object_versions.as_str(),
7352        ),
7353    }
7354}
7355
7356fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
7357    InventoryConfiguration {
7358        id: cfg.id.clone(),
7359        is_enabled: true,
7360        included_object_versions: InventoryIncludedObjectVersions::from(
7361            cfg.included_object_versions.as_aws_str().to_owned(),
7362        ),
7363        destination: InventoryDestination {
7364            s3_bucket_destination: InventoryS3BucketDestination {
7365                account_id: None,
7366                bucket: cfg.destination_bucket.clone(),
7367                encryption: None,
7368                format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
7369                prefix: if cfg.destination_prefix.is_empty() {
7370                    None
7371                } else {
7372                    Some(cfg.destination_prefix.clone())
7373                },
7374            },
7375        },
7376        schedule: InventorySchedule {
7377            // `frequency_hours == 168` -> Weekly; everything else maps to
7378            // Daily for the wire response (the manager keeps the precise
7379            // hour count internally for due-checking).
7380            frequency: InventoryFrequency::from(
7381                if cfg.frequency_hours == 24 * 7 {
7382                    "Weekly"
7383                } else {
7384                    "Daily"
7385                }
7386                .to_owned(),
7387            ),
7388        },
7389        filter: None,
7390        optional_fields: None,
7391    }
7392}
7393
7394// ---------------------------------------------------------------------------
7395// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
7396// wire surface) and our internal `crate::notifications::NotificationConfig`.
7397//
7398// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
7399// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
7400// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
7401// surfaces topic / queue rules.
7402//
7403// The webhook destination has no AWS-native wire form: operators configure
7404// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
7405// poking `NotificationManager::put` directly from a custom binary. This
7406// keeps the wire surface AWS-compatible while still letting the always-
7407// available `Webhook` destination be reachable.
7408// ---------------------------------------------------------------------------
7409
7410fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
7411    let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
7412    if let Some(topics) = dto.topic_configurations.as_ref() {
7413        for (idx, t) in topics.iter().enumerate() {
7414            let events = events_from_dto(&t.events);
7415            let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
7416            rules.push(crate::notifications::NotificationRule {
7417                id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
7418                events,
7419                destination: crate::notifications::Destination::Sns {
7420                    topic_arn: t.topic_arn.clone(),
7421                },
7422                filter_prefix: prefix,
7423                filter_suffix: suffix,
7424            });
7425        }
7426    }
7427    if let Some(queues) = dto.queue_configurations.as_ref() {
7428        for (idx, q) in queues.iter().enumerate() {
7429            let events = events_from_dto(&q.events);
7430            let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
7431            rules.push(crate::notifications::NotificationRule {
7432                id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
7433                events,
7434                destination: crate::notifications::Destination::Sqs {
7435                    queue_arn: q.queue_arn.clone(),
7436                },
7437                filter_prefix: prefix,
7438                filter_suffix: suffix,
7439            });
7440        }
7441    }
7442    crate::notifications::NotificationConfig { rules }
7443}
7444
7445fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
7446    let mut topics: Vec<TopicConfiguration> = Vec::new();
7447    let mut queues: Vec<QueueConfiguration> = Vec::new();
7448    for rule in &cfg.rules {
7449        let events: Vec<Event> = rule
7450            .events
7451            .iter()
7452            .map(|e| Event::from(e.as_aws_str().to_owned()))
7453            .collect();
7454        let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
7455        match &rule.destination {
7456            crate::notifications::Destination::Sns { topic_arn } => {
7457                topics.push(TopicConfiguration {
7458                    events,
7459                    filter,
7460                    id: Some(rule.id.clone()),
7461                    topic_arn: topic_arn.clone(),
7462                });
7463            }
7464            crate::notifications::Destination::Sqs { queue_arn } => {
7465                queues.push(QueueConfiguration {
7466                    events,
7467                    filter,
7468                    id: Some(rule.id.clone()),
7469                    queue_arn: queue_arn.clone(),
7470                });
7471            }
7472            // Webhook destinations have no AWS wire equivalent — they
7473            // round-trip through the JSON snapshot only. Skip them on the
7474            // GET surface (an SDK consumer wouldn't know what to do with
7475            // them anyway).
7476            crate::notifications::Destination::Webhook { .. } => {}
7477        }
7478    }
7479    NotificationConfiguration {
7480        event_bridge_configuration: None,
7481        lambda_function_configurations: None,
7482        queue_configurations: if queues.is_empty() {
7483            None
7484        } else {
7485            Some(queues)
7486        },
7487        topic_configurations: if topics.is_empty() {
7488            None
7489        } else {
7490            Some(topics)
7491        },
7492    }
7493}
7494
7495fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
7496    events
7497        .iter()
7498        .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
7499        .collect()
7500}
7501
7502fn filter_from_dto(
7503    f: Option<&NotificationConfigurationFilter>,
7504) -> (Option<String>, Option<String>) {
7505    let Some(f) = f else {
7506        return (None, None);
7507    };
7508    let Some(key) = f.key.as_ref() else {
7509        return (None, None);
7510    };
7511    let Some(rules) = key.filter_rules.as_ref() else {
7512        return (None, None);
7513    };
7514    let mut prefix = None;
7515    let mut suffix = None;
7516    for r in rules {
7517        let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
7518        let value = r.value.clone();
7519        match name.as_deref() {
7520            Some("prefix") => prefix = value,
7521            Some("suffix") => suffix = value,
7522            _ => {}
7523        }
7524    }
7525    (prefix, suffix)
7526}
7527
7528fn filter_to_dto(
7529    prefix: Option<&str>,
7530    suffix: Option<&str>,
7531) -> Option<NotificationConfigurationFilter> {
7532    if prefix.is_none() && suffix.is_none() {
7533        return None;
7534    }
7535    let mut rules: Vec<FilterRule> = Vec::new();
7536    if let Some(p) = prefix {
7537        rules.push(FilterRule {
7538            name: Some(FilterRuleName::from("prefix".to_owned())),
7539            value: Some(p.to_owned()),
7540        });
7541    }
7542    if let Some(s) = suffix {
7543        rules.push(FilterRule {
7544            name: Some(FilterRuleName::from("suffix".to_owned())),
7545            value: Some(s.to_owned()),
7546        });
7547    }
7548    Some(NotificationConfigurationFilter {
7549        key: Some(S3KeyFilter {
7550            filter_rules: Some(rules),
7551        }),
7552    })
7553}
7554
7555// ---------------------------------------------------------------------------
7556// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
7557// wire surface) and our internal `crate::replication::ReplicationConfig`.
7558// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
7559// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
7560// the matcher needs. Sub-blocks v0.6 #40 does not implement
7561// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
7562// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
7563// who set them on PUT see them silently dropped, mirroring "feature not
7564// supported in this release" semantics.
7565// ---------------------------------------------------------------------------
7566
7567fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
7568    let rules = dto
7569        .rules
7570        .iter()
7571        .enumerate()
7572        .map(|(idx, r)| {
7573            let id =
7574                r.id.as_ref()
7575                    .map(|s| s.as_str().to_owned())
7576                    .unwrap_or_else(|| format!("rule-{idx}"));
7577            let priority = r.priority.unwrap_or(0).max(0) as u32;
7578            let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
7579            let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
7580            let destination_bucket = r.destination.bucket.clone();
7581            let destination_storage_class = r
7582                .destination
7583                .storage_class
7584                .as_ref()
7585                .map(|s| s.as_str().to_owned());
7586            crate::replication::ReplicationRule {
7587                id,
7588                priority,
7589                status_enabled,
7590                filter,
7591                destination_bucket,
7592                destination_storage_class,
7593            }
7594        })
7595        .collect();
7596    crate::replication::ReplicationConfig {
7597        role: dto.role.clone(),
7598        rules,
7599    }
7600}
7601
7602fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
7603    let rules = cfg
7604        .rules
7605        .iter()
7606        .map(|r| {
7607            let status = if r.status_enabled {
7608                ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
7609            } else {
7610                ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
7611            };
7612            let destination = Destination {
7613                access_control_translation: None,
7614                account: None,
7615                bucket: r.destination_bucket.clone(),
7616                encryption_configuration: None,
7617                metrics: None,
7618                replication_time: None,
7619                storage_class: r
7620                    .destination_storage_class
7621                    .as_ref()
7622                    .map(|s| StorageClass::from(s.clone())),
7623            };
7624            let filter = Some(replication_filter_to_dto(&r.filter));
7625            ReplicationRule {
7626                delete_marker_replication: None,
7627                destination,
7628                existing_object_replication: None,
7629                filter,
7630                id: Some(r.id.clone()),
7631                prefix: None,
7632                priority: Some(r.priority as i32),
7633                source_selection_criteria: None,
7634                status,
7635            }
7636        })
7637        .collect();
7638    ReplicationConfiguration {
7639        role: cfg.role.clone(),
7640        rules,
7641    }
7642}
7643
7644fn replication_filter_from_dto(
7645    f: Option<&ReplicationRuleFilter>,
7646    rule_level_prefix: Option<&str>,
7647) -> crate::replication::ReplicationFilter {
7648    let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
7649    let mut tags: Vec<(String, String)> = Vec::new();
7650    if let Some(f) = f {
7651        if let Some(p) = f.prefix.as_ref()
7652            && prefix.is_none()
7653        {
7654            prefix = Some(p.clone());
7655        }
7656        if let Some(t) = f.tag.as_ref()
7657            && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7658        {
7659            tags.push((k.clone(), v.clone()));
7660        }
7661        if let Some(and) = f.and.as_ref() {
7662            if let Some(p) = and.prefix.as_ref()
7663                && prefix.is_none()
7664            {
7665                prefix = Some(p.clone());
7666            }
7667            if let Some(ts) = and.tags.as_ref() {
7668                for t in ts {
7669                    if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7670                        tags.push((k.clone(), v.clone()));
7671                    }
7672                }
7673            }
7674        }
7675    }
7676    crate::replication::ReplicationFilter { prefix, tags }
7677}
7678
7679fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
7680    if f.tags.is_empty() {
7681        ReplicationRuleFilter {
7682            and: None,
7683            prefix: f.prefix.clone(),
7684            tag: None,
7685        }
7686    } else if f.tags.len() == 1 && f.prefix.is_none() {
7687        let (k, v) = &f.tags[0];
7688        ReplicationRuleFilter {
7689            and: None,
7690            prefix: None,
7691            tag: Some(Tag {
7692                key: Some(k.clone()),
7693                value: Some(v.clone()),
7694            }),
7695        }
7696    } else {
7697        let tags: Vec<Tag> = f
7698            .tags
7699            .iter()
7700            .map(|(k, v)| Tag {
7701                key: Some(k.clone()),
7702                value: Some(v.clone()),
7703            })
7704            .collect();
7705        ReplicationRuleFilter {
7706            and: Some(ReplicationRuleAndOperator {
7707                prefix: f.prefix.clone(),
7708                tags: Some(tags),
7709            }),
7710            prefix: None,
7711            tag: None,
7712        }
7713    }
7714}
7715
7716// ---------------------------------------------------------------------------
7717// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
7718// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
7719// The internal representation flattens AWS's "Filter | And" disjunction
7720// into a single `LifecycleFilter` struct of optional fields plus a tag
7721// vector. Fields S4's evaluator does not consume
7722// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
7723// `transition_default_minimum_object_size`, the storage class on the
7724// noncurrent expiration) are dropped on PUT and re-rendered as their
7725// AWS-default shape on GET so the client always sees a well-formed
7726// configuration.
7727// ---------------------------------------------------------------------------
7728
7729fn dto_lifecycle_to_internal(
7730    dto: &BucketLifecycleConfiguration,
7731) -> crate::lifecycle::LifecycleConfig {
7732    crate::lifecycle::LifecycleConfig {
7733        rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
7734    }
7735}
7736
7737fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
7738    let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
7739    let filter = rule
7740        .filter
7741        .as_ref()
7742        .map(dto_filter_to_internal)
7743        .unwrap_or_default();
7744    let expiration_days = rule
7745        .expiration
7746        .as_ref()
7747        .and_then(|e| e.days)
7748        .and_then(|d| u32::try_from(d).ok());
7749    let expiration_date = rule
7750        .expiration
7751        .as_ref()
7752        .and_then(|e| e.date.as_ref())
7753        .and_then(timestamp_to_chrono_utc);
7754    let transitions: Vec<crate::lifecycle::TransitionRule> = rule
7755        .transitions
7756        .as_ref()
7757        .map(|ts| {
7758            ts.iter()
7759                .filter_map(|t| {
7760                    let days = u32::try_from(t.days?).ok()?;
7761                    let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
7762                    Some(crate::lifecycle::TransitionRule {
7763                        days,
7764                        storage_class,
7765                    })
7766                })
7767                .collect()
7768        })
7769        .unwrap_or_default();
7770    let noncurrent_version_expiration_days = rule
7771        .noncurrent_version_expiration
7772        .as_ref()
7773        .and_then(|n| n.noncurrent_days)
7774        .and_then(|d| u32::try_from(d).ok());
7775    let abort_incomplete_multipart_upload_days = rule
7776        .abort_incomplete_multipart_upload
7777        .as_ref()
7778        .and_then(|a| a.days_after_initiation)
7779        .and_then(|d| u32::try_from(d).ok());
7780    crate::lifecycle::LifecycleRule {
7781        id: rule.id.clone().unwrap_or_default(),
7782        status,
7783        filter,
7784        expiration_days,
7785        expiration_date,
7786        transitions,
7787        noncurrent_version_expiration_days,
7788        abort_incomplete_multipart_upload_days,
7789    }
7790}
7791
7792fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
7793    let mut prefix = filter.prefix.clone();
7794    let mut tags: Vec<(String, String)> = Vec::new();
7795    let mut size_gt: Option<u64> = filter
7796        .object_size_greater_than
7797        .and_then(|n| u64::try_from(n).ok());
7798    let mut size_lt: Option<u64> = filter
7799        .object_size_less_than
7800        .and_then(|n| u64::try_from(n).ok());
7801    if let Some(t) = &filter.tag
7802        && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7803    {
7804        tags.push((k.clone(), v.clone()));
7805    }
7806    if let Some(and) = &filter.and {
7807        if prefix.is_none() {
7808            prefix = and.prefix.clone();
7809        }
7810        if size_gt.is_none() {
7811            size_gt = and
7812                .object_size_greater_than
7813                .and_then(|n| u64::try_from(n).ok());
7814        }
7815        if size_lt.is_none() {
7816            size_lt = and
7817                .object_size_less_than
7818                .and_then(|n| u64::try_from(n).ok());
7819        }
7820        if let Some(ts) = &and.tags {
7821            for t in ts {
7822                if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7823                    tags.push((k.clone(), v.clone()));
7824                }
7825            }
7826        }
7827    }
7828    crate::lifecycle::LifecycleFilter {
7829        prefix,
7830        tags,
7831        object_size_greater_than: size_gt,
7832        object_size_less_than: size_lt,
7833    }
7834}
7835
7836fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
7837    let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
7838        Some(LifecycleExpiration {
7839            date: rule.expiration_date.map(chrono_utc_to_timestamp),
7840            days: rule.expiration_days.map(|d| d as i32),
7841            expired_object_delete_marker: None,
7842        })
7843    } else {
7844        None
7845    };
7846    let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
7847        None
7848    } else {
7849        Some(
7850            rule.transitions
7851                .iter()
7852                .map(|t| Transition {
7853                    date: None,
7854                    days: Some(t.days as i32),
7855                    storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
7856                })
7857                .collect(),
7858        )
7859    };
7860    let noncurrent_version_expiration =
7861        rule.noncurrent_version_expiration_days
7862            .map(|d| NoncurrentVersionExpiration {
7863                newer_noncurrent_versions: None,
7864                noncurrent_days: Some(d as i32),
7865            });
7866    let abort_incomplete_multipart_upload =
7867        rule.abort_incomplete_multipart_upload_days
7868            .map(|d| AbortIncompleteMultipartUpload {
7869                days_after_initiation: Some(d as i32),
7870            });
7871    let filter = if rule.filter.tags.is_empty()
7872        && rule.filter.object_size_greater_than.is_none()
7873        && rule.filter.object_size_less_than.is_none()
7874    {
7875        rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7876            and: None,
7877            object_size_greater_than: None,
7878            object_size_less_than: None,
7879            prefix: Some(p.clone()),
7880            tag: None,
7881        })
7882    } else if rule.filter.tags.len() == 1
7883        && rule.filter.prefix.is_none()
7884        && rule.filter.object_size_greater_than.is_none()
7885        && rule.filter.object_size_less_than.is_none()
7886    {
7887        let (k, v) = rule.filter.tags[0].clone();
7888        Some(LifecycleRuleFilter {
7889            and: None,
7890            object_size_greater_than: None,
7891            object_size_less_than: None,
7892            prefix: None,
7893            tag: Some(Tag {
7894                key: Some(k),
7895                value: Some(v),
7896            }),
7897        })
7898    } else {
7899        let tags = if rule.filter.tags.is_empty() {
7900            None
7901        } else {
7902            Some(
7903                rule.filter
7904                    .tags
7905                    .iter()
7906                    .map(|(k, v)| Tag {
7907                        key: Some(k.clone()),
7908                        value: Some(v.clone()),
7909                    })
7910                    .collect(),
7911            )
7912        };
7913        Some(LifecycleRuleFilter {
7914            and: Some(LifecycleRuleAndOperator {
7915                object_size_greater_than: rule
7916                    .filter
7917                    .object_size_greater_than
7918                    .and_then(|n| i64::try_from(n).ok()),
7919                object_size_less_than: rule
7920                    .filter
7921                    .object_size_less_than
7922                    .and_then(|n| i64::try_from(n).ok()),
7923                prefix: rule.filter.prefix.clone(),
7924                tags,
7925            }),
7926            object_size_greater_than: None,
7927            object_size_less_than: None,
7928            prefix: None,
7929            tag: None,
7930        })
7931    };
7932    LifecycleRule {
7933        abort_incomplete_multipart_upload,
7934        expiration,
7935        filter,
7936        id: if rule.id.is_empty() {
7937            None
7938        } else {
7939            Some(rule.id.clone())
7940        },
7941        noncurrent_version_expiration,
7942        noncurrent_version_transitions: None,
7943        prefix: None,
7944        status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7945        transitions,
7946    }
7947}
7948
7949// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7950// `chrono_utc_to_timestamp` are defined earlier in this file for the
7951// tagging/notifications work; the lifecycle DTO converters reuse them.)
7952
7953// ---------------------------------------------------------------------------
7954// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7955//
7956// Kept as a self-contained block at the bottom of the file so it doesn't
7957// touch the existing `S4Service` struct, `new()`, or any of the per-op
7958// handlers above. The hook is wired in by the binary at server-build time
7959// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7960//
7961// Lifecycle:
7962//   1. `SigV4aGate::new(store)` is constructed once at boot from the
7963//      operator-supplied credential directory.
7964//   2. For each incoming request, `SigV4aGate::pre_route(&req,
7965//      &requested_region, &canonical_request_bytes)` is invoked BEFORE
7966//      the request hits the S3 framework. If the request claims SigV4a
7967//      and verifies, control returns to the framework. Otherwise a 403
7968//      `SignatureDoesNotMatch` is produced.
7969//   3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7970// ---------------------------------------------------------------------------
7971
/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
///
/// Wraps a [`crate::sigv4a::SigV4aCredentialStore`] and exposes a single
/// `pre_route` entry point that returns `Ok(())` for both
/// "request is plain SigV4 — pass through" and "request is SigV4a and
/// verified", and an `Err(...)` carrying a [`SigV4aGateError`] otherwise.
/// The error's `http_status()` (400 or 403) and `s3_error_code()` give the
/// response the caller should send. Cheap to clone (the inner store is
/// `Arc`-backed).
///
/// v0.8.4 #76 (audit H-6): the gate now enforces an `x-amz-date`
/// freshness window (default 15 min, AWS-spec) and a strict credential
/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`), shutting the
/// captured-request replay vector — previously a stolen valid SigV4a
/// signature could be replayed indefinitely (including DELETE).
#[derive(Debug, Clone)]
pub struct SigV4aGate {
    /// Credential store consulted by `pre_route` to look up the key for the
    /// access-key-id named in the `Authorization` header.
    store: crate::sigv4a::SharedSigV4aCredentialStore,
    /// v0.8.4 #76: how far the request's `x-amz-date` may drift from
    /// the server's clock before being rejected with 403
    /// `RequestTimeTooSkewed`. Matches the AWS S3 spec default of
    /// 15 min when constructed via [`SigV4aGate::new`]; the operator
    /// can override via [`SigV4aGate::with_skew_tolerance`] (CLI flag
    /// `--sigv4a-skew-tolerance-seconds`).
    skew_tolerance: chrono::Duration,
}
7996
7997impl SigV4aGate {
7998    /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7999    pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
8000
8001    #[must_use]
8002    pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
8003        Self {
8004            store,
8005            skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
8006        }
8007    }
8008
8009    /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
8010    /// 15 min). Operators can widen this for high-clock-drift
8011    /// environments or tighten it for compliance regimes that demand
8012    /// stricter freshness.
8013    #[must_use]
8014    pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
8015        self.skew_tolerance = skew;
8016        self
8017    }
8018
8019    /// Read the configured skew tolerance — exposed mostly for test +
8020    /// observability use.
8021    #[must_use]
8022    pub fn skew_tolerance(&self) -> chrono::Duration {
8023        self.skew_tolerance
8024    }
8025
8026    /// Inspect an incoming HTTP request. Behaviour:
8027    ///
8028    /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
8029    ///   prefix) → returns `Ok(())`; the framework's existing SigV4
8030    ///   path handles the request.
8031    /// - SigV4a + valid signature + region match + fresh x-amz-date
8032    ///   → `Ok(())`.
8033    /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
8034    /// - SigV4a + bad signature / region mismatch → `Err` with
8035    ///   `SignatureDoesNotMatch`.
8036    /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
8037    ///   the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
8038    ///   et al.).
8039    ///
8040    /// `canonical_request_bytes` is the SigV4a string-to-sign (or
8041    /// canonical-request bytes; the caller decides) that the framework
8042    /// has already produced for this request. Keeping it as a parameter
8043    /// instead of rebuilding it inside the hook avoids duplicating the
8044    /// canonicalisation logic.
8045    pub fn pre_route<B>(
8046        &self,
8047        req: &http::Request<B>,
8048        requested_region: &str,
8049        canonical_request_bytes: &[u8],
8050    ) -> Result<(), SigV4aGateError> {
8051        self.pre_route_at(
8052            req,
8053            requested_region,
8054            canonical_request_bytes,
8055            chrono::Utc::now(),
8056        )
8057    }
8058
8059    /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
8060    /// tests that need to pin the freshness clock. Production callers
8061    /// use `pre_route` (which calls `chrono::Utc::now()`).
8062    pub fn pre_route_at<B>(
8063        &self,
8064        req: &http::Request<B>,
8065        requested_region: &str,
8066        canonical_request_bytes: &[u8],
8067        now: chrono::DateTime<chrono::Utc>,
8068    ) -> Result<(), SigV4aGateError> {
8069        if !crate::sigv4a::detect(req) {
8070            return Ok(());
8071        }
8072        let auth_hdr = req
8073            .headers()
8074            .get(http::header::AUTHORIZATION)
8075            .and_then(|v| v.to_str().ok())
8076            .ok_or(SigV4aGateError::MissingAuthorization)?;
8077        let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
8078            .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
8079        let region_set = req
8080            .headers()
8081            .get(crate::sigv4a::REGION_SET_HEADER)
8082            .and_then(|v| v.to_str().ok())
8083            .unwrap_or("*");
8084        let key = self
8085            .store
8086            .get(&parsed.access_key_id)
8087            .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
8088        // v0.8.4 #76: snapshot the request headers into a
8089        // lowercase-keyed flat map so `verify_request` can do the
8090        // x-amz-date freshness checks without taking a generic
8091        // `HeaderMap` dep. Cheap because the headers list is tiny.
8092        //
8093        // v0.8.5 #84 (audit H-4): detect duplicate header names while
8094        // we flatten — `HashMap::insert` would silently overwrite the
8095        // first value with the second, mirroring the auth-confusion
8096        // vector the canonical-request builder also defends against.
8097        // Reject upfront so the rest of the gate (freshness check,
8098        // ECDSA verify) never sees a half-truncated header set. We
8099        // detect by checking `contains_key` *before* insertion rather
8100        // than by counting via `headers().get_all`, because the
8101        // upstream `HeaderMap` iteration yields each duplicate entry
8102        // as its own (name, value) pair — the second-seen entry is
8103        // exactly what `contains_key` traps.
8104        let mut header_map: std::collections::HashMap<String, String> =
8105            std::collections::HashMap::with_capacity(req.headers().len());
8106        for (name, value) in req.headers() {
8107            if let Ok(v) = value.to_str() {
8108                let lower = name.as_str().to_ascii_lowercase();
8109                if header_map.contains_key(&lower) {
8110                    return Err(SigV4aGateError::Verify(
8111                        crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
8112                    ));
8113                }
8114                header_map.insert(lower, v.to_string());
8115            }
8116        }
8117        crate::sigv4a::verify_request(
8118            &parsed,
8119            &header_map,
8120            canonical_request_bytes,
8121            key,
8122            region_set,
8123            requested_region,
8124            now,
8125            self.skew_tolerance,
8126        )
8127        .map_err(SigV4aGateError::Verify)?;
8128        Ok(())
8129    }
8130}
8131
/// Failure modes from [`SigV4aGate::pre_route`]. Each variant maps to an
/// HTTP status (400 or 403 — see [`SigV4aGateError::http_status`]) and to
/// one of the AWS-standard error codes `InvalidAccessKeyId`,
/// `SignatureDoesNotMatch`, `RequestTimeTooSkewed` or `InvalidRequest`
/// — see [`SigV4aGateError::s3_error_code`].
#[derive(Debug, thiserror::Error)]
pub enum SigV4aGateError {
    /// The request was detected as SigV4a but has no `Authorization`
    /// header, or the header value is not readable as text.
    #[error("missing Authorization header")]
    MissingAuthorization,
    /// The `Authorization` header could not be parsed as SigV4a.
    #[error("malformed SigV4a Authorization header")]
    MalformedAuthorization,
    /// The credential store has no entry for the access-key-id (payload).
    #[error("unknown SigV4a access-key-id: {0}")]
    UnknownAccessKey(String),
    /// Header flattening or `crate::sigv4a::verify_request` rejected the
    /// request; the payload is the underlying error.
    #[error("SigV4a verification failed: {0}")]
    Verify(#[source] crate::sigv4a::SigV4aError),
}
8147
8148impl SigV4aGateError {
8149    /// AWS S3 error code that should accompany the response.
8150    ///
8151    /// v0.8.4 #76 (audit H-6): the freshness check surfaces
8152    /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
8153    /// failures surface as `InvalidRequest` (400); other failures stay
8154    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
8155    /// surface stays AWS-compatible.
8156    #[must_use]
8157    pub fn s3_error_code(&self) -> &'static str {
8158        match self {
8159            Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
8160            Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
8161                "RequestTimeTooSkewed"
8162            }
8163            Self::Verify(
8164                crate::sigv4a::SigV4aError::MissingXAmzDate
8165                | crate::sigv4a::SigV4aError::InvalidDateFormat
8166                | crate::sigv4a::SigV4aError::DateScopeMismatch
8167                | crate::sigv4a::SigV4aError::XAmzDateNotSigned
8168                | crate::sigv4a::SigV4aError::InvalidTerminator
8169                | crate::sigv4a::SigV4aError::WrongService { .. }
8170                | crate::sigv4a::SigV4aError::InvalidCredentialScope,
8171            ) => "InvalidRequest",
8172            _ => "SignatureDoesNotMatch",
8173        }
8174    }
8175
8176    /// HTTP status code to accompany the response. v0.8.4 #76: format
8177    /// errors that are clearly client mistakes (missing / malformed
8178    /// `x-amz-date`, malformed credential scope, wrong service) are
8179    /// surfaced as 400 InvalidRequest; the rest stay 403.
8180    #[must_use]
8181    pub fn http_status(&self) -> http::StatusCode {
8182        match self {
8183            Self::Verify(
8184                crate::sigv4a::SigV4aError::MissingXAmzDate
8185                | crate::sigv4a::SigV4aError::InvalidDateFormat
8186                | crate::sigv4a::SigV4aError::DateScopeMismatch
8187                | crate::sigv4a::SigV4aError::XAmzDateNotSigned
8188                | crate::sigv4a::SigV4aError::InvalidTerminator
8189                | crate::sigv4a::SigV4aError::WrongService { .. }
8190                | crate::sigv4a::SigV4aError::InvalidCredentialScope,
8191            ) => http::StatusCode::BAD_REQUEST,
8192            _ => http::StatusCode::FORBIDDEN,
8193        }
8194    }
8195}
8196
8197#[cfg(test)]
8198mod tests {
8199    use super::*;
8200
8201    #[test]
8202    fn manifest_roundtrip_via_metadata() {
8203        let original = ChunkManifest {
8204            codec: CodecKind::CpuZstd,
8205            original_size: 1234,
8206            compressed_size: 567,
8207            crc32c: 0xdead_beef,
8208        };
8209        let mut meta: Option<Metadata> = None;
8210        write_manifest(&mut meta, &original);
8211        let extracted = extract_manifest(&meta).expect("manifest must round-trip");
8212        assert_eq!(extracted.codec, original.codec);
8213        assert_eq!(extracted.original_size, original.original_size);
8214        assert_eq!(extracted.compressed_size, original.compressed_size);
8215        assert_eq!(extracted.crc32c, original.crc32c);
8216    }
8217
8218    #[test]
8219    fn missing_metadata_yields_none() {
8220        let meta: Option<Metadata> = None;
8221        assert!(extract_manifest(&meta).is_none());
8222    }
8223
8224    #[test]
8225    fn partial_metadata_yields_none() {
8226        let mut meta = Metadata::new();
8227        meta.insert(META_CODEC.into(), "cpu-zstd".into());
8228        let opt = Some(meta);
8229        assert!(extract_manifest(&opt).is_none());
8230    }
8231
8232    #[test]
8233    fn parse_copy_source_range_basic() {
8234        let r = parse_copy_source_range("bytes=10-20").unwrap();
8235        match r {
8236            s3s::dto::Range::Int { first, last } => {
8237                assert_eq!(first, 10);
8238                assert_eq!(last, Some(20));
8239            }
8240            _ => panic!("expected Int range"),
8241        }
8242    }
8243
8244    #[test]
8245    fn parse_copy_source_range_rejects_inverted() {
8246        let err = parse_copy_source_range("bytes=20-10").unwrap_err();
8247        assert!(err.contains("last < first"));
8248    }
8249
8250    #[test]
8251    fn parse_copy_source_range_rejects_missing_prefix() {
8252        let err = parse_copy_source_range("10-20").unwrap_err();
8253        assert!(err.contains("must start with 'bytes='"));
8254    }
8255
8256    #[test]
8257    fn parse_copy_source_range_rejects_open_ended() {
8258        // S3 upload_part_copy spec requires N-M (closed); suffix and
8259        // open-ended forms are not allowed for this header.
8260        assert!(parse_copy_source_range("bytes=10-").is_err());
8261        assert!(parse_copy_source_range("bytes=-10").is_err());
8262    }
8263
8264    // v0.7 #49: safe_object_uri must round-trip every legal S3 key
8265    // (which includes spaces, slashes, control chars, raw UTF-8) into
8266    // a parseable `http::Uri` instead of panicking like the previous
8267    // `format!(...).parse().unwrap()` call sites did.
8268
8269    #[test]
8270    fn safe_object_uri_basic_ascii() {
8271        let uri = safe_object_uri("bucket", "key").expect("ascii must be safe");
8272        assert_eq!(uri.path(), "/bucket/key");
8273    }
8274
8275    #[test]
8276    fn safe_object_uri_encodes_spaces() {
8277        let uri = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
8278        // RFC 3986 path-segment encoding turns ' ' into %20.
8279        assert!(
8280            uri.path().contains("%20"),
8281            "expected percent-encoded space, got {}",
8282            uri.path()
8283        );
8284        assert!(uri.path().starts_with("/bucket/"));
8285    }
8286
8287    #[test]
8288    fn safe_object_uri_preserves_slashes() {
8289        // S3 keys legally contain '/' as a logical path separator —
8290        // the helper must NOT escape it (otherwise the synthetic URI
8291        // changes the perceived hierarchy).
8292        let uri = safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
8293        assert_eq!(uri.path(), "/bucket/key/with/slashes");
8294    }
8295
8296    #[test]
8297    fn safe_object_uri_handles_newline_without_panic() {
8298        // Newlines are control chars in URIs; whether the result is
8299        // Ok (encoded as %0A) or Err (parse rejects), the helper
8300        // MUST NOT panic. Either outcome is acceptable.
8301        let _ = safe_object_uri("bucket", "key\n");
8302    }
8303
8304    #[test]
8305    fn safe_object_uri_handles_null_byte_without_panic() {
8306        let _ = safe_object_uri("bucket", "key\0bad");
8307    }
8308
8309    #[test]
8310    fn safe_object_uri_handles_unicode_without_panic() {
8311        // RTL override, BOM, plain Japanese — none should panic.
8312        let _ = safe_object_uri("bucket", "rtl\u{202E}override");
8313        let _ = safe_object_uri("bucket", "\u{FEFF}bom-key");
8314        let _ = safe_object_uri("bucket", "日本語キー");
8315    }
8316
8317    #[test]
8318    fn safe_object_uri_no_panic_for_every_byte() {
8319        // Exhaustive byte coverage: 0x00..=0xFF as a 1-byte key.
8320        // None of these may panic. (0x80..=0xFF are not valid UTF-8
8321        // by themselves; we go through `String::from_utf8_lossy` so
8322        // the helper sees a real `&str` regardless of the raw byte.)
8323        for b in 0u8..=255 {
8324            let s = String::from_utf8_lossy(&[b]).into_owned();
8325            let _ = safe_object_uri("bucket", &s);
8326        }
8327    }
8328
8329    /// v0.8.1 #58: smoke test for the DEK-handling shape used by the
8330    /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
8331    /// Mirrors the call pattern (generate_dek → length check → copy
8332    /// into stack `[u8; 32]` → reborrow as `&[u8; 32]` for `SseSource`)
8333    /// without spinning up a full `S4Service`.
8334    ///
8335    /// The real assertion this guards against is a regression where
8336    /// the `Zeroizing` wrapper is accidentally dropped before the
8337    /// stack copy lands (e.g. someone refactors to use
8338    /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
8339    /// or where `&**dek` is rewritten in a way that doesn't compile.
8340    #[tokio::test]
8341    async fn kms_dek_lifetime_within_function_scope() {
8342        use crate::kms::{KmsBackend, LocalKms};
8343        use std::collections::HashMap;
8344        use std::path::PathBuf;
8345        use zeroize::Zeroizing;
8346
8347        let mut keks = HashMap::new();
8348        keks.insert("scope".to_string(), [33u8; 32]);
8349        let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);
8350
8351        // Mirror the put_object KMS branch shape exactly.
8352        let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
8353        assert_eq!(dek.len(), 32);
8354        let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
8355        dek_arr.copy_from_slice(&dek);
8356
8357        // The reborrow used at the SseSource construction site —
8358        // mirrors the call-site pattern where `let dek_ref: &[u8; 32]`
8359        // auto-derefs from a `Zeroizing<[u8; 32]>` reference.
8360        let dek_ref: &[u8; 32] = &dek_arr;
8361        // Sanity: the reborrow points at the same bytes.
8362        assert_eq!(dek_ref, &*dek_arr);
8363        // Wrapped key id flows through unchanged.
8364        assert_eq!(wrapped.key_id, "scope");
8365
8366        // At end of scope, both `dek` (Zeroizing<Vec<u8>>) and
8367        // `dek_arr` (Zeroizing<[u8; 32]>) are dropped, wiping the
8368        // backing memory. Cannot directly assert the wipe (would be
8369        // UB to read freed memory), so this test instead enforces
8370        // that the call shape compiles and executes; the wipe itself
8371        // is exercised by the `zeroize` crate's own test suite.
8372    }
8373
8374    /// v0.8.5 #86 (audit M-2): the replication dispatcher must
8375    /// `acquire_owned()` a permit from `replication_semaphore` before
8376    /// kicking off the destination PUT, so a saturated semaphore
8377    /// back-pressures the in-flight queue depth instead of letting it
8378    /// grow without bound. We exercise the field directly (initial
8379    /// permit count, override via `with_replication_max_concurrent`,
8380    /// permit drop on `Drop`) — the full `spawn_replication_if_matched`
8381    /// integration is exercised by the existing replication tests in
8382    /// `tests/feature_e2e.rs` once a `ReplicationManager` is attached.
8383    #[tokio::test]
8384    async fn replication_semaphore_caps_concurrent_dispatchers() {
8385        // Build a minimal `S4Service` directly — no handler path is
8386        // exercised, only the constructor + setter + accessor shape.
8387        let registry = Arc::new(
8388            CodecRegistry::new(CodecKind::Passthrough)
8389                .with(Arc::new(s4_codec::passthrough::Passthrough)),
8390        );
8391        let dispatcher = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
8392            CodecKind::Passthrough,
8393        ));
8394        let s4 = S4Service::new(NoopBackend, registry, dispatcher);
8395
8396        // Default cap matches the documented constant.
8397        assert_eq!(
8398            s4.replication_semaphore().available_permits(),
8399            S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
8400            "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
8401        );
8402
8403        // Override via the builder — replaces the underlying `Semaphore`.
8404        let s4 = s4.with_replication_max_concurrent(2);
8405        assert_eq!(
8406            s4.replication_semaphore().available_permits(),
8407            2,
8408            "with_replication_max_concurrent(2) must expose exactly 2 permits"
8409        );
8410
8411        // Acquiring permits must reduce `available_permits()` and
8412        // dropping them must restore the count — this is the contract
8413        // `spawn_replication_if_matched` relies on for back-pressure.
8414        let sem = Arc::clone(s4.replication_semaphore());
8415        let p1 = sem.clone().acquire_owned().await.expect("permit 1");
8416        let p2 = sem.clone().acquire_owned().await.expect("permit 2");
8417        assert_eq!(
8418            sem.available_permits(),
8419            0,
8420            "two acquired permits must zero `available_permits()`"
8421        );
8422        // A third `try_acquire_owned` must fail — the cap is enforced
8423        // synchronously, no extra spawn slips through.
8424        assert!(
8425            sem.clone().try_acquire_owned().is_err(),
8426            "third acquire must back-pressure: cap was 2"
8427        );
8428        drop(p1);
8429        drop(p2);
8430        assert_eq!(
8431            sem.available_permits(),
8432            2,
8433            "dropping permits must restore cap"
8434        );
8435
8436        // Lower-bound clamp: a 0 cap would deadlock all dispatchers,
8437        // so the setter clamps it to 1 instead of accepting it
8438        // (callers are warned in the CLI doc).
8439        let s4 = s4.with_replication_max_concurrent(0);
8440        assert_eq!(
8441            s4.replication_semaphore().available_permits(),
8442            1,
8443            "cap=0 must be clamped to 1 to avoid total deadlock"
8444        );
8445    }
8446
8447    /// v0.8.5 #86 (audit M-1): the access-log flusher must return a
8448    /// `JoinHandle<()>` that the caller can `abort()` on shutdown
8449    /// without leaving a dangling task. The pre-#86 call site dropped
8450    /// the handle at end-of-block (silently detaching it); the fix is
8451    /// hoisting it into a process-lived `Vec` so the graceful-shutdown
8452    /// branch in `main.rs` can wait for clean exit. This test exercises
8453    /// the `JoinHandle.abort()` shape directly so a future refactor that
8454    /// stops returning the handle (or returns a non-abortable wrapper)
8455    /// trips this regression guard.
8456    #[tokio::test]
8457    async fn flusher_handle_can_be_aborted_cleanly() {
8458        // Stand up a minimal `AccessLog` pointing at a tmp dir so the
8459        // flusher's `create_dir_all` succeeds. The dir is cleaned up
8460        // by the OS / test harness; we don't assert on the contents.
8461        let tmp = std::env::temp_dir().join(format!(
8462            "s4-86-flusher-{}-{}",
8463            std::process::id(),
8464            std::time::SystemTime::now()
8465                .duration_since(std::time::UNIX_EPOCH)
8466                .map(|d| d.as_nanos())
8467                .unwrap_or(0)
8468        ));
8469        let dest = crate::access_log::AccessLogDest { dir: tmp.clone() };
8470        let log = crate::access_log::AccessLog::new(dest);
8471        let handle = log.spawn_flusher(None);
8472        assert!(
8473            !handle.is_finished(),
8474            "freshly-spawned flusher must not yet be finished"
8475        );
8476        handle.abort();
8477        // `await`-ing an aborted handle returns `Err(JoinError)` whose
8478        // `is_cancelled()` is true.
8479        let join_result = handle.await;
8480        assert!(
8481            join_result.is_err(),
8482            "aborted flusher must surface JoinError, got Ok"
8483        );
8484        assert!(
8485            join_result.unwrap_err().is_cancelled(),
8486            "JoinError must report .is_cancelled() = true after abort()"
8487        );
8488        let _ = std::fs::remove_dir_all(&tmp);
8489    }
8490
8491    /// Stub backend used solely by the v0.8.5 #86 unit tests above —
8492    /// the `S4Service` constructor needs `B: S3` but the tests only
8493    /// exercise builder / accessor shape, never a handler call. Every
8494    /// `S3` method falls through to the trait's default
8495    /// `NotImplemented` (which `s3s` provides automatically).
8496    struct NoopBackend;
8497
8498    #[async_trait::async_trait]
8499    impl S3 for NoopBackend {}
8500
8501    /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the
8502    /// dispatcher spawn site must intercept a panicking inner future,
8503    /// log at ERROR, and bump the per-kind counter — instead of letting
8504    /// the panic propagate as a `JoinError` that no operator dashboard
8505    /// scrapes. We exercise the wrapper directly (rather than driving a
8506    /// full `spawn_replication_if_matched` end-to-end, which would
8507    /// require a full `S4Service` + backend) because the wrapper shape
8508    /// is the load-bearing piece — any inner-future swap would still
8509    /// route through the same `AssertUnwindSafe(...).catch_unwind()`
8510    /// closure we want to lock in here.
8511    #[tokio::test]
8512    async fn dispatcher_panic_caught_and_metric_bumped() {
8513        use futures::FutureExt as _;
8514
8515        let handle = crate::metrics::test_metrics_handle();
8516        let kind = "replication";
8517
8518        // Mirror the production wrapper shape verbatim — if the
8519        // production code ever stops using `AssertUnwindSafe.catch_unwind`
8520        // this test shouldn't keep passing on a hand-rolled copy that
8521        // diverged.
8522        let panicking = async {
8523            panic!("simulated dispatcher panic");
8524        };
8525        let result = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
8526        assert!(
8527            result.is_err(),
8528            "catch_unwind must surface the panic instead of swallowing it"
8529        );
8530        // Bump the production counter via the same helper the wrapper
8531        // calls so the rendered output gates on the production code
8532        // path, not a parallel bookkeeping copy.
8533        crate::metrics::record_dispatcher_panic(kind);
8534
8535        let rendered = handle.render();
8536        assert!(
8537            rendered.contains("s4_dispatcher_panics_total"),
8538            "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
8539        );
8540        assert!(
8541            rendered.contains("kind=\"replication\""),
8542            "expected kind=\"replication\" label in metrics output, got: {rendered}"
8543        );
8544    }
8545
8546    /// v0.9 #106-audit-R2 P2-INT-2: the shared trailer-verify helper
8547    /// short-circuits when the `x-amz-trailer` header is absent (no
8548    /// claim → nothing to verify).
8549    #[test]
8550    fn verify_client_trailer_checksums_passes_when_no_header() {
8551        let computed = crate::streaming_checksum::ComputedDigests::default();
8552        verify_client_trailer_checksums(None, None, &computed).expect("no claim → Ok");
8553    }
8554
8555    /// Helper that only announces non-checksum trailers (e.g. the
8556    /// `x-amz-trailer-signature` SDKs add for SigV4 streaming) is also
8557    /// a no-op — the filter discards them before anything else runs.
8558    #[test]
8559    fn verify_client_trailer_checksums_ignores_non_checksum_trailers() {
8560        let computed = crate::streaming_checksum::ComputedDigests::default();
8561        verify_client_trailer_checksums(Some("x-amz-trailer-signature"), None, &computed)
8562            .expect("non-checksum trailers must not fail");
8563    }
8564
8565    /// Fail-closed: announced checksum trailer + no trailing-headers
8566    /// handle = `BadDigest`. This is the core regression fence for the
8567    /// buffered-path silent-skip the P2-INT-2 fix closes.
8568    #[test]
8569    fn verify_client_trailer_checksums_no_handle_fails_closed() {
8570        let computed = crate::streaming_checksum::ComputedDigests::default();
8571        let err = verify_client_trailer_checksums(Some("x-amz-checksum-crc32c"), None, &computed)
8572            .expect_err("announced trailer with no handle must fail closed");
8573        assert_eq!(err.code().as_str(), "BadDigest");
8574        assert!(
8575            err.message()
8576                .unwrap_or_default()
8577                .contains("trailing-headers handle"),
8578            "error message must hint at the missing handle, got {err:?}"
8579        );
8580    }
8581
8582    /// Case-insensitive trailer name match — AWS SDKs may use any
8583    /// casing per RFC 9110 §5.1. The filter must still detect the
8584    /// `x-amz-checksum-` prefix; the helper then propagates the bad-
8585    /// digest reject via the missing handle.
8586    #[test]
8587    fn verify_client_trailer_checksums_case_insensitive_filter() {
8588        let computed = crate::streaming_checksum::ComputedDigests::default();
8589        let err = verify_client_trailer_checksums(Some("X-Amz-Checksum-Crc32c"), None, &computed)
8590            .expect_err("upper-case trailer name must still be detected");
8591        assert_eq!(err.code().as_str(), "BadDigest");
8592    }
8593
8594    /// Mixed announce: one checksum trailer and one unrelated trailer.
8595    /// The filter retains the checksum one and routes to the fail-closed
8596    /// branch when the handle is absent.
8597    #[test]
8598    fn verify_client_trailer_checksums_mixed_announce_still_validates() {
8599        let computed = crate::streaming_checksum::ComputedDigests::default();
8600        let err = verify_client_trailer_checksums(
8601            Some("x-amz-checksum-sha256, x-amz-trailer-signature"),
8602            None,
8603            &computed,
8604        )
8605        .expect_err("mixed announce with checksum entry must still fail closed");
8606        assert_eq!(err.code().as_str(), "BadDigest");
8607    }
8608}
s4_server/service.rs

s4_server/
service.rs