s4_server/
service.rs

1//! `s3s::S3` 実装 — `s3s_aws::Proxy` への delegation を default にしつつ、
2//! `put_object` / `get_object` 経路で `s4_codec::CodecRegistry` を呼ぶ。
3//!
4//! ## カバー範囲 (Phase 1 月 2)
5//!
6//! - 圧縮 hook あり: `put_object`, `get_object`
7//! - 純 delegation (圧縮なし): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
8//!   `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
9//!   `list_objects_v2`, `create_multipart_upload`, `upload_part`,
10//!   `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
11//!   `list_parts`
12//! - 未対応 (デフォルトで NotImplemented): その他 80+ ops (Tagging / ACL / Lifecycle 等は Phase 2)
13//!
14//! ## アーキテクチャ
15//!
16//! - `S4Service<B>` は backend (B: S3) と `Arc<CodecRegistry>` と `Arc<dyn CodecDispatcher>`
17//!   を保持する。`CodecRegistry` 経由で複数 codec を抱えられるので、ひとつの S4 インスタンスが
18//!   複数 codec で書かれた object を透過的に GET できる
19//! - PUT: dispatcher が body の先頭 sample から codec を選び、registry で compress、
20//!   manifest を S3 metadata に書いて backend に forward
21//! - GET: backend から取得 → metadata から manifest を復元 → registry.decompress で
22//!   manifest 指定の codec で解凍 → 元の bytes を return
23//!
24//! ## 既知の制限事項
25//!
26//! - **Multipart Upload は per-part 圧縮が未実装**: 現状は upload_part を素通し。
27//!   Phase 1 月 2 後半で per-part compress + complete_multipart_upload で manifest 集約。
28//! - **PUT body は memory に collect**: max_body_bytes 上限あり (default 5 GiB = S3 単発 PUT 上限)。
29//!   Streaming-aware 圧縮は Phase 2。
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39    FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40    write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47    bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50    Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51    pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52    supports_streaming_decompress,
53};
54
/// Maximum number of bytes taken from the head of a PUT body and passed
/// along for sampling.
const SAMPLE_BYTES: usize = 4096;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66    if let Some(secs) = tel.gpu_seconds {
67        crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68    }
69    if tel.oom {
70        crate::metrics::record_gpu_oom(tel.codec);
71    }
72}
73
74/// v0.7 #49: percent-encoding set covering everything that is **not** an
75/// `unreserved` character per RFC 3986 §2.3, **plus** we additionally
76/// encode the path-reserved sub-delims that `http::Uri` rejects in a
77/// path segment (`?`, `#`, `%`, control bytes, space, etc.). We
78/// deliberately keep `/` un-encoded because S3 keys legally use `/` as
79/// a logical separator and the rest of the synthetic URI relies on the
80/// path layout `/{bucket}/{key}` round-tripping byte-for-byte.
81const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
82    .add(b' ')
83    .add(b'"')
84    .add(b'#')
85    .add(b'<')
86    .add(b'>')
87    .add(b'?')
88    .add(b'`')
89    .add(b'{')
90    .add(b'}')
91    .add(b'|')
92    .add(b'\\')
93    .add(b'^')
94    .add(b'[')
95    .add(b']')
96    .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110    use percent_encoding::utf8_percent_encode;
111    let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112    let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113    let raw = format!("/{bucket_enc}/{key_enc}");
114    raw.parse::<http::Uri>().map_err(|e| {
115        // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116        // can't be represented in a request URI. The generated
117        // `S3ErrorCode` enum doesn't expose a typed variant for it,
118        // so we round-trip through `from_bytes` which preserves the
119        // canonical wire string while falling back to InvalidArgument
120        // if even that lookup fails (cannot happen at runtime — kept
121        // as a belt-and-suspenders branch so this helper never
122        // panics).
123        let code =
124            S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125        S3Error::with_message(
126            code,
127            format!("object key cannot be encoded as a request URI: {e}"),
128        )
129    })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150    body: &[u8],
151    content_md5_b64: Option<&str>,
152    checksum_crc32_b64: Option<&str>,
153    checksum_crc32c_b64: Option<&str>,
154    checksum_sha1_b64: Option<&str>,
155    checksum_sha256_b64: Option<&str>,
156    checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158    use base64::Engine as _;
159    use md5::Md5;
160    use sha2::Sha256;
161    // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162    // trait methods into scope. Bind anonymously so this `use` is
163    // never flagged as unused while still serving its real purpose.
164    use md5::Digest as _;
165    let b64 = base64::engine::general_purpose::STANDARD;
166    let bad = |what: &str| {
167        let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168        S3Error::with_message(
169            code,
170            format!("client-supplied {what} did not match the received body"),
171        )
172    };
173    if let Some(claimed) = content_md5_b64 {
174        let want = b64.decode(claimed).map_err(|_| {
175            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176        })?;
177        if want.len() != 16 {
178            return Err(S3Error::with_message(
179                S3ErrorCode::InvalidDigest,
180                "Content-MD5 must decode to 16 bytes",
181            ));
182        }
183        let mut h = Md5::new();
184        h.update(body);
185        let got = h.finalize();
186        // `subtle::ConstantTimeEq` would be ideal but the existing
187        // `constant_time_eq` helper in sse.rs is private; use a
188        // straightforward byte compare. The attacker doesn't get to
189        // choose the body retroactively, so a timing oracle here
190        // doesn't help them. `&got[..]` derefs the GenericArray
191        // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192        // generic-array 1.x; CI runs `-D warnings`).
193        if got[..] != *want.as_slice() {
194            return Err(bad("Content-MD5"));
195        }
196    }
197    if let Some(claimed) = checksum_crc32c_b64 {
198        let want = b64.decode(claimed).map_err(|_| {
199            S3Error::with_message(
200                S3ErrorCode::InvalidDigest,
201                "malformed x-amz-checksum-crc32c",
202            )
203        })?;
204        if want.len() != 4 {
205            return Err(S3Error::with_message(
206                S3ErrorCode::InvalidDigest,
207                "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208            ));
209        }
210        let got = crc32c::crc32c(body).to_be_bytes();
211        if got != want.as_slice() {
212            return Err(bad("x-amz-checksum-crc32c"));
213        }
214    }
215    if let Some(claimed) = checksum_sha256_b64 {
216        let want = b64.decode(claimed).map_err(|_| {
217            S3Error::with_message(
218                S3ErrorCode::InvalidDigest,
219                "malformed x-amz-checksum-sha256",
220            )
221        })?;
222        if want.len() != 32 {
223            return Err(S3Error::with_message(
224                S3ErrorCode::InvalidDigest,
225                "x-amz-checksum-sha256 must decode to 32 bytes",
226            ));
227        }
228        let mut h = Sha256::new();
229        h.update(body);
230        let got = h.finalize();
231        if got[..] != *want.as_slice() {
232            return Err(bad("x-amz-checksum-sha256"));
233        }
234    }
235    // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236    // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237    // big-endian value, base64-encoded.
238    if let Some(claimed) = checksum_crc32_b64 {
239        let want = b64.decode(claimed).map_err(|_| {
240            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241        })?;
242        if want.len() != 4 {
243            return Err(S3Error::with_message(
244                S3ErrorCode::InvalidDigest,
245                "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246            ));
247        }
248        let mut h = crc32fast::Hasher::new();
249        h.update(body);
250        let got = h.finalize().to_be_bytes();
251        if got != want.as_slice() {
252            return Err(bad("x-amz-checksum-crc32"));
253        }
254    }
255    // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256    if let Some(claimed) = checksum_sha1_b64 {
257        use sha1::Sha1;
258        let want = b64.decode(claimed).map_err(|_| {
259            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260        })?;
261        if want.len() != 20 {
262            return Err(S3Error::with_message(
263                S3ErrorCode::InvalidDigest,
264                "x-amz-checksum-sha1 must decode to 20 bytes",
265            ));
266        }
267        let mut h = Sha1::new();
268        h.update(body);
269        let got = h.finalize();
270        if got[..] != *want.as_slice() {
271            return Err(bad("x-amz-checksum-sha1"));
272        }
273    }
274    // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275    // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276    // 0xffffffffffffffff, refin / refout true. The reflected
277    // polynomial + 256-entry lookup table are computed lazily on
278    // first call (small enough to inline rather than pull in a
279    // dedicated crc64 crate).
280    if let Some(claimed) = checksum_crc64nvme_b64 {
281        let want = b64.decode(claimed).map_err(|_| {
282            S3Error::with_message(
283                S3ErrorCode::InvalidDigest,
284                "malformed x-amz-checksum-crc64nvme",
285            )
286        })?;
287        if want.len() != 8 {
288            return Err(S3Error::with_message(
289                S3ErrorCode::InvalidDigest,
290                "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291            ));
292        }
293        let got = crc64_nvme(body).to_be_bytes();
294        if got != want.as_slice() {
295            return Err(bad("x-amz-checksum-crc64nvme"));
296        }
297    }
298    Ok(())
299}
300
/// v0.8.12 #128 (MED-C): CRC-64/NVME (AWS S3 `x-amz-checksum-crc64nvme`).
///
/// Parameters per the NVMe spec: poly `0xad93d23594c93659`, init
/// `0xffffffffffffffff`, refin true, refout true, xorout
/// `0xffffffffffffffff`.
///
/// The 256-entry lookup table for the reflected polynomial is built by
/// a `const fn` and stored in a `static`, so it is produced at compile
/// time and a call does no initialisation work.
///
/// Returns the checksum of `bytes`; an empty slice yields `0` (the
/// initial value and the final XOR cancel out).
fn crc64_nvme(bytes: &[u8]) -> u64 {
    /// Reflected polynomial (bit-reverse of 0xad93d23594c93659).
    const POLY_REFLECTED: u64 = 0x9a6c_9329_ac4b_c9b5;

    /// Builds the byte-at-a-time table: entry `i` is the CRC state
    /// after shifting the single byte `i` through eight LSB-first
    /// rounds. Written with `while` loops because `for` is not
    /// available in `const fn`.
    const fn build_table() -> [u64; 256] {
        let mut table = [0u64; 256];
        let mut byte = 0usize;
        while byte < 256 {
            let mut state = byte as u64;
            let mut round = 0;
            while round < 8 {
                state = if state & 1 != 0 {
                    (state >> 1) ^ POLY_REFLECTED
                } else {
                    state >> 1
                };
                round += 1;
            }
            table[byte] = state;
            byte += 1;
        }
        table
    }

    static TABLE: [u64; 256] = build_table();

    // `crc as u8` intentionally keeps only the low byte of the running
    // state; XOR-ed with the input byte it selects the table entry.
    let folded = bytes.iter().fold(!0u64, |crc, &b| {
        (crc >> 8) ^ TABLE[usize::from((crc as u8) ^ b)]
    });
    !folded
}
337
/// v0.4 #20: request details captured at the start of a handler, before
/// the request is consumed by the backend call, so the matching
/// `record_access` at end-of-request can fill in the structured access
/// log entry.
struct AccessLogPreamble {
    /// Client address, when one is available.
    /// NOTE(review): how it is derived is not visible here; confirm at
    /// the capture site.
    remote_ip: Option<String>,
    /// Identity of the caller, when one is available.
    /// NOTE(review): presumably the authenticated principal; confirm.
    requester: Option<String>,
    /// URI of the request being logged (always present).
    request_uri: String,
    /// Client user agent, when one is available.
    /// NOTE(review): presumably the `User-Agent` header; confirm.
    user_agent: Option<String>,
}
347
348pub struct S4Service<B: S3> {
349    /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
350    /// dispatcher can clone it into a detached `tokio::spawn` task
351    /// (Arc::clone is cheap; backend trait methods take `&self` so no
352    /// other handler is affected by the indirection).
353    backend: Arc<B>,
354    registry: Arc<CodecRegistry>,
355    dispatcher: Arc<dyn CodecDispatcher>,
356    max_body_bytes: usize,
357    policy: Option<crate::policy::SharedPolicy>,
358    /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
359    /// to `true` when the listener is wrapped in TLS (or ACME), so policies
360    /// gating "deny if not over TLS" can do their job. Defaults to `false`
361    /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
362    secure_transport: bool,
363    /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
364    rate_limits: Option<crate::rate_limit::SharedRateLimits>,
365    /// v0.4 #20: optional S3-style access log emitter.
366    access_log: Option<crate::access_log::SharedAccessLog>,
367    /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
368    /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
369    /// (with the keyring's active key id) after the compress + framing
370    /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
371    /// frame parsing. A `with_sse_key(...)` call wraps the supplied
372    /// key in a 1-slot keyring so single-key (v0.4) operators get the
373    /// same behaviour they had before, just on the v2 frame.
374    sse_keyring: Option<crate::sse::SharedSseKeyring>,
375    /// v0.5 #34: optional first-class versioning state machine. When
376    /// `Some(...)`, S4-server itself owns the per-bucket versioning
377    /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
378    /// list_object_versions / get_bucket_versioning /
379    /// put_bucket_versioning handlers consult the manager instead of
380    /// passing through. When `None` (default), the legacy
381    /// backend-passthrough behaviour applies so existing v0.4
382    /// deployments are unaffected until they explicitly call
383    /// `with_versioning(...)`.
384    versioning: Option<Arc<crate::versioning::VersioningManager>>,
385    /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
386    /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
387    /// generate a fresh DEK via the backend, encrypt the body with it
388    /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
389    /// S4E4 unwrap the DEK through the same backend before decrypt.
390    /// `kms_default_key_id` is used when the request omits an explicit
391    /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
392    /// bucket-default behaviour).
393    kms: Option<Arc<dyn crate::kms::KmsBackend>>,
394    kms_default_key_id: Option<String>,
395    /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
396    /// `Some(...)`, `delete_object` and overwrite-style `put_object`
397    /// consult the manager and refuse the operation with HTTP 403
398    /// `AccessDenied` while the object is locked (Compliance until
399    /// expiry, Governance unless the bypass header is set, or any time
400    /// a legal hold is on). PUT also auto-applies the bucket-default
401    /// retention to brand-new objects when configured. When `None`
402    /// (default), the legacy backend-passthrough behaviour applies, so
403    /// existing v0.4 deployments are unaffected until they explicitly
404    /// call `with_object_lock(...)`.
405    object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
406    /// v0.6 #38: optional first-class CORS bucket configuration manager.
407    /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
408    /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
409    /// consult the manager instead of passing through to the backend.
410    /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
411    /// style preflight matching through the same store; the actual HTTP
412    /// OPTIONS routing wire-up at the listener level is a follow-up
413    /// (s3s framework does not surface OPTIONS as a typed handler).
414    cors: Option<Arc<crate::cors::CorsManager>>,
415    /// v0.6 #36: optional first-class S3 Inventory manager. When
416    /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
417    /// configurations and `put_bucket_inventory_configuration` /
418    /// `get_bucket_inventory_configuration` /
419    /// `list_bucket_inventory_configurations` /
420    /// `delete_bucket_inventory_configuration` consult the manager
421    /// instead of passing through to the backend. The actual periodic
422    /// CSV emission is driven by a tokio task in `main.rs` that calls
423    /// `InventoryManager::run_once_for_test` on a fixed cadence; the
424    /// service handlers below only deal with config-level CRUD.
425    inventory: Option<Arc<crate::inventory::InventoryManager>>,
426    /// v0.6 #35: optional first-class S3 bucket-notification manager.
427    /// When `Some(...)`, S4-server itself owns per-bucket notification
428    /// configurations and `put_bucket_notification_configuration` /
429    /// `get_bucket_notification_configuration` consult the manager
430    /// instead of passing through to the backend. Successful PUT /
431    /// DELETE handlers fire matching destinations on a detached tokio
432    /// task (best-effort; see `crate::notifications::dispatch_event`).
433    notifications: Option<Arc<crate::notifications::NotificationManager>>,
434    /// v0.6 #37: optional first-class S3 Lifecycle configuration
435    /// manager. When `Some(...)`, S4-server itself owns per-bucket
436    /// lifecycle rules and `put_bucket_lifecycle_configuration` /
437    /// `get_bucket_lifecycle_configuration` /
438    /// `delete_bucket_lifecycle` consult the manager instead of
439    /// passing through to the backend. The actual background scanner
440    /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
441    /// rule) is a v0.7+ follow-up; the test path
442    /// `S4Service::run_lifecycle_once_for_test` exercises the
443    /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
444    /// the configuration-management half without putting a
445    /// half-wired bucket-walk in front of users.
446    lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
447    /// v0.6 #39: optional first-class object + bucket Tagging manager.
448    /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
449    /// per-bucket tag state — `PutObjectTagging` /
450    /// `GetObjectTagging` / `DeleteObjectTagging` /
451    /// `PutBucketTagging` / `GetBucketTagging` /
452    /// `DeleteBucketTagging` route through the manager (replacing the
453    /// previous backend-passthrough behaviour). `put_object` also
454    /// pre-parses the `x-amz-tagging` header / `Tagging` input field
455    /// so the IAM policy evaluator can gate on
456    /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
457    /// On a successful PUT the parsed tags are persisted; on a
458    /// successful DELETE the matching tag entry is dropped.
459    tagging: Option<Arc<crate::tagging::TagManager>>,
460    /// v0.6 #40: optional first-class cross-bucket replication manager.
461    /// When `Some(...)`, S4-server itself owns per-bucket replication
462    /// rules; `PutBucketReplication` / `GetBucketReplication` /
463    /// `DeleteBucketReplication` route through the manager (replacing
464    /// the previous backend-passthrough behaviour). On every successful
465    /// `put_object` the manager's rule list is consulted; the
466    /// highest-priority matching enabled rule wins, the per-key status
467    /// is recorded as `Pending`, and the source body and metadata are
468    /// handed to a detached tokio task that PUTs to the destination
469    /// bucket through the same backend. The replica is stamped with
470    /// `x-amz-replication-status: REPLICA` in its metadata; the
471    /// source-side status is updated to `Completed` on success or
472    /// `Failed` after the 3-attempt retry budget is exhausted (drop
473    /// counter bumps in either-side case so dashboards see the loss).
474    /// `head_object` / `get_object` echo the recorded status back as
475    /// `x-amz-replication-status` so consumers can poll progress.
476    /// Limited to single-instance (same `S4Service`) replication; true
477    /// cross-region (multi-instance) is a v0.7+ follow-up.
478    replication: Option<Arc<crate::replication::ReplicationManager>>,
479    /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
480    /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
481    /// request against a bucket whose MFA-Delete state is `Enabled`
482    /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
483    /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
484    /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
485    /// deployments are unaffected until they explicitly call
486    /// `with_mfa_delete(...)`.
487    mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
488    /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
489    /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
490    /// or be matched against a configured server-managed keyring/KMS).
491    /// Set by `--compliance-mode strict` after the boot-time
492    /// prerequisite check passes.
493    compliance_strict: bool,
494    /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
495    /// gate. When `Some(...)`, the listener-side middleware (see
496    /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
497    /// request and short-circuits SigV4a-signed ones — verifying the
498    /// signature against the credential store and returning 403
499    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
500    /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
501    /// `None`, the middleware is a no-op so the existing SigV4 path is
502    /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
503    sigv4a_gate: Option<Arc<SigV4aGate>>,
504    /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
505    /// SSE / Tagging / Object-Lock context captured at
506    /// `CreateMultipartUpload` time through to `UploadPart` /
507    /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
508    /// store is gateway-internal and idle when no multipart is in
509    /// flight. See [`crate::multipart_state`] for rationale.
510    multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
511    /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
512    /// path. `0` (default) → use the legacy buffered S4E2 path
513    /// (whole-body AES-GCM tag, GET buffers + verifies before
514    /// emitting). Non-zero → use the chunked S4E5 frame so GET can
515    /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
516    /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
517    /// (chunked variants tracked in a follow-up issue).
518    sse_chunk_size: usize,
519    /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
520    /// replication dispatcher in [`Self::spawn_replication_if_matched`].
521    /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
522    /// rules × slow destination = O(10k) in-flight tokio tasks) could
523    /// exhaust process memory before the destination drains. Each
524    /// dispatcher spawn `acquire_owned`s one permit and holds it for the
525    /// lifetime of the destination PUT + status stamp; once the cap is
526    /// reached the dispatcher async-blocks on `acquire_owned()` so the
527    /// listener path itself never stalls — only the in-flight replica
528    /// queue depth is bounded. Default 1024 (operator-tunable via
529    /// `--replication-max-concurrent`).
530    replication_semaphore: Arc<tokio::sync::Semaphore>,
531    /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
532    /// `aws:SourceIp` Condition key only when the operator has
533    /// explicitly opted in via `--trust-x-forwarded-for`. Default
534    /// (`false`) makes the policy evaluator see `source_ip = None`
535    /// for incoming requests, so a public-internet client can no
536    /// longer spoof an internal CIDR by setting `X-Forwarded-For`
537    /// themselves. Operators behind a trusted reverse proxy that
538    /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
539    /// listening directly on the public internet leave it off and
540    /// gain a clear fail-closed default. A future release plumbs
541    /// the TCP peer address through the s3s service trait so we can
542    /// validate the forwarded header against a `--trusted-proxies`
543    /// CIDR list; until then the boolean opt-in closes the immediate
544    /// auth-bypass surface.
545    trust_x_forwarded_for: bool,
546    /// v0.8.17 G-4 (#161): migration escape hatch. When `true`,
547    /// the v0.8.16 F-13 reserved-name guard does NOT block GET /
548    /// HEAD / DELETE on keys ending in `.s4index` — the operator
549    /// is asserting that the deployment may carry pre-v0.8.15
550    /// user objects with that suffix and wants a window to
551    /// migrate them off. Writes (PUT / Copy / Create-Multipart)
552    /// stay blocked regardless of this flag, so attacker
553    /// injection from M-1 / F-13 stays closed. Default
554    /// `false` matches the v0.8.16 behaviour.
555    allow_legacy_reserved_key_reads: bool,
556}
557
/// v0.8.17 G-2: which AWS error shape the reserved-name guard should
/// emit when it fires.
///
/// The split exists so that reads stay consistent with the listing
/// filter hiding the sidecar, while mutations tell the client that the
/// suffix is reserved by design rather than coincidentally missing.
#[derive(Clone, Copy, Debug)]
enum ReservedKeyMode {
    /// Read-style endpoints (GET / HEAD / Attributes / Tagging-read):
    /// the guard answers `NoSuchKey`.
    Read,
    /// Mutating endpoints (PUT / Copy / DELETE / Tagging-write /
    /// ACL-write): the guard answers `InvalidObjectName`.
    Mutating,
}
570
571impl<B: S3> S4Service<B> {
572    /// AWS S3 単発 PUT の API 上限 (5 GiB)
573    pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;
574
575    /// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
576    /// replication dispatcher tasks. See the `replication_semaphore`
577    /// field doc for the rationale + override path.
578    pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
579
580    pub fn new(
581        backend: B,
582        registry: Arc<CodecRegistry>,
583        dispatcher: Arc<dyn CodecDispatcher>,
584    ) -> Self {
585        Self {
586            backend: Arc::new(backend),
587            registry,
588            dispatcher,
589            max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
590            policy: None,
591            secure_transport: false,
592            rate_limits: None,
593            access_log: None,
594            sse_keyring: None,
595            versioning: None,
596            kms: None,
597            kms_default_key_id: None,
598            object_lock: None,
599            cors: None,
600            inventory: None,
601            notifications: None,
602            lifecycle: None,
603            tagging: None,
604            replication: None,
605            mfa_delete: None,
606            compliance_strict: false,
607            sigv4a_gate: None,
608            multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
609            // v0.8 #52: chunked SSE-S4 disabled by default — opt
610            // in via `S4Service::with_sse_chunk_size(...)` /
611            // `--sse-chunk-size <BYTES>`. Default keeps the legacy
612            // S4E2 buffered path so existing deployments are
613            // bit-for-bit unchanged.
614            sse_chunk_size: 0,
615            // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
616            // replication tasks. Picked to be (a) ample headroom over a
617            // typical steady-state replication rate (the v0.8.3 #66
618            // status-sweep doc cites 1k keys/hour as a "steady" rate, so
619            // even a 100x burst lands well under 1024), (b) small enough
620            // that the worst-case memory pinned by stalled dispatchers
621            // — body bytes + metadata — stays bounded (1024 × 5 MiB
622            // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
623            // wider cross-region fan-out can override via
624            // `--replication-max-concurrent`.
625            replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
626                Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
627            )),
628            // v0.8.11 CRIT-4: default fail-closed — ignore client-
629            // supplied `X-Forwarded-For` until the operator opts in
630            // through `with_trust_x_forwarded_for(true)`.
631            trust_x_forwarded_for: false,
632            // v0.8.17 G-4: closed by default; opt in via
633            // `with_allow_legacy_reserved_key_reads(true)` for the
634            // migration window only.
635            allow_legacy_reserved_key_reads: false,
636        }
637    }
638
639    /// v0.8.17 G-4: opt in to a migration window where GET / HEAD /
640    /// DELETE on `<key>.s4index` are allowed even though new
641    /// writes against that suffix stay rejected. Used by operators
642    /// upgrading from pre-v0.8.15 deployments that may carry
643    /// legacy user-owned objects with the now-reserved suffix.
644    /// Defaults to `false`; turn off again once the legacy data
645    /// has been migrated.
646    #[must_use]
647    pub fn with_allow_legacy_reserved_key_reads(mut self, on: bool) -> Self {
648        self.allow_legacy_reserved_key_reads = on;
649        self
650    }
651
652    /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
653    /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
654    /// when the gateway sits behind a trusted reverse proxy that
655    /// strips (or rewrites) any client-supplied value. When left
656    /// off (default), the policy evaluator sees `source_ip = None`
657    /// regardless of what the client sends — closing the
658    /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
659    /// bypass.
660    #[must_use]
661    pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
662        self.trust_x_forwarded_for = on;
663        self
664    }
665
666    /// v0.7 #47: attach the SigV4a verify gate. Once set, the
667    /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
668    /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
669    /// verifying it against the supplied credential store and
670    /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
671    /// are unaffected. When the gate is unset (default), the
672    /// middleware skips entirely so existing SigV4 deployments keep
673    /// working.
674    #[must_use]
675    pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
676        self.sigv4a_gate = Some(gate);
677        self
678    }
679
680    /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
681    /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
682    /// consumes the `S4Service` (the listener-side middleware needs
683    /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
684    /// algorithm tokens with "unknown algorithm" — match has to
685    /// happen at the hyper layer instead).
686    #[must_use]
687    pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
688        self.sigv4a_gate.as_ref()
689    }
690
691    /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
692    /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
693    /// the `S4Service`. The background `sweep_stale` task in `main.rs`
694    /// holds this `Arc` and ticks once an hour to drop abandoned
695    /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
696    #[must_use]
697    pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
698        &self.multipart_state
699    }
700
701    /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
702    /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
703    /// through the manager (instead of forwarding to the backend),
704    /// and `put_object`'s `x-amz-tagging` parse path becomes the
705    /// source of `s3:RequestObjectTag/<key>` for the IAM policy
706    /// evaluator. The manager itself is shared via `Arc`.
707    #[must_use]
708    pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
709        self.tagging = Some(mgr);
710        self
711    }
712
713    /// v0.6 #39: borrow the attached tagging manager (test /
714    /// introspection — the snapshotter in `main.rs`, when wired,
715    /// will keep its own `Arc` clone).
716    #[must_use]
717    pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
718        self.tagging.as_ref()
719    }
720
721    /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
722    /// `put_bucket_inventory_configuration` /
723    /// `get_bucket_inventory_configuration` /
724    /// `list_bucket_inventory_configurations` /
725    /// `delete_bucket_inventory_configuration` route through the
726    /// manager. The actual periodic CSV / manifest emission is
727    /// orchestrated by a tokio task started in `main.rs`; the manager
728    /// itself is shared between the handler and the scheduler via
729    /// `Arc`.
730    #[must_use]
731    pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
732        self.inventory = Some(mgr);
733        self
734    }
735
736    /// v0.6 #36: borrow the attached inventory manager (test /
737    /// introspection — the background scheduler in `main.rs` keeps its
738    /// own `Arc` clone, so this accessor is for the test path that
739    /// invokes `run_once_for_test` directly).
740    #[must_use]
741    pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
742        self.inventory.as_ref()
743    }
744
745    /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
746    /// manager. Once set, `put_bucket_lifecycle_configuration` /
747    /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
748    /// route through the manager (replacing the previous backend-
749    /// passthrough behaviour). The actual periodic scanner that walks
750    /// the source bucket and invokes Expiration / Transition /
751    /// NoncurrentExpiration actions is a v0.7+ follow-up — see
752    /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
753    /// path that exercises the evaluator end-to-end.
754    #[must_use]
755    pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
756        self.lifecycle = Some(mgr);
757        self
758    }
759
760    /// v0.6 #37: borrow the attached lifecycle manager (test /
761    /// introspection — the background scheduler in `main.rs` keeps its
762    /// own `Arc` clone, so this accessor is for the test path that
763    /// invokes the evaluator directly).
764    #[must_use]
765    pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
766        self.lifecycle.as_ref()
767    }
768
769    /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
770    /// against a caller-provided list of `(key, age, size, tags)` tuples
771    /// and returns the `(key, action)` pairs that should fire. The actual
772    /// backend invocation (S3.delete_object / metadata rewrite) is left
773    /// to the caller — the unit + E2E tests use this to verify the
774    /// evaluator without spawning the (deferred) background scanner.
775    /// Returns an empty `Vec` when no lifecycle manager is attached or
776    /// no rule matches.
777    #[must_use]
778    pub fn run_lifecycle_once_for_test(
779        &self,
780        bucket: &str,
781        objects: &[crate::lifecycle::EvaluateBatchEntry],
782    ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
783        let Some(mgr) = self.lifecycle.as_ref() else {
784            return Vec::new();
785        };
786        crate::lifecycle::evaluate_batch(mgr, bucket, objects)
787    }
788
789    /// v0.6 #35: attach the in-memory bucket-notification manager. Once
790    /// set, `put_bucket_notification_configuration` /
791    /// `get_bucket_notification_configuration` route through the manager
792    /// (replacing the previous backend-passthrough behaviour); successful
793    /// `put_object` / `delete_object` calls fire matching destinations
794    /// on a detached tokio task via
795    /// `crate::notifications::dispatch_event` (best-effort, fire-and-
796    /// forget — failures bump the manager's `dropped_total` counter and
797    /// log at warn but do NOT fail the originating S3 request).
798    #[must_use]
799    pub fn with_notifications(
800        mut self,
801        mgr: Arc<crate::notifications::NotificationManager>,
802    ) -> Self {
803        self.notifications = Some(mgr);
804        self
805    }
806
807    /// v0.6 #35: borrow the attached notifications manager (test /
808    /// introspection — used by the metrics layer to read
809    /// `dropped_total`).
810    #[must_use]
811    pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
812        self.notifications.as_ref()
813    }
814
815    /// v0.6 #35: internal helper used by the DELETE handlers to fire a
816    /// matching notification on a detached tokio task. No-op when no
817    /// manager is attached or no rule on the bucket matches the given
818    /// (event, key) tuple.
819    fn fire_delete_notification(
820        &self,
821        bucket: &str,
822        key: &str,
823        event: crate::notifications::EventType,
824        version_id: Option<String>,
825    ) {
826        let Some(mgr) = self.notifications.as_ref() else {
827            return;
828        };
829        let dests = mgr.match_destinations(bucket, &event, key);
830        if dests.is_empty() {
831            return;
832        }
833        tokio::spawn(crate::notifications::dispatch_event(
834            Arc::clone(mgr),
835            bucket.to_owned(),
836            key.to_owned(),
837            event,
838            None,
839            None,
840            version_id,
841            format!("S4-{}", uuid::Uuid::new_v4()),
842        ));
843    }
844
845    /// v0.6 #40: attach the in-memory cross-bucket replication manager.
846    /// Once set, `put_bucket_replication` / `get_bucket_replication` /
847    /// `delete_bucket_replication` route through the manager (replacing
848    /// the previous backend-passthrough behaviour); a successful
849    /// `put_object` whose key matches an enabled rule fires a detached
850    /// tokio task that PUTs the same body + metadata to the rule's
851    /// destination bucket, stamping the replica with
852    /// `x-amz-replication-status: REPLICA`. Failures after the retry
853    /// budget bump the manager's `dropped_total` counter and are
854    /// surfaced in the `s4_replication_dropped_total` Prometheus
855    /// counter; successes bump `s4_replication_replicated_total`.
856    #[must_use]
857    pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
858        self.replication = Some(mgr);
859        self
860    }
861
862    /// v0.6 #40: borrow the attached replication manager (test /
863    /// introspection — used by the metrics layer to read
864    /// `dropped_total`).
865    #[must_use]
866    pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
867        self.replication.as_ref()
868    }
869
870    /// v0.6 #40: internal helper used by the PUT handlers to fire a
871    /// detached cross-bucket replication task. No-op when no manager
872    /// is attached, the source backend PUT failed, or no rule on the
873    /// source bucket matches the (key, tags) tuple. The `body` is the
874    /// post-compression / post-encryption `Bytes` that was sent to
875    /// the source backend (refcount-cloned), and `metadata` is the
876    /// metadata map that already includes the manifest /
877    /// `s4-encrypted` markers — the replica decodes through the same
878    /// path. The destination PUT runs through `Arc<B>::put_object`.
879    ///
880    /// ## v0.8.2 #61: generation token + shadow-key destination
881    ///
882    /// `pending_version` is the source-side `PutOutcome` minted by the
883    /// caller's versioning branch (or `None` for unversioned /
884    /// suspended buckets). When `pending_version.versioned_response`
885    /// is `true`, the dispatcher writes the destination under the same
886    /// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
887    /// destination's version chain receives the new version the same
888    /// way `?versionId=` GET resolves it. Closes audit C-1.
889    ///
890    /// The dispatcher also mints a fresh `generation` token before
891    /// spawning, threaded through to [`crate::replication::
892    /// replicate_object`]. Closes audit C-3 — a stale retry of an
893    /// older PUT can no longer overwrite the destination's newer bytes
894    /// because the CAS guard sees the higher stored generation and
895    /// drops its destination write.
896    ///
897    /// ## Asymmetric versioning policy (out of scope)
898    ///
899    /// We assume source + destination buckets share the same
900    /// versioning policy (both Enabled or both Suspended /
901    /// Unversioned). Cross-bucket policy queries would require a
902    /// backend round-trip per replication, which is not worth it for
903    /// the single-instance scope. Operators who configure asymmetric
904    /// versioning will see destination-side `?versionId=` lookups
905    /// miss — documented as out-of-scope until a future per-rule
906    /// `destination_versioning_policy` knob lands.
907    // 8 args is the post-#61 shape: replication needs the
908    // source bucket+key, the canonical tag set for rule-matching,
909    // the post-codec body+metadata for the destination PUT, the
910    // backend-success gate, and the pending version-id for the
911    // shadow-key destination override. A shape struct would just
912    // split the (single) call site so opt for the inline form.
913    #[allow(clippy::too_many_arguments)]
914    fn spawn_replication_if_matched(
915        &self,
916        source_bucket: &str,
917        source_key: &str,
918        request_tags: &Option<crate::tagging::TagSet>,
919        body: &bytes::Bytes,
920        metadata: &Option<std::collections::HashMap<String, String>>,
921        backend_ok: bool,
922        pending_version: Option<&crate::versioning::PutOutcome>,
923    ) where
924        B: Send + Sync + 'static,
925    {
926        if !backend_ok {
927            return;
928        }
929        let Some(mgr) = self.replication.as_ref() else {
930            return;
931        };
932        // Pull the request's tags into the (k, v) shape the matcher
933        // expects. The tagging manager would have the canonical
934        // post-PUT view but at this point in the pipeline it's
935        // already been written above; for the rule-match decision
936        // the request's tags are sufficient (= the tags this PUT
937        // applies, S3 PutObject is full-replace on tags).
938        let object_tags: Vec<(String, String)> = request_tags
939            .as_ref()
940            .map(|ts| ts.iter().cloned().collect())
941            .unwrap_or_default();
942        let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
943            return;
944        };
945        // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
946        // Pending stamp so the stamp itself carries the right
947        // generation (the CAS in `record_status_if_newer` would
948        // otherwise see a `generation=0` Pending and accept any
949        // stale retry).
950        let generation = mgr.next_generation();
951        // Eagerly mark the source key as Pending so a HEAD between
952        // the source PUT returning and the spawned task completing
953        // surfaces the in-flight state. CAS-guarded so a slower
954        // older PUT can't downgrade a newer Completed back to Pending.
955        let _ = mgr.record_status_if_newer(
956            source_bucket,
957            source_key,
958            generation,
959            crate::replication::ReplicationStatus::Pending,
960        );
961        // v0.8.2 #61: derive the destination storage key. For a
962        // versioning-Enabled source the destination receives the
963        // same shadow-key path so a `?versionId=<vid>` GET on the
964        // destination resolves through the same lookup the source
965        // uses. Suspended / Unversioned sources keep the logical
966        // key (= `None` override = dispatcher uses `source_key`).
967        let destination_key_override = pending_version
968            .filter(|pv| pv.versioned_response)
969            .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
970        // v0.8.3 #68 (audit M-1): capture the source object's Object
971        // Lock state so the dispatcher can decorate the destination
972        // PUT with the matching AWS-wire lock headers. Without this,
973        // a Compliance / Governance / legal-hold protected source
974        // would replicate to a destination where DELETE succeeds
975        // (the WORM posture would only hold on the source).
976        let source_lock_state = self
977            .object_lock
978            .as_ref()
979            .and_then(|mgr| mgr.get(source_bucket, source_key));
980        // v0.8.3 #68: hand the destination-side ObjectLockManager to
981        // the dispatcher closure so we can persist the propagated
982        // lock state on successful destination PUT (the destination
983        // PUT below bypasses S4Service::put_object — we drive the
984        // backend directly — so the explicit_lock_mode commit block
985        // in put_object never fires for replicas. We replay it here
986        // against the destination key.)
987        let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
988        let mgr_cl = Arc::clone(mgr);
989        let backend = Arc::clone(&self.backend);
990        let body_cl = body.clone();
991        let metadata_cl = metadata.clone();
992        let source_bucket_cl = source_bucket.to_owned();
993        let source_key_cl = source_key.to_owned();
994        let source_lock_state_for_closure = source_lock_state.clone();
995        let source_bucket_for_warn = source_bucket.to_owned();
996        // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
997        // depth. Acquire happens INSIDE the spawned task (not on the
998        // listener path) so a saturated semaphore back-pressures the
999        // dispatcher pool without stalling the source PUT response —
1000        // the source has already returned 200 to the client by the time
1001        // the spawn body runs. A failed `acquire_owned` only happens
1002        // when the semaphore is closed (we never close it, so the
1003        // logged-and-skipped fallback is unreachable in practice).
1004        let semaphore = Arc::clone(&self.replication_semaphore);
1005        tokio::spawn(async move {
1006            let _permit = match semaphore.acquire_owned().await {
1007                Ok(p) => p,
1008                Err(e) => {
1009                    tracing::warn!(
1010                        bucket = %source_bucket_cl,
1011                        key = %source_key_cl,
1012                        "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
1013                    );
1014                    return;
1015                }
1016            };
1017            let do_put = move |dest_bucket: String,
1018                               dest_key: String,
1019                               dest_body: bytes::Bytes,
1020                               dest_meta: Option<std::collections::HashMap<String, String>>| {
1021                let backend = Arc::clone(&backend);
1022                let dest_lock_mgr = dest_lock_mgr.clone();
1023                let lock_state = source_lock_state_for_closure.clone();
1024                let warn_src = source_bucket_for_warn.clone();
1025                async move {
1026                    let req = S3Request {
1027                        input: PutObjectInput {
1028                            bucket: dest_bucket.clone(),
1029                            key: dest_key.clone(),
1030                            body: Some(bytes_to_blob(dest_body)),
1031                            metadata: dest_meta,
1032                            ..Default::default()
1033                        },
1034                        method: http::Method::PUT,
1035                        uri: "/".parse().unwrap(),
1036                        headers: http::HeaderMap::new(),
1037                        extensions: http::Extensions::new(),
1038                        credentials: None,
1039                        region: None,
1040                        service: None,
1041                        trailing_headers: None,
1042                    };
1043                    let put_result = backend
1044                        .put_object(req)
1045                        .await
1046                        .map(|_| ())
1047                        .map_err(|e| format!("destination put_object: {e}"));
1048                    // v0.8.3 #68: on successful destination PUT,
1049                    // persist the propagated lock state into the
1050                    // destination's ObjectLockManager so a subsequent
1051                    // DELETE on the destination is refused. Three cases:
1052                    //   - PUT failed     → skip (no replica to protect)
1053                    //   - lock_state None → nothing to propagate
1054                    //   - dest manager None (operator misconfig)
1055                    //                     → log warn-once + bump skip metric
1056                    if put_result.is_ok()
1057                        && let Some(state) = lock_state
1058                    {
1059                        match dest_lock_mgr {
1060                            Some(ref mgr) => {
1061                                mgr.set(&dest_bucket, &dest_key, state);
1062                            }
1063                            None => {
1064                                crate::replication::warn_lock_propagation_skipped(
1065                                    &warn_src,
1066                                    &dest_bucket,
1067                                );
1068                            }
1069                        }
1070                    }
1071                    put_result
1072                }
1073            };
1074            // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
1075            // `futures::FutureExt::catch_unwind` so a panic inside
1076            // `replicate_object` (or any of the user-supplied closures
1077            // it drives — `do_put`, the destination backend, the lock
1078            // manager) does NOT bubble out of the detached task as a
1079            // `JoinError` that no operator dashboard scrapes. Caught
1080            // panics bump `s4_dispatcher_panics_total{kind="replication"}`
1081            // + log at ERROR with the panic payload, so silent feature
1082            // degradation (= every replication PUT panicking and
1083            // dropping the replica without any visible signal) becomes
1084            // a first-class metric the operator can alert on.
1085            //
1086            // `AssertUnwindSafe` is required because the inner future
1087            // captures `Arc<...>` clones + a `do_put` closure that are
1088            // not `UnwindSafe` by default; the safety contract here is
1089            // "we don't continue using any of those captures after the
1090            // panic" which trivially holds (we drop them and return).
1091            use futures::FutureExt as _;
1092            let dispatcher_kind = "replication";
1093            let fut = crate::replication::replicate_object(
1094                rule,
1095                source_bucket_cl,
1096                source_key_cl,
1097                body_cl,
1098                metadata_cl,
1099                do_put,
1100                mgr_cl,
1101                generation,
1102                destination_key_override,
1103                source_lock_state,
1104            );
1105            if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
1106                let panic_msg = panic
1107                    .downcast_ref::<&'static str>()
1108                    .copied()
1109                    .map(str::to_owned)
1110                    .or_else(|| panic.downcast_ref::<String>().cloned())
1111                    .unwrap_or_else(|| "(non-string panic payload)".to_owned());
1112                tracing::error!(
1113                    kind = dispatcher_kind,
1114                    panic_payload = %panic_msg,
1115                    "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
1116                );
1117                crate::metrics::record_dispatcher_panic(dispatcher_kind);
1118            }
1119        });
1120    }
1121
1122    /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1123    /// Once set, every DELETE / DELETE-version / delete-marker /
1124    /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1125    /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1126    /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1127    /// where MFA-Delete is `Disabled` (S3 default).
1128    #[must_use]
1129    pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1130        self.mfa_delete = Some(mgr);
1131        self
1132    }
1133
1134    /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1135    /// introspection — used by the snapshot path in `main.rs` to call
1136    /// `to_json` for restart-recoverable state).
1137    #[must_use]
1138    pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1139        self.mfa_delete.as_ref()
1140    }
1141
1142    /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1143    /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1144    /// route through the manager instead of forwarding to the backend,
1145    /// and [`Self::handle_preflight`] becomes useful for the (future)
1146    /// listener-side OPTIONS interceptor.
1147    #[must_use]
1148    pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1149        self.cors = Some(mgr);
1150        self
1151    }
1152
1153    /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1154    #[must_use]
1155    pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1156        self.cors.as_ref()
1157    }
1158
1159    /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1160    /// configured rules and, if a rule matches, return the headers that
1161    /// the (future) listener-side OPTIONS interceptor must put on the
1162    /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1163    /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1164    /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1165    ///
1166    /// Returns `None` when no manager is attached, no config is
1167    /// registered for the bucket, or no rule matches the (origin,
1168    /// method, headers) triple. The caller is responsible for turning
1169    /// `None` into the appropriate 403 response.
1170    ///
1171    /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1172    /// into the hyper-util listener path) is a follow-up — s3s does not
1173    /// surface OPTIONS as a typed S3 handler, so this method is
1174    /// currently call-able only from inside other handlers and tests.
1175    #[must_use]
1176    pub fn handle_preflight(
1177        &self,
1178        bucket: &str,
1179        origin: &str,
1180        method: &str,
1181        request_headers: &[String],
1182    ) -> Option<std::collections::HashMap<String, String>> {
1183        let mgr = self.cors.as_ref()?;
1184        let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1185        let mut h = std::collections::HashMap::new();
1186        // Echo the matched origin back. If the rule used "*" we still
1187        // echo "*" (S3 spec — the spec does not require us to echo the
1188        // *requesting* origin when the wildcard matched).
1189        let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1190            "*".to_string()
1191        } else {
1192            origin.to_string()
1193        };
1194        h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1195        h.insert(
1196            "Access-Control-Allow-Methods".to_string(),
1197            rule.allowed_methods.join(", "),
1198        );
1199        if !rule.allowed_headers.is_empty() {
1200            // For the Allow-Headers response, echo back the rule's
1201            // pattern list verbatim (S3 echoes the configured list,
1202            // including "*" if present). Browsers honour exact-match
1203            // rules.
1204            h.insert(
1205                "Access-Control-Allow-Headers".to_string(),
1206                rule.allowed_headers.join(", "),
1207            );
1208        }
1209        if let Some(secs) = rule.max_age_seconds {
1210            h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1211        }
1212        if !rule.expose_headers.is_empty() {
1213            h.insert(
1214                "Access-Control-Expose-Headers".to_string(),
1215                rule.expose_headers.join(", "),
1216            );
1217        }
1218        Some(h)
1219    }
1220
1221    /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1222    /// SSE indicator (server-side encryption header or SSE-C customer
1223    /// key); requests without one are rejected with 400 InvalidRequest.
1224    /// Boot-time prerequisite checking lives in the binary
1225    /// (`validate_compliance_mode`) so this flag is purely the runtime
1226    /// switch.
1227    #[must_use]
1228    pub fn with_compliance_strict(mut self, on: bool) -> Self {
1229        self.compliance_strict = on;
1230        self
1231    }
1232
1233    /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1234    /// manager. Once set, `delete_object` and overwrite-path
1235    /// `put_object` refuse operations on locked keys with HTTP 403
1236    /// `AccessDenied`; new PUTs to a bucket with a default retention
1237    /// policy auto-create per-object lock state.
1238    #[must_use]
1239    pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1240        self.object_lock = Some(mgr);
1241        self
1242    }
1243
1244    /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1245    /// the lifecycle scanner uses this to skip currently-locked objects
1246    /// before issuing `delete_object`, since an Object Lock always wins
1247    /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1248    /// shape of [`Self::lifecycle_manager`] /
1249    /// [`Self::tag_manager`] — purely additive accessor, no handler
1250    /// behaviour change.
1251    #[must_use]
1252    pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1253        self.object_lock.as_ref()
1254    }
1255
1256    /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1257    /// when a PUT requests SSE-KMS without naming a specific KMS key
1258    /// (operators set this to mirror AWS S3's bucket-default key).
1259    #[must_use]
1260    pub fn with_kms_backend(
1261        mut self,
1262        kms: Arc<dyn crate::kms::KmsBackend>,
1263        default_key_id: Option<String>,
1264    ) -> Self {
1265        self.kms = Some(kms);
1266        self.kms_default_key_id = default_key_id;
1267        self
1268    }
1269
1270    /// v0.5 #34: attach the first-class versioning state machine. Once
1271    /// set, this `S4Service` owns the per-bucket versioning state +
1272    /// per-(bucket, key) version chain; `put_object` / `get_object` /
1273    /// `delete_object` / `list_object_versions` /
1274    /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1275    /// manager instead of passing through to the backend. The backend
1276    /// is still used as the byte store: Suspended / Unversioned buckets
1277    /// keep using `<key>` directly (legacy), Enabled buckets redirect
1278    /// each version's bytes to a shadow key
1279    /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1280    /// PUTs to the same logical key.
1281    #[must_use]
1282    pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1283        self.versioning = Some(mgr);
1284        self
1285    }
1286
1287    /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1288    /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1289    /// in-memory state to the operator's `--versioning-state-file`
1290    /// without restarting the gateway. Mirrors the shape of
1291    /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1292    /// purely additive accessor, no handler behaviour change.
1293    #[must_use]
1294    pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1295        self.versioning.as_ref()
1296    }
1297
1298    /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1299    /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1300    /// CLI flag in `main.rs`. Operators running heavy cross-region
1301    /// fan-out may need to raise this; operators on memory-constrained
1302    /// hosts may need to lower it. The new value replaces the existing
1303    /// `Semaphore` (so calling this after dispatchers are already in
1304    /// flight is fine — the in-flight tasks hold permits from the old
1305    /// semaphore which is dropped when its last permit is released).
1306    /// A `max` of 0 would deadlock all replicas; the value is silently
1307    /// clamped to 1 instead.
1308    #[must_use]
1309    pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1310        let max = max.max(1);
1311        self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1312        self
1313    }
1314
1315    /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1316    /// concurrency permit pool. Tests inspect `available_permits()`
1317    /// after invoking `spawn_replication_if_matched` to verify the
1318    /// dispatcher actually `acquire_owned`s before kicking off the
1319    /// destination PUT.
1320    #[must_use]
1321    pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1322        &self.replication_semaphore
1323    }
1324
1325    /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1326    /// Internally wraps it in a 1-slot keyring with id=1 active, so
1327    /// new objects ride the v0.5 S4E2 frame while previously-written
1328    /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1329    /// fallback path. Operators wanting true rotation should call
1330    /// [`Self::with_sse_keyring`] instead.
1331    #[must_use]
1332    pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1333        let keyring = crate::sse::SseKeyring::new(1, key);
1334        self.sse_keyring = Some(std::sync::Arc::new(keyring));
1335        self
1336    }
1337
1338    /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1339    /// the active key (S4E2 frame stamped with that key's id); GET
1340    /// dispatches on the body's magic — S4E1 falls back to trying every
1341    /// key in the ring (active first) so v0.4 objects survive a
1342    /// migration; S4E2 looks up the explicit key_id from the header.
1343    #[must_use]
1344    pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1345        self.sse_keyring = Some(keyring);
1346        self
1347    }
1348
1349    /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1350    /// (so the matching GET can stream-decrypt chunk-by-chunk
1351    /// instead of buffering the entire body before tag verify).
1352    /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1353    /// disables the path and reverts to the legacy S4E2 buffered
1354    /// frame.
1355    ///
1356    /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1357    /// the chunked envelopes for those flows are a follow-up issue
1358    /// (the customer-key wire surface needs separate version
1359    /// negotiation).
1360    ///
1361    /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1362    /// not also set — the chunked path runs only on the SSE-S4
1363    /// branch of `put_object`.
1364    #[must_use]
1365    pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1366        self.sse_chunk_size = bytes;
1367        self
1368    }
1369
1370    /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1371    /// PUT / GET / DELETE / List handler emits one entry into the
1372    /// emitter's buffer; a background flusher (started separately, see
1373    /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1374    /// rotated `.log` files into the configured directory.
1375    #[must_use]
1376    pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1377        self.access_log = Some(log);
1378        self
1379    }
1380
1381    /// Capture the per-request access-log preamble before the request is
1382    /// consumed by the backend call. Returns `None` if no access logger
1383    /// is configured (cheap early-out so the handler doesn't pay the
1384    /// header-clone cost when access logging is off).
1385    fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1386        self.access_log.as_ref()?;
1387        Some(AccessLogPreamble {
1388            // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1389            // Recording a client-controllable header in the access log
1390            // would poison forensic queries; leave it `None` until the
1391            // operator declares X-Forwarded-For is set by a trusted
1392            // proxy.
1393            remote_ip: if self.trust_x_forwarded_for {
1394                req.headers
1395                    .get("x-forwarded-for")
1396                    .and_then(|v| v.to_str().ok())
1397                    .and_then(|raw| raw.split(',').next())
1398                    .map(|s| s.trim().to_owned())
1399            } else {
1400                None
1401            },
1402            requester: Self::principal_of(req).map(str::to_owned),
1403            request_uri: format!("{} {}", req.method, req.uri.path()),
1404            user_agent: req
1405                .headers
1406                .get("user-agent")
1407                .and_then(|v| v.to_str().ok())
1408                .map(str::to_owned),
1409        })
1410    }
1411
1412    /// Internal — called by handlers at end-of-request with a captured
1413    /// preamble. Best-effort: swallows the await fast (clones Arc +
1414    /// pushes), no error propagation back to the request path.
1415    #[allow(clippy::too_many_arguments)]
1416    async fn record_access(
1417        &self,
1418        preamble: Option<AccessLogPreamble>,
1419        operation: &'static str,
1420        bucket: &str,
1421        key: Option<&str>,
1422        http_status: u16,
1423        bytes_sent: u64,
1424        object_size: u64,
1425        total_time_ms: u64,
1426        error_code: Option<&str>,
1427    ) {
1428        let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1429            return;
1430        };
1431        log.record(crate::access_log::AccessLogEntry {
1432            time: std::time::SystemTime::now(),
1433            bucket: bucket.to_owned(),
1434            remote_ip: p.remote_ip,
1435            requester: p.requester,
1436            operation,
1437            key: key.map(str::to_owned),
1438            request_uri: p.request_uri,
1439            http_status,
1440            error_code: error_code.map(str::to_owned),
1441            bytes_sent,
1442            object_size,
1443            total_time_ms,
1444            user_agent: p.user_agent,
1445        })
1446        .await;
1447    }
1448
1449    /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1450    /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1451    /// throttle-checked before the policy gate; throttled requests return
1452    /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1453    /// `s4_rate_limit_throttled_total{principal,bucket}`.
1454    #[must_use]
1455    pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1456        self.rate_limits = Some(rl);
1457        self
1458    }
1459
1460    /// Helper used by request handlers to apply the rate limit. Returns
1461    /// `Ok(())` when allowed (or no rate limiter is configured), or a
1462    /// `SlowDown` S3Error otherwise.
1463    fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1464        let Some(rl) = self.rate_limits.as_ref() else {
1465            return Ok(());
1466        };
1467        let principal_id = Self::principal_of(req);
1468        if !rl.check(principal_id, bucket) {
1469            crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1470            return Err(S3Error::with_message(
1471                S3ErrorCode::SlowDown,
1472                format!("rate-limited: bucket={bucket}"),
1473            ));
1474        }
1475        Ok(())
1476    }
1477
1478    /// Tell the policy evaluator that the listener is reached over TLS
1479    /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1480    /// resolves to `true`. Defaults to `false`.
1481    #[must_use]
1482    pub fn with_secure_transport(mut self, on: bool) -> Self {
1483        self.secure_transport = on;
1484        self
1485    }
1486
1487    #[must_use]
1488    pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1489        self.max_body_bytes = n;
1490        self
1491    }
1492
1493    /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1494    /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1495    /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1496    /// When `None` (the default), no policy enforcement happens.
1497    #[must_use]
1498    pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1499        self.policy = Some(policy);
1500        self
1501    }
1502
1503    /// Pull the SigV4 access key id off the request's credentials, if any.
1504    /// Used as the `principal_id` for policy evaluation.
1505    fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1506        req.credentials.as_ref().map(|c| c.access_key.as_str())
1507    }
1508
1509    /// v0.8.17 G-2: shared reserved-name guard used by every per-object
1510    /// API handler. `mode` chooses the AWS error shape: `Mutating`
1511    /// (PUT / Copy / DELETE / Tagging-write) returns
1512    /// `InvalidObjectName`; `Read` (GET / HEAD / Attributes / Tagging-read)
1513    /// returns `NoSuchKey` so a curious client gets the same response
1514    /// the listing filter has been giving them since v0.8.12 (the
1515    /// sidecar is invisible to list).
1516    ///
1517    /// v0.8.17 G-4: when `--allow-legacy-reserved-key-reads` is set
1518    /// AND the call is a `Read`, the guard returns `Ok(())` so
1519    /// operators upgrading from pre-v0.8.15 deployments can still
1520    /// access (and migrate off) any user-owned `<key>.s4index`
1521    /// objects that landed before M-1 / F-13 closed the namespace.
1522    /// Mutating operations stay blocked regardless of the flag —
1523    /// the flag is a read-only migration aid, not an injection
1524    /// re-opener.
1525    fn check_not_reserved_key(&self, key: &str, mode: ReservedKeyMode) -> S3Result<()> {
1526        if !s4_codec::index::is_reserved_sidecar_key(key) {
1527            return Ok(());
1528        }
1529        if matches!(mode, ReservedKeyMode::Read) && self.allow_legacy_reserved_key_reads {
1530            return Ok(());
1531        }
1532        match mode {
1533            ReservedKeyMode::Read => Err(S3Error::with_message(
1534                S3ErrorCode::NoSuchKey,
1535                format!("object key {key:?} is reserved for S4 internal sidecars"),
1536            )),
1537            ReservedKeyMode::Mutating => {
1538                let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
1539                    .unwrap_or(S3ErrorCode::InvalidArgument);
1540                Err(S3Error::with_message(
1541                    code,
1542                    format!(
1543                        "object key {key:?} is reserved (suffix `{}` is used for S4 internal \
1544                         sidecars)",
1545                        s4_codec::index::SIDECAR_SUFFIX,
1546                    ),
1547                ))
1548            }
1549        }
1550    }
1551
1552    /// v0.3 #13: build the per-request policy context from the incoming
1553    /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1554    /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1555    /// production deployments are behind an LB / reverse proxy that sets
1556    /// this), `aws:CurrentTime` from the system clock, and
1557    /// `aws:SecureTransport` from the per-listener TLS flag.
1558    fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1559        let user_agent = req
1560            .headers
1561            .get("user-agent")
1562            .and_then(|v| v.to_str().ok())
1563            .map(str::to_owned);
1564        // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1565        // header. Trusting it unconditionally lets any public-internet
1566        // request claim it came from a trusted CIDR (e.g.
1567        // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1568        // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1569        // We now only consume the header when the operator has
1570        // declared "this gateway sits behind a trusted reverse proxy
1571        // that scrubs client-supplied values" via
1572        // `with_trust_x_forwarded_for(true)` /
1573        // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1574        // `None`, which fails closed for IP-allowlist Allow rules
1575        // and fails open for IP-blocklist Deny rules — operators
1576        // who need either case behind a public listener must opt in
1577        // or move the gate to the reverse proxy. The leftmost
1578        // comma-separated token is the originator per the
1579        // `X-Forwarded-For: client, proxy1, proxy2` convention.
1580        let source_ip = if self.trust_x_forwarded_for {
1581            req.headers
1582                .get("x-forwarded-for")
1583                .and_then(|v| v.to_str().ok())
1584                .and_then(|raw| raw.split(',').next())
1585                .and_then(|s| s.trim().parse().ok())
1586        } else {
1587            None
1588        };
1589        crate::policy::RequestContext {
1590            source_ip,
1591            user_agent,
1592            request_time: Some(std::time::SystemTime::now()),
1593            secure_transport: self.secure_transport,
1594            existing_object_tags: None,
1595            request_object_tags: None,
1596            extra: Default::default(),
1597        }
1598    }
1599
1600    /// Helper used by request handlers to enforce the optional policy.
1601    /// Returns `Ok(())` when allowed (or no policy is configured), or an
1602    /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1603    /// counter on deny.
1604    fn enforce_policy<I>(
1605        &self,
1606        req: &S3Request<I>,
1607        action: &'static str,
1608        bucket: &str,
1609        key: Option<&str>,
1610    ) -> S3Result<()> {
1611        self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1612    }
1613
1614    /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1615    /// caller plumb tag context (existing-on-object + on-request) into
1616    /// the policy evaluator. Both arguments default to `None`, in
1617    /// which case the resulting `RequestContext` is identical to
1618    /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1619    /// with tags this is a transparent no-op.
1620    fn enforce_policy_with_extra<I>(
1621        &self,
1622        req: &S3Request<I>,
1623        action: &'static str,
1624        bucket: &str,
1625        key: Option<&str>,
1626        request_tags: Option<&crate::tagging::TagSet>,
1627        existing_tags: Option<&crate::tagging::TagSet>,
1628    ) -> S3Result<()> {
1629        let Some(policy) = self.policy.as_ref() else {
1630            return Ok(());
1631        };
1632        let principal_id = Self::principal_of(req);
1633        let mut ctx = self.request_context(req);
1634        if let Some(t) = request_tags {
1635            ctx.request_object_tags = Some(t.clone());
1636        }
1637        if let Some(t) = existing_tags {
1638            ctx.existing_object_tags = Some(t.clone());
1639        }
1640        let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1641        if decision.allow {
1642            Ok(())
1643        } else {
1644            crate::metrics::record_policy_denial(action, bucket);
1645            tracing::info!(
1646                action,
1647                bucket,
1648                key = ?key,
1649                principal = ?principal_id,
1650                source_ip = ?ctx.source_ip,
1651                user_agent = ?ctx.user_agent,
1652                secure_transport = ctx.secure_transport,
1653                matched_sid = ?decision.matched_sid,
1654                effect = ?decision.matched_effect,
1655                "S4 policy denied request"
1656            );
1657            Err(S3Error::with_message(
1658                S3ErrorCode::AccessDenied,
1659                format!("denied by S4 policy: {action} on bucket={bucket}"),
1660            ))
1661        }
1662    }
1663
1664    /// テスト用: backend を取り戻す (test helper、production では使わない).
1665    /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1666    /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1667    /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1668    /// (test 用途専用 helper の caller 契約を維持)。
1669    pub fn into_backend(self) -> B {
1670        Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1671            panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1672        })
1673    }
1674
1675    /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1676    /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1677    async fn partial_range_get(
1678        &self,
1679        req: &S3Request<GetObjectInput>,
1680        plan: s4_codec::index::RangePlan,
1681        client_start: u64,
1682        client_end_exclusive: u64,
1683        total_original: u64,
1684        get_start: Instant,
1685    ) -> S3Result<S3Response<GetObjectOutput>> {
1686        // 必要 byte 範囲だけを backend に partial GET
1687        let backend_range = s3s::dto::Range::Int {
1688            first: plan.byte_start,
1689            last: Some(plan.byte_end_exclusive - 1),
1690        };
1691        let backend_input = GetObjectInput {
1692            bucket: req.input.bucket.clone(),
1693            key: req.input.key.clone(),
1694            range: Some(backend_range),
1695            ..Default::default()
1696        };
1697        let backend_req = S3Request {
1698            input: backend_input,
1699            method: req.method.clone(),
1700            uri: req.uri.clone(),
1701            headers: req.headers.clone(),
1702            extensions: http::Extensions::new(),
1703            credentials: req.credentials.clone(),
1704            region: req.region.clone(),
1705            service: req.service.clone(),
1706            trailing_headers: None,
1707        };
1708        let mut backend_resp = self.backend.get_object(backend_req).await?;
1709        let blob = backend_resp.output.body.take().ok_or_else(|| {
1710            S3Error::with_message(
1711                S3ErrorCode::InternalError,
1712                "backend partial GET returned empty body",
1713            )
1714        })?;
1715        let bytes = collect_blob(blob, self.max_body_bytes)
1716            .await
1717            .map_err(internal("collect partial body"))?;
1718
1719        // frame parse + decompress
1720        let mut combined = BytesMut::new();
1721        for frame in FrameIter::new(bytes) {
1722            let (header, payload) = frame.map_err(|e| {
1723                S3Error::with_message(
1724                    S3ErrorCode::InternalError,
1725                    format!("partial-range frame parse: {e}"),
1726                )
1727            })?;
1728            let chunk_manifest = ChunkManifest {
1729                codec: header.codec,
1730                original_size: header.original_size,
1731                compressed_size: header.compressed_size,
1732                crc32c: header.crc32c,
1733            };
1734            let decompressed = self
1735                .registry
1736                .decompress(payload, &chunk_manifest)
1737                .await
1738                .map_err(internal("partial-range decompress"))?;
1739            combined.extend_from_slice(&decompressed);
1740        }
1741        let combined = combined.freeze();
1742        let sliced = combined
1743            .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1744
1745        // response 組立て
1746        let returned_size = sliced.len() as u64;
1747        backend_resp.output.content_length = Some(returned_size as i64);
1748        backend_resp.output.content_range = Some(format!(
1749            "bytes {client_start}-{}/{total_original}",
1750            client_end_exclusive - 1
1751        ));
1752        backend_resp.output.checksum_crc32 = None;
1753        backend_resp.output.checksum_crc32c = None;
1754        backend_resp.output.checksum_crc64nvme = None;
1755        backend_resp.output.checksum_sha1 = None;
1756        backend_resp.output.checksum_sha256 = None;
1757        backend_resp.output.e_tag = None;
1758        backend_resp.output.body = Some(bytes_to_blob(sliced));
1759        backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1760
1761        let elapsed = get_start.elapsed();
1762        crate::metrics::record_get(
1763            "partial",
1764            plan.byte_end_exclusive - plan.byte_start,
1765            returned_size,
1766            elapsed.as_secs_f64(),
1767            true,
1768        );
1769        info!(
1770            op = "get_object",
1771            bucket = %req.input.bucket,
1772            key = %req.input.key,
1773            bytes_in = plan.byte_end_exclusive - plan.byte_start,
1774            bytes_out = returned_size,
1775            total_object_size = total_original,
1776            range = true,
1777            path = "sidecar-partial",
1778            latency_ms = elapsed.as_millis() as u64,
1779            "S4 partial Range GET via sidecar index"
1780        );
1781        Ok(backend_resp)
1782    }
1783
1784    /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
1785    /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
1786    /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
1787    async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
1788        let bytes = encode_index(index);
1789        let len = bytes.len() as i64;
1790        let sidecar = sidecar_key(key);
1791        // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
1792        // the (already legally-arbitrary) S3 key produces something we
1793        // cannot encode at all, drop the sidecar PUT (the GET path
1794        // falls back to a full read on a missing sidecar) instead of
1795        // panicking on `parse().unwrap()`.
1796        let uri = match safe_object_uri(bucket, &sidecar) {
1797            Ok(u) => u,
1798            Err(e) => {
1799                tracing::warn!(
1800                    bucket,
1801                    key,
1802                    "S4 write_sidecar skipped (key not URI-encodable): {e}"
1803                );
1804                return;
1805            }
1806        };
1807        let put_input = PutObjectInput {
1808            bucket: bucket.into(),
1809            key: sidecar,
1810            body: Some(bytes_to_blob(bytes)),
1811            content_length: Some(len),
1812            content_type: Some("application/x-s4-index".into()),
1813            ..Default::default()
1814        };
1815        let put_req = S3Request {
1816            input: put_input,
1817            method: http::Method::PUT,
1818            uri,
1819            headers: http::HeaderMap::new(),
1820            extensions: http::Extensions::new(),
1821            credentials: None,
1822            region: None,
1823            service: None,
1824            trailing_headers: None,
1825        };
1826        if let Err(e) = self.backend.put_object(put_req).await {
1827            tracing::warn!(
1828                bucket,
1829                key,
1830                "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
1831            );
1832        }
1833    }
1834
1835    /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
1836    /// describes the current backend object before we trust its frame
1837    /// offsets for a partial Range GET. The sidecar carries the source
1838    /// `etag` and `compressed_size` that were observed at PUT time; we
1839    /// HEAD the backend object and compare.
1840    ///
1841    /// Decision matrix:
1842    /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
1843    ///   that wasn't stamped) → return `true` (best-effort, preserves
1844    ///   pre-v0.8.4 behaviour for existing on-disk sidecars).
1845    /// - HEAD fails → return `false` (we can't tell either way; full GET
1846    ///   path will surface the real backend error to the client).
1847    /// - HEAD ETag matches → `true`.
1848    /// - HEAD ETag differs OR HEAD size differs from
1849    ///   `source_compressed_size` → `false` (sidecar stale or attacker-
1850    ///   written; fall back to full GET).
1851    async fn sidecar_version_binding_ok(
1852        &self,
1853        bucket: &str,
1854        key: &str,
1855        index: &FrameIndex,
1856    ) -> bool {
1857        let Some(ref expected_etag) = index.source_etag else {
1858            // Legacy sidecar without the v0.8.4 #73 H-2 binding —
1859            // back-compat: trust it (the partial fetch is the same
1860            // best-effort path that v0.8.3 and earlier shipped).
1861            return true;
1862        };
1863        let head_input = HeadObjectInput {
1864            bucket: bucket.into(),
1865            key: key.into(),
1866            ..Default::default()
1867        };
1868        let uri = match safe_object_uri(bucket, key) {
1869            Ok(u) => u,
1870            Err(_) => return false,
1871        };
1872        let head_req = S3Request {
1873            input: head_input,
1874            method: http::Method::HEAD,
1875            uri,
1876            headers: http::HeaderMap::new(),
1877            extensions: http::Extensions::new(),
1878            credentials: None,
1879            region: None,
1880            service: None,
1881            trailing_headers: None,
1882        };
1883        let head = match self.backend.head_object(head_req).await {
1884            Ok(r) => r.output,
1885            Err(e) => {
1886                tracing::debug!(
1887                    bucket,
1888                    key,
1889                    "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
1890                );
1891                return false;
1892            }
1893        };
1894        // ETag is a strong-vs-weak enum; we compare on the unwrapped string
1895        // form (matches what the PUT path stamped — see below).
1896        let live_etag = head.e_tag.as_ref().map(|t| t.value());
1897        if live_etag != Some(expected_etag.as_str()) {
1898            tracing::debug!(
1899                bucket,
1900                key,
1901                "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
1902                expected_etag,
1903                live_etag,
1904            );
1905            return false;
1906        }
1907        if let Some(expected_size) = index.source_compressed_size
1908            && let Some(live_size) = head.content_length
1909            && live_size as u64 != expected_size
1910        {
1911            tracing::debug!(
1912                bucket,
1913                key,
1914                "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
1915                expected_size,
1916                live_size,
1917            );
1918            return false;
1919        }
1920        true
1921    }
1922
1923    /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
1924    async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
1925        let sidecar = sidecar_key(key);
1926        // v0.7 #49: same encode-or-bail treatment as write_sidecar.
1927        let uri = safe_object_uri(bucket, &sidecar).ok()?;
1928        let get_input = GetObjectInput {
1929            bucket: bucket.into(),
1930            key: sidecar,
1931            ..Default::default()
1932        };
1933        let get_req = S3Request {
1934            input: get_input,
1935            method: http::Method::GET,
1936            uri,
1937            headers: http::HeaderMap::new(),
1938            extensions: http::Extensions::new(),
1939            credentials: None,
1940            region: None,
1941            service: None,
1942            trailing_headers: None,
1943        };
1944        let resp = self.backend.get_object(get_req).await.ok()?;
1945        let blob = resp.output.body?;
1946        let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
1947        decode_index(bytes).ok()
1948    }
1949
1950    /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
1951    ///
1952    /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
1953    /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
1954    /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
1955    async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
1956        let mut out = BytesMut::new();
1957        // v0.8.15 H-h: cap the *aggregate* decoded output. Each
1958        // individual frame is already bounded by
1959        // `validate_decompress_manifest` (default 5 GiB per frame),
1960        // but a forged multi-frame body can declare many frames
1961        // each near the limit — without an object-level ceiling, a
1962        // single GET could pin tens of GiB of plaintext in
1963        // `BytesMut::extend_from_slice`. Use the gateway's
1964        // `max_body_bytes` (same cap that bounds PUT bodies) so a
1965        // GET can never produce more plaintext than a PUT can ever
1966        // legitimately have stored.
1967        let aggregate_cap = self.max_body_bytes;
1968        let mut produced: usize = 0;
1969        for frame in FrameIter::new(bytes) {
1970            let (header, payload) = frame.map_err(|e| {
1971                S3Error::with_message(
1972                    S3ErrorCode::InternalError,
1973                    format!("multipart frame parse: {e}"),
1974                )
1975            })?;
1976            let chunk_manifest = ChunkManifest {
1977                codec: header.codec,
1978                original_size: header.original_size,
1979                compressed_size: header.compressed_size,
1980                crc32c: header.crc32c,
1981            };
1982            // v0.8.15 H-h: pre-flight check on the declared
1983            // `original_size` so a forged manifest claiming a frame
1984            // that would push us past the cap is rejected before we
1985            // start decoding. Defence-in-depth alongside the
1986            // post-decode `produced` check below.
1987            if (produced as u64).saturating_add(header.original_size) > aggregate_cap as u64 {
1988                return Err(S3Error::with_message(
1989                    S3ErrorCode::InternalError,
1990                    format!(
1991                        "multipart aggregate output exceeds cap: would reach \
1992                         {produced_total} bytes after this frame, cap is {aggregate_cap}",
1993                        produced_total = (produced as u64).saturating_add(header.original_size),
1994                    ),
1995                ));
1996            }
1997            let decompressed = self
1998                .registry
1999                .decompress(payload, &chunk_manifest)
2000                .await
2001                .map_err(internal("multipart frame decompress"))?;
2002            produced = produced.saturating_add(decompressed.len());
2003            if produced > aggregate_cap {
2004                return Err(S3Error::with_message(
2005                    S3ErrorCode::InternalError,
2006                    format!(
2007                        "multipart aggregate output exceeded cap: {produced} bytes \
2008                         emitted, cap is {aggregate_cap}"
2009                    ),
2010                ));
2011            }
2012            out.extend_from_slice(&decompressed);
2013        }
2014        Ok(out.freeze())
2015    }
2016}
2017
2018/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
2019/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
2020/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
2021/// reject the other variants for parity with AWS.
2022fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
2023    let rest = s
2024        .strip_prefix("bytes=")
2025        .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
2026    let (a, b) = rest
2027        .split_once('-')
2028        .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
2029    let first: u64 = a
2030        .parse()
2031        .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
2032    let last: u64 = b
2033        .parse()
2034        .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
2035    if last < first {
2036        return Err(format!("CopySourceRange last < first: {s:?}"));
2037    }
2038    Ok(s3s::dto::Range::Int {
2039        first,
2040        last: Some(last),
2041    })
2042}
2043
/// v0.5 #34: build the backend storage key for a (logical key, version-id)
/// pair on an Enabled-versioning bucket.
///
/// The result is `<key>.__s4ver__/<version_id>`. The `.__s4ver__/` marker is
/// used because:
/// - it's not a substring of `.s4index` / `.s4ver` natural keys (no
///   false-positive listing filter collisions)
/// - the directory-style separator keeps S3 console "browse by prefix" UX
///   intact (versions roll up under one virtual folder per object)
/// - it is human-readable in debug logs / `aws s3 ls`
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST filter
/// keys containing `.__s4ver__/` from results so customers don't see internal
/// shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    [key, ".__s4ver__/", version_id].concat()
}
2060
/// Report whether `key` carries the marker substring emitted by
/// [`versioned_shadow_key`]. A plain substring scan; used by both the
/// list_objects filter and the GET passthrough check.
fn is_versioning_shadow_key(key: &str) -> bool {
    const SHADOW_MARKER: &str = ".__s4ver__/";
    key.contains(SHADOW_MARKER)
}
2066
/// v0.6 #42: wall-clock seconds since the UNIX epoch, fed to
/// `mfa::check_mfa` so the TOTP verifier can match the client authenticator
/// app's view of "now".
///
/// Yields `0` on the (impossible-in-practice) clock-before-1970 path so the
/// verifier rejects rather than panicking.
fn current_unix_secs() -> u64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
2078
2079/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
2080///
2081/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
2082///   (S3 spec for MFA Delete: every gating failure surfaces as
2083///   `AccessDenied`, not a separate `MFA*` code).
2084/// - `Malformed` → `400 InvalidRequest` (the request itself is
2085///   syntactically broken, not a permission issue).
2086fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
2087    match e {
2088        crate::mfa::MfaError::Missing => S3Error::with_message(
2089            S3ErrorCode::AccessDenied,
2090            "MFA token required for this operation",
2091        ),
2092        crate::mfa::MfaError::Malformed => {
2093            S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
2094        }
2095        crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
2096            S3ErrorCode::AccessDenied,
2097            "MFA serial does not match configured device",
2098        ),
2099        crate::mfa::MfaError::InvalidCode => {
2100            S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
2101        }
2102    }
2103}
2104
2105fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
2106    metadata
2107        .as_ref()
2108        .and_then(|m| m.get(META_MULTIPART))
2109        .map(|v| v == "true")
2110        .unwrap_or(false)
2111}
2112
/// Metadata key holding the manifest's codec name.
const META_CODEC: &str = "s4-codec";
/// Metadata key holding the manifest's original (uncompressed) size.
const META_ORIGINAL_SIZE: &str = "s4-original-size";
/// Metadata key holding the manifest's compressed size.
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
/// Metadata key holding the manifest's CRC32C value.
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object that was written by a multipart upload using the per-part
/// frame format. GET looks at this flag to start the frame parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks a single-PUT object that was written in the S4F2 framed
/// format. Legacy v0.1 single-PUT objects are raw compressed bytes (no such
/// flag). GET looks at this flag to route the body through the framed path
/// (= the same FrameIter parse as multipart).
const META_FRAMED: &str = "s4-framed";
2124
2125fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2126    metadata
2127        .as_ref()
2128        .and_then(|m| m.get(META_FRAMED))
2129        .map(|v| v == "true")
2130        .unwrap_or(false)
2131}
2132
2133/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2134fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2135    metadata
2136        .as_ref()
2137        .and_then(|m| m.get("s4-encrypted"))
2138        .map(|v| v == "aes-256-gcm")
2139        .unwrap_or(false)
2140}
2141
2142/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2143/// contract is "all three or none" — partial sets are a 400.
2144///
2145/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2146/// no encryption), `Ok(Some(material))` on validated client key, and
2147/// `Err` for malformed or partial inputs.
2148fn extract_sse_c_material(
2149    algorithm: &Option<String>,
2150    key: &Option<String>,
2151    md5: &Option<String>,
2152) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2153    match (algorithm, key, md5) {
2154        (None, None, None) => Ok(None),
2155        (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2156            .map(Some)
2157            .map_err(sse_c_error_to_s3),
2158        _ => Err(S3Error::with_message(
2159            S3ErrorCode::InvalidRequest,
2160            "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2161        )),
2162    }
2163}
2164
2165/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2166/// Returns the key-id to wrap under, falling back to the gateway default.
2167fn extract_kms_key_id(
2168    sse: &Option<ServerSideEncryption>,
2169    sse_kms_key_id: &Option<String>,
2170    gateway_default: Option<&str>,
2171) -> Option<String> {
2172    let asks_for_kms = sse
2173        .as_ref()
2174        .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2175        .unwrap_or(false);
2176    if !asks_for_kms {
2177        return None;
2178    }
2179    sse_kms_key_id
2180        .clone()
2181        .or_else(|| gateway_default.map(str::to_owned))
2182}
2183
2184/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2185/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2186/// transient KMS outage (503). Other variants are 500 InternalError.
2187fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2188    use crate::kms::KmsError as K;
2189    match e {
2190        K::KeyNotFound { key_id } => S3Error::with_message(
2191            S3ErrorCode::InvalidArgument,
2192            format!("KMS key not found: {key_id}"),
2193        ),
2194        K::BackendUnavailable { message } => S3Error::with_message(
2195            S3ErrorCode::ServiceUnavailable,
2196            format!("KMS backend unavailable: {message}"),
2197        ),
2198        other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2199    }
2200}
2201
2202/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2203/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2204/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2205fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2206    use crate::sse::SseError as E;
2207    match e {
2208        E::WrongCustomerKey => S3Error::with_message(
2209            S3ErrorCode::AccessDenied,
2210            "SSE-C key does not match the key used at PUT time",
2211        ),
2212        E::InvalidCustomerKey { reason } => {
2213            S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2214        }
2215        E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2216            S3ErrorCode::InvalidArgument,
2217            format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2218        ),
2219        E::CustomerKeyRequired => S3Error::with_message(
2220            S3ErrorCode::InvalidRequest,
2221            "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2222        ),
2223        E::CustomerKeyUnexpected => S3Error::with_message(
2224            S3ErrorCode::InvalidRequest,
2225            "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2226        ),
2227        other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2228    }
2229}
2230
2231fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2232    let m = metadata.as_ref()?;
2233    let codec = m
2234        .get(META_CODEC)
2235        .and_then(|s| s.parse::<CodecKind>().ok())?;
2236    let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2237    let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2238    let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2239    Some(ChunkManifest {
2240        codec,
2241        original_size,
2242        compressed_size,
2243        crc32c,
2244    })
2245}
2246
2247fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2248    let meta = metadata.get_or_insert_with(Default::default);
2249    meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2250    meta.insert(
2251        META_ORIGINAL_SIZE.into(),
2252        manifest.original_size.to_string(),
2253    );
2254    meta.insert(
2255        META_COMPRESSED_SIZE.into(),
2256        manifest.compressed_size.to_string(),
2257    );
2258    meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2259}
2260
2261fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2262    move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2263}
2264
2265/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2266/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2267/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2268/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2269/// message that includes the original error context.
2270fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2271    use crate::select::SelectError;
2272    match e {
2273        SelectError::Parse(msg) => S3Error::with_message(
2274            S3ErrorCode::InvalidRequest,
2275            format!("SQL parse error: {msg}"),
2276        ),
2277        SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2278            S3ErrorCode::InvalidRequest,
2279            format!("unsupported SQL feature: {msg}"),
2280        ),
2281        SelectError::RowEval(msg) => S3Error::with_message(
2282            S3ErrorCode::InvalidRequest,
2283            format!("SQL row evaluation error: {msg}"),
2284        ),
2285        SelectError::InputFormat(msg) => S3Error::with_message(
2286            S3ErrorCode::InvalidRequest,
2287            format!("{fmt} input format error: {msg}"),
2288        ),
2289    }
2290}
2291
2292/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2293/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2294/// (including missing) is treated as `false`.
2295fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2296    headers
2297        .get("x-amz-bypass-governance-retention")
2298        .and_then(|v| v.to_str().ok())
2299        .map(|s| s.eq_ignore_ascii_case("true"))
2300        .unwrap_or(false)
2301}
2302
2303/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2304/// as an RFC3339 string and re-parsing through `chrono`. The string format
2305/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2306/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2307/// or the value is outside `chrono`'s supported range.
2308fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2309    let mut buf = Vec::new();
2310    ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2311        .ok()?;
2312    let s = std::str::from_utf8(&buf).ok()?;
2313    chrono::DateTime::parse_from_rfc3339(s)
2314        .ok()
2315        .map(|dt| dt.with_timezone(&chrono::Utc))
2316}
2317
2318/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2319/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2320fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2321    // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2322    // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2323    // unexpected happens — we never produce malformed strings, so this
2324    // branch is unreachable in practice.
2325    let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2326    Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2327}
2328
2329/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2330/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2331/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2332/// the field optional but always populates it on response.
2333fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2334    set.iter()
2335        .map(|(k, v)| Tag {
2336            key: Some(k.clone()),
2337            value: Some(v.clone()),
2338        })
2339        .collect()
2340}
2341
2342/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2343/// keys / values become empty strings (mirrors AWS, which rejects
2344/// `<Key/>` with InvalidTag at the parser layer; downstream
2345/// `TagSet::validate` then enforces our size limits).
2346fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2347    let pairs = tags
2348        .iter()
2349        .map(|t| {
2350            (
2351                t.key.clone().unwrap_or_default(),
2352                t.value.clone().unwrap_or_default(),
2353            )
2354        })
2355        .collect();
2356    crate::tagging::TagSet::from_pairs(pairs)
2357}
2358
2359/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2360/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2361/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2362pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2363    if total == 0 {
2364        return Err("cannot range-get zero-length object".into());
2365    }
2366    match range {
2367        s3s::dto::Range::Int { first, last } => {
2368            let start = *first;
2369            let end_inclusive = match last {
2370                Some(l) => (*l).min(total - 1),
2371                None => total - 1,
2372            };
2373            if start > end_inclusive || start >= total {
2374                return Err(format!(
2375                    "range bytes={start}-{:?} out of object size {total}",
2376                    last
2377                ));
2378            }
2379            Ok((start, end_inclusive + 1))
2380        }
2381        s3s::dto::Range::Suffix { length } => {
2382            let len = (*length).min(total);
2383            Ok((total - len, total))
2384        }
2385    }
2386}
2387
2388#[async_trait::async_trait]
2389impl<B: S3> S3 for S4Service<B> {
2390    // === 圧縮を挟む path (PUT) ===
2391    #[tracing::instrument(
2392        name = "s4.put_object",
2393        skip(self, req),
2394        fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2395    )]
2396    async fn put_object(
2397        &self,
2398        mut req: S3Request<PutObjectInput>,
2399    ) -> S3Result<S3Response<PutObjectOutput>> {
2400        let put_start = Instant::now();
2401        let put_bucket = req.input.bucket.clone();
2402        let put_key = req.input.key.clone();
2403        // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
2404        self.check_not_reserved_key(&put_key, ReservedKeyMode::Mutating)?;
2405        let access_preamble = self.access_log_preamble(&req);
2406        self.enforce_rate_limit(&req, &put_bucket)?;
2407        // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2408        // the IAM policy gate sees the request's tags via
2409        // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2410        // resolved from the Tagging manager (when wired) so
2411        // `s3:ExistingObjectTag/<key>` works on overwrite.
2412        let request_tags: Option<crate::tagging::TagSet> = req
2413            .input
2414            .tagging
2415            .as_deref()
2416            .map(crate::tagging::parse_tagging_header)
2417            .transpose()
2418            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2419        let existing_tags: Option<crate::tagging::TagSet> = self
2420            .tagging
2421            .as_ref()
2422            .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2423        self.enforce_policy_with_extra(
2424            &req,
2425            "s3:PutObject",
2426            &put_bucket,
2427            Some(&put_key),
2428            request_tags.as_ref(),
2429            existing_tags.as_ref(),
2430        )?;
2431        // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2432        // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2433        // bucket PUTs are exempt because they materialise a fresh
2434        // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2435        // locked version's bytes are untouched. The check mirrors the
2436        // delete path (Compliance never bypassable, Governance via the
2437        // bypass header, legal hold never).
2438        if let Some(mgr) = self.object_lock.as_ref()
2439            && let Some(state) = mgr.get(&put_bucket, &put_key)
2440        {
2441            let bucket_versioned_enabled = self
2442                .versioning
2443                .as_ref()
2444                .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2445                .unwrap_or(false);
2446            if !bucket_versioned_enabled {
2447                let bypass = parse_bypass_governance_header(&req.headers);
2448                let now = chrono::Utc::now();
2449                if !state.can_delete(now, bypass) {
2450                    crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2451                    return Err(S3Error::with_message(
2452                        S3ErrorCode::AccessDenied,
2453                        "Access Denied because object protected by object lock",
2454                    ));
2455                }
2456            }
2457        }
2458        // v0.5 #30: per-PUT explicit retention / legal hold (S3
2459        // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2460        // `x-amz-object-lock-legal-hold`). Captured before the body
2461        // moves into the backend; persisted into the manager only on
2462        // backend success below.
2463        let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2464            .input
2465            .object_lock_mode
2466            .as_ref()
2467            .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2468        let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2469            .input
2470            .object_lock_retain_until_date
2471            .as_ref()
2472            .and_then(timestamp_to_chrono_utc);
2473        let explicit_legal_hold_on: Option<bool> = req
2474            .input
2475            .object_lock_legal_hold_status
2476            .as_ref()
2477            .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2478        if let Some(blob) = req.input.body.take() {
2479            // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2480            // compress fast path、そうでなければ従来の collect-then-compress。
2481            let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2482                .await
2483                .map_err(internal("peek put sample"))?;
2484            let sample_len = sample.len().min(SAMPLE_BYTES);
2485            // v0.8 #56: pass the request's Content-Length (when present) so
2486            // the sampling dispatcher can promote large objects to a GPU
2487            // codec. Chunked transfers (no Content-Length) keep CPU.
2488            let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2489            let kind = self
2490                .dispatcher
2491                .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2492                .await;
2493
2494            // Passthrough buys nothing from S4F2 wrapping (no compression =
2495            // no per-chunk frame to skip past) and the +28-byte header
2496            // overhead breaks size-sensitive callers that expect a true
2497            // pass-through. So passthrough always uses the legacy raw-blob
2498            // path; only compressing codecs go through the framed path.
2499            //
2500            // v0.8.14 follow-up to #127 MED-B: the previous attempt
2501            // forced the buffered path whenever the client supplied
2502            // any whole-body checksum so `verify_client_body_checksums`
2503            // could run. Modern AWS SDKs auto-add an
2504            // `x-amz-checksum-crc32` trailer by default, which made
2505            // every SDK PUT lose the streaming-framed path and
2506            // therefore lose its sidecar — silent data path
2507            // regression caught by
2508            // `range_get_falls_back_to_full_when_sidecar_etag_stale`
2509            // and `upload_part_copy_propagates_source_version_id`
2510            // on the MinIO E2E job. The streaming PUT path now
2511            // passes through unchanged; client-supplied checksums on
2512            // streaming PUTs are NOT verified (same fail-open as
2513            // pre-v0.8.12). The buffered PUT branch and UploadPart
2514            // do verify, which covers the buffered upload case the
2515            // HIGH-12 audit was scoped to. True streaming verify
2516            // (tee-into-hasher on the chained input) remains the
2517            // tracked follow-up.
2518            let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2519            let (compressed, manifest, is_framed) = if use_framed {
2520                // streaming fast path: input は memory に collect しない
2521                let chained = chain_sample_with_rest(sample, rest_stream);
2522                debug!(
2523                    bucket = ?req.input.bucket,
2524                    key = ?req.input.key,
2525                    codec = kind.as_str(),
2526                    path = "streaming-framed",
2527                    "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2528                );
2529                // v0.4 #16: pick the chunk size based on the request's
2530                // Content-Length when known, falling back to the 4 MiB
2531                // default for chunked transfers.
2532                let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2533                // v0.8.4 #73 M2: pass the request's Content-Length so
2534                // streaming_compress_to_frames can fail-fast on a mid-PUT
2535                // truncation (client disconnect after sending half the
2536                // body). `None` is the chunked-Transfer-Encoding case
2537                // where the upstream genuinely doesn't know the size and
2538                // the backend's framing layer is the only truncation
2539                // signal we have.
2540                let expected_input_size =
2541                    req.input.content_length.and_then(|n| u64::try_from(n).ok());
2542                let (body, manifest) = streaming_compress_to_frames(
2543                    chained,
2544                    Arc::clone(&self.registry),
2545                    kind,
2546                    chunk_size,
2547                    expected_input_size,
2548                )
2549                .await
2550                .map_err(|e| match e {
2551                    s4_codec::CodecError::TruncatedStream { expected, got } => {
2552                        // 400 IncompleteBody: client advertised N bytes
2553                        // but disconnected after `got`. Mirrors AWS S3's
2554                        // canonical error code for the same shape so SDK
2555                        // retries kick in instead of treating the PUT as
2556                        // a successful upload of a half-body.
2557                        S3Error::with_message(
2558                            S3ErrorCode::IncompleteBody,
2559                            format!("PUT body truncated: expected {expected} bytes, got {got}"),
2560                        )
2561                    }
2562                    // v0.8.15 M-4: 400
2563                    // `RequestBodyLengthMismatch` for over-length
2564                    // bodies. AWS S3 returns this when the declared
2565                    // `Content-Length` is smaller than the wire body;
2566                    // S4 used to silently accept the surplus bytes.
2567                    // `IncompleteBody` is the closest typed variant
2568                    // in the s3s enum — we widen the message so the
2569                    // SDK / curl side sees the shape unambiguously.
2570                    s4_codec::CodecError::OverlengthStream { expected, got } => {
2571                        let code = S3ErrorCode::from_bytes(b"RequestBodyLengthMismatch")
2572                            .unwrap_or(S3ErrorCode::IncompleteBody);
2573                        S3Error::with_message(
2574                            code,
2575                            format!(
2576                                "PUT body length mismatch: Content-Length declared {expected} \
2577                                 bytes, body carried at least {got}"
2578                            ),
2579                        )
2580                    }
2581                    other => internal("streaming framed compress")(other),
2582                })?;
2583                (body, manifest, true)
2584            } else {
2585                // GPU codec 等で streaming-aware でないものは bytes-buffered path
2586                // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
2587                let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
2588                    .await
2589                    .map_err(internal("collect put body (buffered path)"))?;
2590                // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
2591                // checksum algorithms against the received body on
2592                // the buffered path. The streaming-framed branch
2593                // above redirects here when ANY checksum header is
2594                // present (#127 MED-B), so this is the single
2595                // checkpoint for client-supplied integrity.
2596                verify_client_body_checksums(
2597                    &bytes,
2598                    req.input.content_md5.as_deref(),
2599                    req.input.checksum_crc32.as_deref(),
2600                    req.input.checksum_crc32c.as_deref(),
2601                    req.input.checksum_sha1.as_deref(),
2602                    req.input.checksum_sha256.as_deref(),
2603                    req.input.checksum_crc64nvme.as_deref(),
2604                )?;
2605                debug!(
2606                    bucket = ?req.input.bucket,
2607                    key = ?req.input.key,
2608                    bytes = bytes.len(),
2609                    codec = kind.as_str(),
2610                    path = "buffered",
2611                    "S4 put_object: compressing (buffered, raw blob)"
2612                );
2613                // v0.8 #55: telemetry-returning compress so we can stamp
2614                // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
2615                // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
2616                // CPU codecs come back with `gpu_seconds = None` and the
2617                // stamp helper short-circuits — no extra cost on CPU path.
2618                let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
2619                stamp_gpu_compress_telemetry(&tel);
2620                let (body, m) = compress_res.map_err(internal("registry compress"))?;
2621                (body, m, false)
2622            };
2623
2624            write_manifest(&mut req.input.metadata, &manifest);
2625            if is_framed {
2626                // v0.2 #4: framed body であることを GET 側に伝える meta flag。
2627                req.input
2628                    .metadata
2629                    .get_or_insert_with(Default::default)
2630                    .insert(META_FRAMED.into(), "true".into());
2631            }
2632            // 重要: content_length を圧縮後サイズで更新する。
2633            // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
2634            // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
2635            req.input.content_length = Some(compressed.len() as i64);
2636            // body を書き換えたので、客側が送ってきた original body 用の
2637            // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
2638            // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
2639            // ChunkManifest.crc32c で担保している。
2640            req.input.checksum_algorithm = None;
2641            req.input.checksum_crc32 = None;
2642            req.input.checksum_crc32c = None;
2643            req.input.checksum_crc64nvme = None;
2644            req.input.checksum_sha1 = None;
2645            req.input.checksum_sha256 = None;
2646            req.input.content_md5 = None;
2647            let original_size = manifest.original_size;
2648            let compressed_size = manifest.compressed_size;
2649            let codec_label = manifest.codec.as_str();
2650            // (sidecar_index is built below, after the SSE-mode
2651            // extraction, so v0.8.12 HIGH-10 can short-circuit the
2652            // build when the on-disk bytes are about to be encrypted.)
2653            // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
2654            // Precedence:
2655            //   - SSE-C headers present → per-request customer key (S4E3)
2656            //   - SSE-KMS key id resolved → per-object DEK from the KMS backend
2657            //   - server-managed keyring configured → active key (S4E2)
2658            //   - none of the above → no encryption (raw compressed body)
2659            // The `s4-encrypted: aes-256-gcm` metadata flag is set in every encrypted mode;
2660            // the on-disk frame magic (S4E1 / S4E2 / S4E3 / …) lets GET pick the decrypt path.
2661            // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
2662            // so the encryption headers are NOT forwarded to the
2663            // backend. S4 owns the encrypt-then-store contract; if we
2664            // leave the headers in place, real S3-compat backends
2665            // (MinIO / AWS) try to apply their own SSE on top and
2666            // either reject (MinIO requires HTTPS for SSE-C) or fail
2667            // (MinIO has no KMS configured). MemoryBackend ignored
2668            // these so mock tests passed.
2669            let sse_c_alg = req.input.sse_customer_algorithm.take();
2670            let sse_c_key = req.input.sse_customer_key.take();
2671            let sse_c_md5 = req.input.sse_customer_key_md5.take();
2672            let sse_header = req.input.server_side_encryption.take();
2673            let sse_kms_key = req.input.ssekms_key_id.take();
2674            let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
2675            // v0.5 #28: SSE-KMS request? Resolves to None unless the
2676            // request asks for `aws:kms` AND a key id is available
2677            // (explicit header or gateway default). When set, we'll
2678            // generate a per-object DEK below.
2679            let kms_key_id = extract_kms_key_id(
2680                &sse_header,
2681                &sse_kms_key,
2682                self.kms_default_key_id.as_deref(),
2683            );
2684            // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
2685            // pre-encrypt `compressed` body, but the bytes the
2686            // backend stores when any SSE mode is active are
2687            // *post-encrypt* (different length, different layout).
2688            // A Range GET on an SSE-encrypted object would slice the
2689            // ciphertext at the stale offsets, hand the wrong bytes
2690            // to the frame parser, and 500. Suppress the sidecar
2691            // entirely when SSE is going to be applied below;
2692            // encrypted-object Range GET falls back to the buffered
2693            // path (decrypt full body → frame parse → slice), trading
2694            // partial-fetch performance for correctness. An
2695            // encryption-aware sidecar format is a follow-up issue.
2696            let will_encrypt =
2697                sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
2698            let sidecar_index = if is_framed && !will_encrypt {
2699                s4_codec::index::build_index_from_body(&compressed).ok()
2700            } else {
2701                None
2702            };
2703            // v0.5 #32: in compliance-strict mode, every PUT must
2704            // declare SSE — either client-supplied (SSE-C), KMS, or by
2705            // virtue of a server-side keyring being configured (which
2706            // applies SSE-S4 to every PUT automatically). Requests that
2707            // would otherwise land as plain compressed bytes are
2708            // rejected with 400 InvalidRequest.
2709            if self.compliance_strict
2710                && sse_c_material.is_none()
2711                && kms_key_id.is_none()
2712                && self.sse_keyring.is_none()
2713                && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
2714            {
2715                return Err(S3Error::with_message(
2716                    S3ErrorCode::InvalidRequest,
2717                    "compliance-mode strict: PUT must include x-amz-server-side-encryption \
2718                     (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
2719                ));
2720            }
2721            // SSE-C and SSE-KMS are mutually exclusive on a single PUT
2722            // (AWS S3 returns 400 InvalidArgument); reject instead of picking one.
2723            if sse_c_material.is_some() && kms_key_id.is_some() {
2724                return Err(S3Error::with_message(
2725                    S3ErrorCode::InvalidArgument,
2726                    "SSE-C and SSE-KMS cannot be used together on the same PUT",
2727                ));
2728            }
2729            // KMS path needs to call generate_dek().await before the
2730            // body_to_send branch; capture the result here.
2731            //
2732            // v0.8.1 #58: the plaintext DEK lives in three places
2733            // during one PUT:
2734            //
2735            //   1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
2736            //      — wiped when the binding `dek` falls out of scope at
2737            //      the end of this `if`-arm.
2738            //   2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
2739            //      — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
2740            //      the outer `kms_wrap` `Option` is dropped at the end
2741            //      of `put_object`.
2742            //   3. AES-GCM internal key state inside the `aes-gcm`
2743            //      crate during `encrypt_with_source` — out of scope
2744            //      for this fix; tracked separately in v0.8.2.
2745            let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
2746                if let Some(ref key_id) = kms_key_id {
2747                    let kms = self.kms.as_ref().ok_or_else(|| {
2748                    S3Error::with_message(
2749                        S3ErrorCode::InvalidRequest,
2750                        "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
2751                    )
2752                })?;
2753                    // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
2754                    // works unchanged via `Deref<Target=Vec<u8>>`.
2755                    let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
2756                    if dek.len() != 32 {
2757                        return Err(S3Error::with_message(
2758                            S3ErrorCode::InternalError,
2759                            format!(
2760                                "KMS backend returned a DEK of {} bytes (expected 32)",
2761                                dek.len()
2762                            ),
2763                        ));
2764                    }
2765                    let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
2766                        zeroize::Zeroizing::new([0u8; 32]);
2767                    dek_arr.copy_from_slice(&dek);
2768                    // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
2769                    // end of this scope, wiping the heap allocation.
2770                    Some((dek_arr, wrapped))
2771                } else {
2772                    None
2773                };
2774            // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
2775            // alongside `s4-encrypted` so HEAD (which doesn't fetch the
2776            // body) can echo the correct `x-amz-server-side-encryption`
2777            // value. Without this, HEAD on an SSE-KMS object would not
2778            // echo `aws:kms` because the frame magic is only available
2779            // on the body (which HEAD doesn't read).
2780            let body_to_send = if let Some(ref m) = sse_c_material {
2781                let meta = req.input.metadata.get_or_insert_with(Default::default);
2782                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2783                meta.insert("s4-sse-type".into(), "AES256".into());
2784                meta.insert(
2785                    "s4-sse-c-key-md5".into(),
2786                    base64::engine::general_purpose::STANDARD.encode(m.key_md5),
2787                );
2788                crate::sse::encrypt_with_source(
2789                    &compressed,
2790                    crate::sse::SseSource::CustomerKey {
2791                        key: &m.key,
2792                        key_md5: &m.key_md5,
2793                    },
2794                )
2795            } else if let Some((ref dek, ref wrapped)) = kms_wrap {
2796                let meta = req.input.metadata.get_or_insert_with(Default::default);
2797                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2798                meta.insert("s4-sse-type".into(), "aws:kms".into());
2799                meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
2800                // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
2801                // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
2802                // `&T` here via `Deref<Target=T>`, so the binding picks
2803                // up the inner array reference without copying. The array
2804                // stays in the `Zeroizing` wrapper that owns it and gets
2805                // wiped when `kms_wrap` drops at the end of `put_object`.
2806                let dek_ref: &[u8; 32] = dek;
2807                crate::sse::encrypt_with_source(
2808                    &compressed,
2809                    crate::sse::SseSource::Kms {
2810                        dek: dek_ref,
2811                        wrapped,
2812                    },
2813                )
2814            } else if let Some(keyring) = self.sse_keyring.as_ref() {
2815                // SSE-S4 is server-driven transparent encryption; the
2816                // client didn't ask for SSE. We stamp `s4-encrypted`
2817                // (internal flag the GET path needs) but deliberately
2818                // do NOT stamp `s4-sse-type` — that lights up the HEAD
2819                // echo of `x-amz-server-side-encryption: AES256`,
2820                // which would falsely advertise AWS-style SSE-S3
2821                // semantics the operator didn't request.
2822                let meta = req.input.metadata.get_or_insert_with(Default::default);
2823                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2824                // v0.8 #52: when `--sse-chunk-size > 0` is configured,
2825                // emit the chunked S4E5 frame so the matching GET can
2826                // stream-decrypt instead of buffering 5 GiB before
2827                // emitting a byte. Falls back to the buffered S4E2
2828                // frame at chunk_size=0 (default) so existing
2829                // deployments are bit-for-bit unchanged.
2830                if self.sse_chunk_size > 0 {
2831                    crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
2832                        .map_err(|e| {
2833                            S3Error::with_message(
2834                                S3ErrorCode::InternalError,
2835                                format!("SSE-S4 chunked encrypt failed: {e}"),
2836                            )
2837                        })?
2838                } else {
2839                    crate::sse::encrypt_v2(&compressed, keyring)
2840                }
2841            } else {
2842                compressed.clone()
2843            };
2844            // v0.6 #40: capture the about-to-be-sent body + metadata so
2845            // the replication dispatcher (run after the source PUT
2846            // succeeds) can hand the same backend bytes to the
2847            // destination bucket. `Bytes` clone is cheap (refcounted).
2848            let replication_body = body_to_send.clone();
2849            let replication_metadata = req.input.metadata.clone();
2850            // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
2851            // makes the body longer than the post-compression bytes
2852            // (header + nonce + tag overhead). The earlier
2853            // content_length stamp at compressed.len() is now stale, so
2854            // re-stamp from the actual bytes about to be sent or the
2855            // backend (real S3 / MinIO) rejects with
2856            // `StreamLengthMismatch`. MemoryBackend never validated
2857            // this, which is why mock-only tests passed.
2858            req.input.content_length = Some(body_to_send.len() as i64);
2859            req.input.body = Some(bytes_to_blob(body_to_send));
2860            // v0.5 #34: pre-allocate a version-id when the bucket is
2861            // Enabled, then redirect the backend storage key to the
2862            // shadow path so older versions survive newer PUTs.
2863            // Suspended / Unversioned buckets keep using the plain
2864            // `<key>` (S3 spec: Suspended overwrites the same backend
2865            // object). Pre-allocation (instead of recording after PUT)
2866            // ensures the shadow key + the response's
2867            // `x-amz-version-id` use the same vid.
2868            let pending_version: Option<crate::versioning::PutOutcome> = self
2869                .versioning
2870                .as_ref()
2871                .map(|mgr| mgr.state(&put_bucket))
2872                .map(|state| match state {
2873                    crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
2874                        version_id: crate::versioning::VersioningManager::new_version_id(),
2875                        versioned_response: true,
2876                    },
2877                    crate::versioning::VersioningState::Suspended
2878                    | crate::versioning::VersioningState::Unversioned => {
2879                        crate::versioning::PutOutcome {
2880                            version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
2881                            versioned_response: false,
2882                        }
2883                    }
2884                });
2885            if let Some(ref pv) = pending_version
2886                && pv.versioned_response
2887            {
2888                req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
2889            }
2890            // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
2891            // `req` is moved into (and consumed by) the backend call below.
2892            // The sidecar's `source_compressed_size` is checked against
2893            // the live HEAD `Content-Length` on Range GET to detect a
2894            // backend-side mutation.
2895            let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2896            let mut backend_resp = self.backend.put_object(req).await;
2897            if let Some(mut idx) = sidecar_index
2898                && let Ok(ref resp) = backend_resp
2899                && idx.entries.len() > 1
2900            {
2901                // 1 chunk しかない (small object) なら sidecar は意味がない (=
2902                // partial fetch しても full body と同じ範囲) ので省略。
2903                // Sidecar は user-visible key で書く (latest version の
2904                // partial fetch path 用)。Old versions の Range GET は今 task
2905                // の scope 外 (full read fallback でも意味的には正しい)。
2906                //
2907                // v0.8.4 #73 H-2: stamp the version-binding fields the
2908                // GET path needs to detect a stale / attacker-written
2909                // sidecar. ETag comes from the backend's PUT response —
2910                // when missing (some backends don't return an ETag)
2911                // `source_etag` is simply left as `None` (no identifier
2912                // is synthesized here); the GET HEAD will
2913                // see the same backend ETag (None vs None) and treat the
2914                // pair as consistent.
2915                let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
2916                idx.source_etag = source_etag;
2917                idx.source_compressed_size = backend_object_size;
2918                self.write_sidecar(&put_bucket, &put_key, &idx).await;
2919            }
2920            // v0.5 #34: commit the new version into the manager only on
2921            // backend success. Use the pre-allocated vid so the response
2922            // header and the chain entry agree.
2923            if let (Some(mgr), Some(pv), Ok(resp)) = (
2924                self.versioning.as_ref(),
2925                pending_version.as_ref(),
2926                backend_resp.as_mut(),
2927            ) {
2928                let etag = resp
2929                    .output
2930                    .e_tag
2931                    .clone()
2932                    .map(ETag::into_value)
2933                    .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
2934                let now = chrono::Utc::now();
2935                mgr.commit_put_with_version(
2936                    &put_bucket,
2937                    &put_key,
2938                    crate::versioning::VersionEntry {
2939                        version_id: pv.version_id.clone(),
2940                        etag,
2941                        size: original_size,
2942                        is_delete_marker: false,
2943                        created_at: now,
2944                    },
2945                );
2946                if pv.versioned_response {
2947                    resp.output.version_id = Some(pv.version_id.clone());
2948                }
2949            }
2950            // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
2951            // so the client knows the server actually applied the
2952            // requested algorithm and which key fingerprint matched.
2953            if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
2954                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
2955                resp.output.sse_customer_key_md5 =
2956                    Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
2957            }
2958            // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
2959            // the backend returned (AWS KMS returns the ARN even when
2960            // the request used an alias).
2961            if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
2962                resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
2963                    ServerSideEncryption::AWS_KMS,
2964                ));
2965                resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
2966            }
2967            // v0.5 #30: persist any per-PUT explicit retention / legal
2968            // hold the client supplied, then auto-apply the bucket
2969            // default (no-op when state is already populated). The
2970            // explicit fields take precedence — the bucket-default
2971            // helper bails out as soon as it sees any retention.
2972            if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
2973                if explicit_lock_mode.is_some()
2974                    || explicit_retain_until.is_some()
2975                    || explicit_legal_hold_on.is_some()
2976                {
2977                    let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
2978                    if let Some(m) = explicit_lock_mode {
2979                        state.mode = Some(m);
2980                    }
2981                    if let Some(u) = explicit_retain_until {
2982                        state.retain_until = Some(u);
2983                    }
2984                    if let Some(lh) = explicit_legal_hold_on {
2985                        state.legal_hold_on = lh;
2986                    }
2987                    mgr.set(&put_bucket, &put_key, state);
2988                }
2989                mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
2990            }
2991            let _ = (original_size, compressed_size); // mute unused warnings
2992            let elapsed = put_start.elapsed();
2993            crate::metrics::record_put(
2994                codec_label,
2995                original_size,
2996                compressed_size,
2997                elapsed.as_secs_f64(),
2998                backend_resp.is_ok(),
2999            );
3000            // v0.4 #20: structured access-log entry (best-effort).
3001            self.record_access(
3002                access_preamble,
3003                "REST.PUT.OBJECT",
3004                &put_bucket,
3005                Some(&put_key),
3006                if backend_resp.is_ok() { 200 } else { 500 },
3007                compressed_size,
3008                original_size,
3009                elapsed.as_millis() as u64,
3010                backend_resp.as_ref().err().map(|e| e.code().as_str()),
3011            )
3012            .await;
3013            info!(
3014                op = "put_object",
3015                bucket = %put_bucket,
3016                key = %put_key,
3017                codec = codec_label,
3018                bytes_in = original_size,
3019                bytes_out = compressed_size,
3020                ratio = format!(
3021                    "{:.3}",
3022                    if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
3023                ),
3024                latency_ms = elapsed.as_millis() as u64,
3025                ok = backend_resp.is_ok(),
3026                "S4 put completed"
3027            );
3028            // v0.6 #35: fire bucket-notification destinations (best-effort,
3029            // detached). Skipped when no manager is attached or when the
3030            // bucket has no rule matching `s3:ObjectCreated:Put` for this
3031            // key.
3032            if backend_resp.is_ok()
3033                && let Some(mgr) = self.notifications.as_ref()
3034            {
3035                let dests = mgr.match_destinations(
3036                    &put_bucket,
3037                    &crate::notifications::EventType::ObjectCreatedPut,
3038                    &put_key,
3039                );
3040                if !dests.is_empty() {
3041                    let etag = backend_resp
3042                        .as_ref()
3043                        .ok()
3044                        .and_then(|r| r.output.e_tag.clone())
3045                        .map(ETag::into_value);
3046                    let version_id = pending_version
3047                        .as_ref()
3048                        .filter(|pv| pv.versioned_response)
3049                        .map(|pv| pv.version_id.clone());
3050                    tokio::spawn(crate::notifications::dispatch_event(
3051                        Arc::clone(mgr),
3052                        put_bucket.clone(),
3053                        put_key.clone(),
3054                        crate::notifications::EventType::ObjectCreatedPut,
3055                        Some(original_size),
3056                        etag,
3057                        version_id,
3058                        format!("S4-{}", uuid::Uuid::new_v4()),
3059                    ));
3060                }
3061            }
3062            // v0.6 #39: persist parsed `x-amz-tagging` tags into the
3063            // tagging manager on a successful PUT. AWS PutObject's
3064            // tagging is a full-replace operation (not a merge), so
3065            // any pre-existing entry for `(bucket, key)` is overwritten.
3066            if backend_resp.is_ok()
3067                && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3068            {
3069                mgr.put_object_tags(&put_bucket, &put_key, tags);
3070            }
3071            // v0.6 #40: cross-bucket replication fire-point. On
3072            // successful source PUT, consult the replication manager;
3073            // when an enabled rule matches, mark the source key
3074            // `Pending` and spawn a detached task that PUTs the same
3075            // backend bytes + metadata to the rule's destination
3076            // bucket. The dispatcher itself records `Completed` /
3077            // `Failed` and bumps the drop counter on retry-budget
3078            // exhaustion.
3079            self.spawn_replication_if_matched(
3080                &put_bucket,
3081                &put_key,
3082                &request_tags,
3083                &replication_body,
3084                &replication_metadata,
3085                backend_resp.is_ok(),
3086                pending_version.as_ref(),
3087            );
3088            return backend_resp;
3089        }
3090        // Body-less PUT (rare: zero-length object). Mirror the body-full
3091        // versioning hooks so list_object_versions / GET-by-version still see
3092        // empty-body objects in the chain.
3093        let pending_version: Option<crate::versioning::PutOutcome> = self
3094            .versioning
3095            .as_ref()
3096            .map(|mgr| mgr.state(&put_bucket))
3097            .map(|state| match state {
3098                crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3099                    version_id: crate::versioning::VersioningManager::new_version_id(),
3100                    versioned_response: true,
3101                },
3102                _ => crate::versioning::PutOutcome {
3103                    version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3104                    versioned_response: false,
3105                },
3106            });
3107        if let Some(ref pv) = pending_version
3108            && pv.versioned_response
3109        {
3110            req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3111        }
3112        let mut backend_resp = self.backend.put_object(req).await;
3113        if let (Some(mgr), Some(pv), Ok(resp)) = (
3114            self.versioning.as_ref(),
3115            pending_version.as_ref(),
3116            backend_resp.as_mut(),
3117        ) {
3118            let etag = resp
3119                .output
3120                .e_tag
3121                .clone()
3122                .map(ETag::into_value)
3123                .unwrap_or_default();
3124            let now = chrono::Utc::now();
3125            mgr.commit_put_with_version(
3126                &put_bucket,
3127                &put_key,
3128                crate::versioning::VersionEntry {
3129                    version_id: pv.version_id.clone(),
3130                    etag,
3131                    size: 0,
3132                    is_delete_marker: false,
3133                    created_at: now,
3134                },
3135            );
3136            if pv.versioned_response {
3137                resp.output.version_id = Some(pv.version_id.clone());
3138            }
3139        }
3140        // v0.5 #30: same explicit-then-default lock-state commit as the
3141        // body-bearing branch above, so a zero-length PUT also picks up
3142        // bucket-default retention.
3143        if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3144            if explicit_lock_mode.is_some()
3145                || explicit_retain_until.is_some()
3146                || explicit_legal_hold_on.is_some()
3147            {
3148                let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3149                if let Some(m) = explicit_lock_mode {
3150                    state.mode = Some(m);
3151                }
3152                if let Some(u) = explicit_retain_until {
3153                    state.retain_until = Some(u);
3154                }
3155                if let Some(lh) = explicit_legal_hold_on {
3156                    state.legal_hold_on = lh;
3157                }
3158                mgr.set(&put_bucket, &put_key, state);
3159            }
3160            mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3161        }
3162        // v0.6 #35: same notification fire-point as the body-bearing PUT
3163        // branch above (zero-length objects still match `ObjectCreated:Put`
3164        // rules per the AWS event taxonomy).
3165        if backend_resp.is_ok()
3166            && let Some(mgr) = self.notifications.as_ref()
3167        {
3168            let dests = mgr.match_destinations(
3169                &put_bucket,
3170                &crate::notifications::EventType::ObjectCreatedPut,
3171                &put_key,
3172            );
3173            if !dests.is_empty() {
3174                let etag = backend_resp
3175                    .as_ref()
3176                    .ok()
3177                    .and_then(|r| r.output.e_tag.clone())
3178                    .map(ETag::into_value);
3179                let version_id = pending_version
3180                    .as_ref()
3181                    .filter(|pv| pv.versioned_response)
3182                    .map(|pv| pv.version_id.clone());
3183                tokio::spawn(crate::notifications::dispatch_event(
3184                    Arc::clone(mgr),
3185                    put_bucket.clone(),
3186                    put_key.clone(),
3187                    crate::notifications::EventType::ObjectCreatedPut,
3188                    Some(0),
3189                    etag,
3190                    version_id,
3191                    format!("S4-{}", uuid::Uuid::new_v4()),
3192                ));
3193            }
3194        }
3195        // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3196        // (zero-length) PUT branch too — same shape as the body-bearing
3197        // branch above.
3198        if backend_resp.is_ok()
3199            && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3200        {
3201            mgr.put_object_tags(&put_bucket, &put_key, tags);
3202        }
3203        // v0.6 #40: cross-bucket replication for the zero-length PUT
3204        // branch — same shape as the body-bearing branch above.
3205        // v0.8.2 #61: pass `pending_version` so a versioned source's
3206        // destination receives the same shadow-key path.
3207        self.spawn_replication_if_matched(
3208            &put_bucket,
3209            &put_key,
3210            &request_tags,
3211            &bytes::Bytes::new(),
3212            &None,
3213            backend_resp.is_ok(),
3214            pending_version.as_ref(),
3215        );
3216        backend_resp
3217    }
3218
3219    // === 圧縮を解く path (GET) ===
3220    #[tracing::instrument(
3221        name = "s4.get_object",
3222        skip(self, req),
3223        fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3224    )]
3225    async fn get_object(
3226        &self,
3227        mut req: S3Request<GetObjectInput>,
3228    ) -> S3Result<S3Response<GetObjectOutput>> {
3229        let get_start = Instant::now();
3230        let get_bucket = req.input.bucket.clone();
3231        let get_key = req.input.key.clone();
3232        // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
3233        self.check_not_reserved_key(&get_key, ReservedKeyMode::Read)?;
3234        self.enforce_rate_limit(&req, &get_bucket)?;
3235        self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3236        // Range request の事前検出 (decompress 後 slice する path に使う)。
3237        let range_request = req.input.range.take();
3238        // v0.5 #27: pull SSE-C material from the input headers before
3239        // the request is moved into the backend. A header parse error
3240        // fails fast (no body fetch). The material is consumed below
3241        // when decrypting an S4E3-framed body; the SSE-C headers on
3242        // `req.input` are cleared so the backend doesn't see them.
3243        let sse_c_alg = req.input.sse_customer_algorithm.take();
3244        let sse_c_key = req.input.sse_customer_key.take();
3245        let sse_c_md5 = req.input.sse_customer_key_md5.take();
3246        let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3247
3248        // v0.5 #34: route the GET through the VersioningManager when
3249        // attached AND the bucket is in a versioning-aware state.
3250        // Resolves which version to fetch (explicit `?versionId=` query
3251        // param vs. chain latest), translates a delete-marker into 404
3252        // NoSuchKey, and rewrites the backend storage key to the shadow
3253        // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3254        // versions. `resolved_version_id` is stamped onto the response
3255        // so clients see a coherent `x-amz-version-id` header.
3256        //
3257        // When the bucket is Unversioned (or no manager attached), the
3258        // chain-resolution step is skipped and the request flows
3259        // through the existing single-key path unchanged.
3260        let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3261            Some(mgr)
3262                if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3263            {
3264                let req_vid = req.input.version_id.take();
3265                let entry = match req_vid.as_deref() {
3266                    Some(vid) => {
3267                        mgr.lookup_version(&get_bucket, &get_key, vid)
3268                            .ok_or_else(|| {
3269                                S3Error::with_message(
3270                                    S3ErrorCode::NoSuchVersion,
3271                                    format!("no such version: {vid}"),
3272                                )
3273                            })?
3274                    }
3275                    None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3276                        S3Error::with_message(
3277                            S3ErrorCode::NoSuchKey,
3278                            format!("no such key: {get_key}"),
3279                        )
3280                    })?,
3281                };
3282                if entry.is_delete_marker {
3283                    // S3 spec: GET without versionId on a
3284                    // delete-marker latest → 404 NoSuchKey + the
3285                    // response carries `x-amz-delete-marker: true`.
3286                    // GET with explicit versionId pointing at a delete
3287                    // marker → 405 MethodNotAllowed; we surface
3288                    // NoSuchKey here for both since s3s collapses them
3289                    // into the same not-found error path.
3290                    return Err(S3Error::with_message(
3291                        S3ErrorCode::NoSuchKey,
3292                        format!("delete marker is the current version of {get_key}"),
3293                    ));
3294                }
3295                if entry.version_id != crate::versioning::NULL_VERSION_ID {
3296                    req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3297                }
3298                Some(entry.version_id)
3299            }
3300            _ => None,
3301        };
3302
3303        // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3304        // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3305        // 必要 frame だけを backend に Range GET し帯域節約する。
3306        //
3307        // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3308        // verify the source object hasn't been overwritten / mutated since
3309        // the sidecar was stamped. The sidecar carries the backend ETag
3310        // captured at PUT time (`source_etag`); a HEAD against the current
3311        // backend object tells us the live ETag. If they disagree we treat
3312        // the sidecar as stale and fall through to the full-GET path —
3313        // returning the wrong frames for a Range request would surface as
3314        // a CRC mismatch deeper in the stack but would also potentially
3315        // disclose unrelated frames if a hostile operator wrote the
3316        // sidecar themselves. Fail-open to "full read" is the safe default.
3317        //
3318        // Legacy v1 sidecars (no `source_etag` populated) keep the old
3319        // best-effort behaviour so existing on-disk indexes don't suddenly
3320        // start missing the partial-fetch path.
3321        if let Some(ref r) = range_request
3322            && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3323            && self
3324                .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3325                .await
3326        {
3327            let total = index.total_original_size();
3328            let (start, end_exclusive) = match resolve_range(r, total) {
3329                Ok(v) => v,
3330                Err(e) => {
3331                    return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3332                }
3333            };
3334            if let Some(plan) = index.lookup_range(start, end_exclusive) {
3335                return self
3336                    .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3337                    .await;
3338            }
3339        }
3340        let mut resp = self.backend.get_object(req).await?;
3341        // v0.5 #34: stamp the resolved version-id so the client sees a
3342        // coherent `x-amz-version-id` header (only for chains owned by
3343        // the manager — Unversioned buckets / no-manager paths never
3344        // set this).
3345        if let Some(ref vid) = resolved_version_id {
3346            resp.output.version_id = Some(vid.clone());
3347        }
3348        let is_multipart = is_multipart_object(&resp.output.metadata);
3349        let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3350        // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3351        // multipart と同じ path に流す。
3352        let needs_frame_parse = is_multipart || is_framed_v2;
3353        let manifest_opt = extract_manifest(&resp.output.metadata);
3354
3355        if !needs_frame_parse && manifest_opt.is_none() {
3356            // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3357            debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3358            return Ok(resp);
3359        }
3360
3361        if let Some(blob) = resp.output.body.take() {
3362            // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3363            // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3364            // before any frame parse / streaming decompress. Encrypted
3365            // bodies are opaque to the codec; this also forces the
3366            // buffered path because AES-GCM needs the full body for tag
3367            // verify. SSE-C uses the per-request customer key, SSE-S4
3368            // falls back to the configured keyring.
3369            let blob = if is_sse_encrypted(&resp.output.metadata) {
3370                let body = collect_blob(blob, self.max_body_bytes)
3371                    .await
3372                    .map_err(internal("collect SSE-encrypted body"))?;
3373                // v0.5 #28: peek the frame magic to route the right
3374                // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3375                // through the KMS backend (async). S4E1/E2/E3 take
3376                // the sync path (keyring or customer key).
3377                //
3378                // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3379                // SSE-S4 frames take the *streaming* path — we hand
3380                // the response body a per-chunk verify-and-emit
3381                // Stream so the client sees chunk 0 plaintext after
3382                // one chunk-worth of AES-GCM verify (vs. waiting
3383                // for the whole body's tag), and the gateway no
3384                // longer needs to materialize the full plaintext
3385                // in memory before responding. SSE-C is out of
3386                // scope for the chunked path (chunked S4E3 is a
3387                // follow-up), so this branch requires the SSE-S4
3388                // keyring to be wired and `get_sse_c_material` to
3389                // be absent — otherwise we surface a clear
3390                // misconfiguration error instead of silently
3391                // falling through to the buffered chunked path.
3392                // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3393                // only correct when the decrypted body IS the user's
3394                // plaintext as-stored. If the object went through the
3395                // codec (compressed) or carries S4F2 frames, returning
3396                // the decrypt stream directly hands the client
3397                // compressed / framed bytes. Restrict the early-return
3398                // to codec=Passthrough + non-framed objects; everything
3399                // else falls through to the buffered path, which
3400                // decrypt-buffers S4E5/S4E6 via
3401                // `decrypt_chunked_buffered_default` and then runs the
3402                // existing decompress pipeline.
3403                let chunked_streaming_safe = !needs_frame_parse
3404                    && manifest_opt
3405                        .as_ref()
3406                        .map(|m| m.codec == CodecKind::Passthrough)
3407                        .unwrap_or(false);
3408                if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3409                    && get_sse_c_material.is_none()
3410                    && chunked_streaming_safe
3411                {
3412                    let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3413                        S3Error::with_message(
3414                            S3ErrorCode::InvalidRequest,
3415                            "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
3416                        )
3417                    })?;
3418                    let body_len = body.len() as u64;
3419                    let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
3420                    // Stream is `'static` (the keyring borrow is
3421                    // consumed up front; the cipher lives inside
3422                    // the stream state — see decrypt_chunked_stream
3423                    // doc), so we can move it straight into a
3424                    // StreamingBlob without lifetime gymnastics.
3425                    use futures::StreamExt;
3426                    let mapped = stream.map(|r| {
3427                        r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
3428                    });
3429                    use s3s::dto::StreamingBlob;
3430                    resp.output.body = Some(StreamingBlob::wrap(mapped));
3431                    // Plaintext content_length is unknown until all
3432                    // chunks have been verified; null it out so the
3433                    // ByteStream wrapper reports `unknown` to the
3434                    // HTTP layer (which then emits chunked transfer-
3435                    // encoding) rather than lying about the size.
3436                    resp.output.content_length = None;
3437                    // The backend's checksums + ETag describe the
3438                    // encrypted body (S4E5/S4E6 wire format), not
3439                    // the plaintext we're about to stream — clear them
3440                    // so the AWS SDK doesn't fail the GET with a
3441                    // ChecksumMismatch on a successful round-trip.
3442                    // Mirrors the streaming-zstd path at L1180-1185.
3443                    resp.output.checksum_crc32 = None;
3444                    resp.output.checksum_crc32c = None;
3445                    resp.output.checksum_crc64nvme = None;
3446                    resp.output.checksum_sha1 = None;
3447                    resp.output.checksum_sha256 = None;
3448                    resp.output.e_tag = None;
3449                    let elapsed = get_start.elapsed();
3450                    crate::metrics::record_get(
3451                        "sse-s4-chunked",
3452                        body_len,
3453                        body_len,
3454                        elapsed.as_secs_f64(),
3455                        true,
3456                    );
3457                    return Ok(resp);
3458                }
3459                let plain = match crate::sse::peek_magic(&body) {
3460                    Some("S4E4") => {
3461                        let kms = self.kms.as_ref().ok_or_else(|| {
3462                            S3Error::with_message(
3463                                S3ErrorCode::InvalidRequest,
3464                                "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3465                            )
3466                        })?;
3467                        let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
3468                        crate::sse::decrypt_with_kms(&body, kms_ref)
3469                            .await
3470                            .map_err(|e| match e {
3471                                crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
3472                                other => S3Error::with_message(
3473                                    S3ErrorCode::InternalError,
3474                                    format!("SSE-KMS decrypt failed: {other}"),
3475                                ),
3476                            })?
3477                    }
3478                    _ => {
3479                        if let Some(ref m) = get_sse_c_material {
3480                            crate::sse::decrypt(
3481                                &body,
3482                                crate::sse::SseSource::CustomerKey {
3483                                    key: &m.key,
3484                                    key_md5: &m.key_md5,
3485                                },
3486                            )
3487                            .map_err(sse_c_error_to_s3)?
3488                        } else {
3489                            let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
3490                                S3Error::with_message(
3491                                    S3ErrorCode::InvalidRequest,
3492                                    "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
3493                                )
3494                            })?;
3495                            crate::sse::decrypt(&body, keyring).map_err(|e| {
3496                                S3Error::with_message(
3497                                    S3ErrorCode::InternalError,
3498                                    format!("SSE-S4 decrypt failed: {e}"),
3499                                )
3500                            })?
3501                        }
3502                    }
3503                };
3504                // v0.5 #28: parse out the on-disk wrapped DEK's key id
3505                // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
3506                if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
3507                    && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
3508                {
3509                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3510                        ServerSideEncryption::AWS_KMS,
3511                    ));
3512                    resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
3513                }
3514                bytes_to_blob(plain)
3515            } else if let Some(ref m) = get_sse_c_material {
3516                // Client sent SSE-C headers for an unencrypted object —
3517                // mirror AWS S3's 400 InvalidRequest.
3518                let _ = m;
3519                return Err(sse_c_error_to_s3(
3520                    crate::sse::SseError::CustomerKeyUnexpected,
3521                ));
3522            } else {
3523                blob
3524            };
3525            // v0.5 #27: SSE-C echo on success — algorithm + key MD5
3526            // tell the client that the supplied key was the one used.
3527            if let Some(ref m) = get_sse_c_material {
3528                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3529                resp.output.sse_customer_key_md5 =
3530                    Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3531            }
3532            // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
3533            // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
3534            // codec が streaming-aware なら body を chunk-by-chunk で decompress して
3535            // 即座に client に流す。
3536            //
3537            // ただし Range request 時は streaming できない (slice するため total bytes
3538            // が必要) → buffered path に fall through。
3539            if range_request.is_none()
3540                && !needs_frame_parse
3541                && let Some(ref m) = manifest_opt
3542                && supports_streaming_decompress(m.codec)
3543                && m.codec == CodecKind::CpuZstd
3544            {
3545                // v0.8.4 #73 H-1: wrap the decompressor output in a
3546                // rolling-CRC32C verifier so a tampered ciphertext (or a
3547                // backend-side corruption that the zstd decoder happens
3548                // to "successfully" decode into wrong bytes) surfaces as
3549                // a streaming error tail at EOF instead of silently
3550                // delivering corrupt plaintext to the client. The wrap
3551                // is a pure pass-through during the body — no extra
3552                // buffering, TTFB unaffected — and the integrity
3553                // decision lands at the last chunk.
3554                let decompressed_blob = cpu_zstd_decompress_stream(blob);
3555                let verified_reader = Crc32cVerifyingReader::new(
3556                    blob_to_async_read(decompressed_blob),
3557                    m.crc32c,
3558                    m.original_size,
3559                );
3560                let verified_blob = async_read_to_blob(verified_reader);
3561                resp.output.content_length = Some(m.original_size as i64);
3562                resp.output.checksum_crc32 = None;
3563                resp.output.checksum_crc32c = None;
3564                resp.output.checksum_crc64nvme = None;
3565                resp.output.checksum_sha1 = None;
3566                resp.output.checksum_sha256 = None;
3567                resp.output.e_tag = None;
3568                resp.output.body = Some(verified_blob);
3569                let elapsed = get_start.elapsed();
3570                crate::metrics::record_get(
3571                    m.codec.as_str(),
3572                    m.compressed_size,
3573                    m.original_size,
3574                    elapsed.as_secs_f64(),
3575                    true,
3576                );
3577                info!(
3578                    op = "get_object",
3579                    bucket = %get_bucket,
3580                    key = %get_key,
3581                    codec = m.codec.as_str(),
3582                    bytes_in = m.compressed_size,
3583                    bytes_out = m.original_size,
3584                    path = "streaming",
3585                    setup_latency_ms = elapsed.as_millis() as u64,
3586                    "S4 get started (streaming)"
3587                );
3588                return Ok(resp);
3589            }
3590            // Passthrough: そのまま流す (Range なしの場合のみ streaming)
3591            if range_request.is_none()
3592                && !needs_frame_parse
3593                && let Some(ref m) = manifest_opt
3594                && m.codec == CodecKind::Passthrough
3595            {
3596                resp.output.content_length = Some(m.original_size as i64);
3597                resp.output.checksum_crc32 = None;
3598                resp.output.checksum_crc32c = None;
3599                resp.output.checksum_crc64nvme = None;
3600                resp.output.checksum_sha1 = None;
3601                resp.output.checksum_sha256 = None;
3602                resp.output.e_tag = None;
3603                resp.output.body = Some(blob);
3604                debug!("S4 get_object: passthrough streaming");
3605                return Ok(resp);
3606            }
3607
3608            // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
3609            let bytes = collect_blob(blob, self.max_body_bytes)
3610                .await
3611                .map_err(internal("collect get body"))?;
3612
3613            let decompressed = if needs_frame_parse {
3614                // multipart objects と framed-v2 single-PUT objects は同じ
3615                // S4F2 frame 列なので decompress_multipart で統一処理
3616                self.decompress_multipart(bytes).await?
3617            } else {
3618                let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
3619                self.registry
3620                    .decompress(bytes, manifest)
3621                    .await
3622                    .map_err(internal("registry decompress"))?
3623            };
3624
3625            // Range request があれば slice。なければ full body を返す。
3626            let total_size = decompressed.len() as u64;
3627            let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
3628                let (start, end) = resolve_range(r, total_size)
3629                    .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
3630                let sliced = decompressed.slice(start as usize..end as usize);
3631                resp.output.content_range = Some(format!(
3632                    "bytes {start}-{}/{total_size}",
3633                    end.saturating_sub(1)
3634                ));
3635                (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
3636            } else {
3637                (decompressed, None)
3638            };
3639            // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
3640            // 圧縮 size のままだと downstream が body を途中で切ってしまう)
3641            resp.output.content_length = Some(final_bytes.len() as i64);
3642            // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
3643            // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
3644            // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
3645            // (manifest 内 / frame 内) で integrity を保証する設計にする。
3646            resp.output.checksum_crc32 = None;
3647            resp.output.checksum_crc32c = None;
3648            resp.output.checksum_crc64nvme = None;
3649            resp.output.checksum_sha1 = None;
3650            resp.output.checksum_sha256 = None;
3651            resp.output.e_tag = None;
3652            let returned_size = final_bytes.len() as u64;
3653            let codec_label = manifest_opt
3654                .as_ref()
3655                .map(|m| m.codec.as_str())
3656                .unwrap_or("multipart");
3657            resp.output.body = Some(bytes_to_blob(final_bytes));
3658            if let Some(status) = status_override {
3659                resp.status = Some(status);
3660            }
3661            let elapsed = get_start.elapsed();
3662            crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
3663            info!(
3664                op = "get_object",
3665                bucket = %get_bucket,
3666                key = %get_key,
3667                codec = codec_label,
3668                bytes_out = returned_size,
3669                total_object_size = total_size,
3670                range = range_request.is_some(),
3671                path = "buffered",
3672                latency_ms = elapsed.as_millis() as u64,
3673                "S4 get completed (buffered)"
3674            );
3675        }
3676        // v0.6 #40: echo the recorded `x-amz-replication-status` so
3677        // consumers can poll progress (PENDING / COMPLETED / FAILED).
3678        if let Some(mgr) = self.replication.as_ref()
3679            && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
3680        {
3681            resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3682                status.as_aws_str().to_owned(),
3683            ));
3684        }
3685        Ok(resp)
3686    }
3687
3688    // === passthrough delegations ===
3689    async fn head_bucket(
3690        &self,
3691        req: S3Request<HeadBucketInput>,
3692    ) -> S3Result<S3Response<HeadBucketOutput>> {
3693        self.backend.head_bucket(req).await
3694    }
3695    async fn list_buckets(
3696        &self,
3697        req: S3Request<ListBucketsInput>,
3698    ) -> S3Result<S3Response<ListBucketsOutput>> {
3699        self.backend.list_buckets(req).await
3700    }
3701    async fn create_bucket(
3702        &self,
3703        req: S3Request<CreateBucketInput>,
3704    ) -> S3Result<S3Response<CreateBucketOutput>> {
3705        self.backend.create_bucket(req).await
3706    }
3707    async fn delete_bucket(
3708        &self,
3709        req: S3Request<DeleteBucketInput>,
3710    ) -> S3Result<S3Response<DeleteBucketOutput>> {
3711        self.backend.delete_bucket(req).await
3712    }
3713    async fn head_object(
3714        &self,
3715        req: S3Request<HeadObjectInput>,
3716    ) -> S3Result<S3Response<HeadObjectOutput>> {
3717        // v0.6 #40: capture bucket/key before req is consumed so the
3718        // replication-status echo can look the entry up.
3719        let head_bucket = req.input.bucket.clone();
3720        let head_key = req.input.key.clone();
3721        // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
3722        self.check_not_reserved_key(&head_key, ReservedKeyMode::Read)?;
3723        let mut resp = self.backend.head_object(req).await?;
3724        if let Some(manifest) = extract_manifest(&resp.output.metadata) {
3725            // 客側には decompress 後の意味のある content_length / checksum を返す。
3726            // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
3727            // (S4 は manifest 内の crc32c で integrity を担保する)。
3728            resp.output.content_length = Some(manifest.original_size as i64);
3729            resp.output.checksum_crc32 = None;
3730            resp.output.checksum_crc32c = None;
3731            resp.output.checksum_crc64nvme = None;
3732            resp.output.checksum_sha1 = None;
3733            resp.output.checksum_sha256 = None;
3734            resp.output.e_tag = None;
3735        }
3736        // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
3737        // / FAILED) so consumers can poll progress without a GET.
3738        if let Some(mgr) = self.replication.as_ref()
3739            && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
3740        {
3741            resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3742                status.as_aws_str().to_owned(),
3743            ));
3744        }
3745        // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
3746        // and pipelines see the same posture they got on PUT. The PUT
3747        // path stamps `s4-sse-type` metadata for exactly this — HEAD
3748        // doesn't fetch the body, so it can't peek frame magic.
3749        if let Some(meta) = resp.output.metadata.as_ref()
3750            && let Some(sse_type) = meta.get("s4-sse-type")
3751        {
3752            {
3753                match sse_type.as_str() {
3754                    "aws:kms" => {
3755                        resp.output.server_side_encryption = Some(
3756                            ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
3757                        );
3758                        if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
3759                            resp.output.ssekms_key_id = Some(key_id.clone());
3760                        }
3761                    }
3762                    _ => {
3763                        resp.output.server_side_encryption = Some(
3764                            ServerSideEncryption::from_static(ServerSideEncryption::AES256),
3765                        );
3766                        if let Some(md5) = meta.get("s4-sse-c-key-md5") {
3767                            resp.output.sse_customer_algorithm =
3768                                Some(crate::sse::SSE_C_ALGORITHM.into());
3769                            resp.output.sse_customer_key_md5 = Some(md5.clone());
3770                        }
3771                    }
3772                }
3773            }
3774        }
3775        Ok(resp)
3776    }
    /// Handles `DeleteObject`, running the request through every gateway
    /// gate before any backend bytes are removed.
    ///
    /// Gates, in order (each one can reject the request):
    /// 1. reserved-key guard, rate limit and the `s3:DeleteObject` policy
    ///    check;
    /// 2. MFA Delete, when an MFA manager is attached and enabled for the
    ///    bucket;
    /// 3. Object Lock, when a lock manager holds state for `(bucket, key)`.
    ///
    /// After the gates, a bucket whose versioning state is anything other
    /// than `Unversioned` is served through the `VersioningManager` and
    /// returns early (specific-version delete, or delete-marker push).
    /// Otherwise the request is forwarded to the backend, after which lock
    /// state, object tags and the index sidecar are cleaned up.
    ///
    /// Every successful path fires a delete notification before returning.
    ///
    /// # Errors
    ///
    /// - whatever the reserved-key / rate-limit / policy helpers return;
    /// - the mapped MFA error when `check_mfa` rejects the token;
    /// - `AccessDenied` when the lock state's `can_delete` refuses;
    /// - the backend error on the unversioned path.
    ///
    /// Backend failures on the versioned paths and on the sidecar delete are
    /// deliberately ignored (best-effort cleanup).
    async fn delete_object(
        &self,
        mut req: S3Request<DeleteObjectInput>,
    ) -> S3Result<S3Response<DeleteObjectOutput>> {
        // Owned copies: `req` is moved into the backend call further down,
        // but bucket / key are still needed afterwards for cleanup.
        let bucket = req.input.bucket.clone();
        let key = req.input.key.clone();
        // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
        // The S4 internal sidecar cleanup path
        // (`write_sidecar` and friends) talks to
        // `self.backend.delete_object(...)` directly, NOT through
        // this trait method, so the guard doesn't break
        // legitimate sidecar cleanup.
        self.check_not_reserved_key(&key, ReservedKeyMode::Mutating)?;
        self.enforce_rate_limit(&req, &bucket)?;
        self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
        // v0.6 #42: MFA Delete enforcement. When the bucket has
        // MFA-Delete = Enabled, every DELETE / DELETE-version /
        // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
        // 6-digit TOTP). Runs *before* the WORM / versioning routers so
        // a missing token is denied for free regardless of which delete
        // path the request would otherwise take.
        if let Some(mgr) = self.mfa_delete.as_ref()
            && mgr.is_enabled(&bucket)
        {
            let header = req.input.mfa.as_deref();
            if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
                crate::metrics::record_mfa_delete_denial(&bucket);
                return Err(mfa_error_to_s3(e));
            }
        }
        // v0.5 #30: refuse the delete while a WORM lock is in effect.
        // Compliance can never be bypassed; Governance can be overridden
        // via `x-amz-bypass-governance-retention: true`; legal hold
        // never. The check happens before the versioning router so a
        // locked object can't be soft-deleted (delete-marker push) on an
        // Enabled bucket either — S3 spec says lock applies to all
        // delete forms.
        if let Some(mgr) = self.object_lock.as_ref()
            && let Some(state) = mgr.get(&bucket, &key)
        {
            let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
            // v0.8.12 HIGH-7 fix: the bypass header alone used to be
            // enough to override Governance retention. AWS spec
            // requires the caller hold `s3:BypassGovernanceRetention`
            // for the target ARN; without that, the header is
            // silently ignored (not an error — it lines up with how
            // AWS' canonical behaviour treats unprivileged callers).
            let bypass_allowed = if bypass_header {
                self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
                    .is_ok()
            } else {
                false
            };
            let now = chrono::Utc::now();
            if !state.can_delete(now, bypass_allowed) {
                crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
                return Err(S3Error::with_message(
                    S3ErrorCode::AccessDenied,
                    "Access Denied because object protected by object lock",
                ));
            }
        }
        // v0.5 #34: route DELETE through the VersioningManager when the
        // bucket is in a versioning-aware state.
        //
        // - Enabled bucket, no version_id → push a delete marker into
        //   the chain. NO backend object is touched (older versions
        //   stay reachable via specific-version GET).
        // - Enabled / Suspended bucket, with version_id → physical
        //   delete. Backend bytes at the shadow key (or `<key>` for
        //   `null`) are removed; chain entry is dropped. If the deleted
        //   entry was a delete marker, no backend bytes exist for it
        //   (record-only).
        // - Suspended bucket, no version_id → push a "null" delete
        //   marker (S3 spec); backend bytes at `<key>` are physically
        //   removed (same as legacy).
        // - Unversioned bucket → fall through to legacy passthrough.
        if let Some(mgr) = self.versioning.as_ref() {
            let state = mgr.state(&bucket);
            if state != crate::versioning::VersioningState::Unversioned {
                // `take()` moves the version id out of the request; both
                // versioned branches below return without forwarding `req`.
                let req_vid = req.input.version_id.take();
                if let Some(vid) = req_vid {
                    // Specific-version DELETE: touch backend bytes only
                    // when the entry was a real version (not a delete
                    // marker, which has no backend bytes).
                    let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
                    let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
                        key.clone()
                    } else {
                        versioned_shadow_key(&key, &vid)
                    };
                    // A missing outcome is treated as "not a real version",
                    // so no backend delete is attempted for it.
                    let was_real_version = outcome
                        .as_ref()
                        .map(|o| !o.is_delete_marker)
                        .unwrap_or(false);
                    if was_real_version {
                        // Best-effort backend cleanup; missing bytes
                        // are not an error (e.g. shadow key already
                        // GC'd).
                        let backend_input = DeleteObjectInput {
                            bucket: bucket.clone(),
                            key: backend_target,
                            ..Default::default()
                        };
                        let backend_req = S3Request {
                            input: backend_input,
                            method: http::Method::DELETE,
                            uri: req.uri.clone(),
                            headers: req.headers.clone(),
                            extensions: http::Extensions::new(),
                            credentials: req.credentials.clone(),
                            region: req.region.clone(),
                            service: req.service.clone(),
                            trailing_headers: None,
                        };
                        let _ = self.backend.delete_object(backend_req).await;
                    }
                    let mut output = DeleteObjectOutput {
                        version_id: Some(vid.clone()),
                        ..Default::default()
                    };
                    if let Some(o) = outcome.as_ref()
                        && o.is_delete_marker
                    {
                        output.delete_marker = Some(true);
                    }
                    // v0.6 #35: specific-version DELETE always counts as
                    // a hard `ObjectRemoved:Delete` event (the chain
                    // entry, marker or not, is gone after this call).
                    self.fire_delete_notification(
                        &bucket,
                        &key,
                        crate::notifications::EventType::ObjectRemovedDelete,
                        Some(vid.clone()),
                    );
                    return Ok(S3Response::new(output));
                }
                // No version_id: record a delete marker (state-aware).
                let outcome = mgr.record_delete(&bucket, &key);
                if state == crate::versioning::VersioningState::Suspended {
                    // Suspended buckets also evict the prior `<key>`
                    // bytes (the previous null version is gone too).
                    let backend_input = DeleteObjectInput {
                        bucket: bucket.clone(),
                        key: key.clone(),
                        ..Default::default()
                    };
                    let backend_req = S3Request {
                        input: backend_input,
                        method: http::Method::DELETE,
                        uri: req.uri.clone(),
                        headers: req.headers.clone(),
                        extensions: http::Extensions::new(),
                        credentials: req.credentials.clone(),
                        region: req.region.clone(),
                        service: req.service.clone(),
                        trailing_headers: None,
                    };
                    // Result intentionally discarded: the marker is already
                    // recorded in the chain.
                    let _ = self.backend.delete_object(backend_req).await;
                }
                let output = DeleteObjectOutput {
                    delete_marker: Some(true),
                    version_id: outcome.version_id.clone(),
                    ..Default::default()
                };
                // v0.6 #35: versioned bucket DELETE without a version-id
                // creates a delete marker — the dedicated AWS event
                // taxonomy entry. Suspended-state buckets also push a
                // (null) marker, so the same event fires there.
                self.fire_delete_notification(
                    &bucket,
                    &key,
                    crate::notifications::EventType::ObjectRemovedDeleteMarker,
                    outcome.version_id,
                );
                return Ok(S3Response::new(output));
            }
        }
        // Legacy / Unversioned path: physical delete on the backend +
        // best-effort sidecar cleanup (mirrors v0.4 behaviour).
        let resp = self.backend.delete_object(req).await?;
        // v0.5 #30: drop any per-object lock state once the delete has
        // succeeded so the freed key can be re-armed by a future PUT
        // under the bucket default. Reaching here implies the lock had
        // already passed `can_delete` above, so this is purely cleanup.
        if let Some(mgr) = self.object_lock.as_ref() {
            mgr.clear(&bucket, &key);
        }
        // v0.6 #39: drop any object-level tag set on physical delete —
        // the freed key starts a fresh tag history if a future PUT
        // re-creates it. (Versioned-delete branches above return early
        // and do NOT touch tags, mirroring AWS where tag state is
        // attached to the logical key, not the version chain.)
        if let Some(mgr) = self.tagging.as_ref() {
            mgr.delete_object_tags(&bucket, &key);
        }
        let sidecar = sidecar_key(&key);
        // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
        // can't be encoded into a request URI — the primary delete
        // already succeeded and a stale sidecar is harmless (Range GET
        // re-validates the underlying object on next read).
        if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
            let sidecar_input = DeleteObjectInput {
                bucket: bucket.clone(),
                key: sidecar,
                ..Default::default()
            };
            // Built from scratch (empty headers, no credentials) rather
            // than cloned from the client request, which has already been
            // moved into the backend call above.
            let sidecar_req = S3Request {
                input: sidecar_input,
                method: http::Method::DELETE,
                uri,
                headers: http::HeaderMap::new(),
                extensions: http::Extensions::new(),
                credentials: None,
                region: None,
                service: None,
                trailing_headers: None,
            };
            let _ = self.backend.delete_object(sidecar_req).await;
        }
        // v0.6 #35: legacy unversioned-bucket hard delete fires the
        // canonical `ObjectRemoved:Delete` event.
        self.fire_delete_notification(
            &bucket,
            &key,
            crate::notifications::EventType::ObjectRemovedDelete,
            None,
        );
        Ok(resp)
    }
4007    async fn delete_objects(
4008        &self,
4009        req: S3Request<DeleteObjectsInput>,
4010    ) -> S3Result<S3Response<DeleteObjectsOutput>> {
4011        // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
4012        // when MFA-Delete is on the bucket, a missing / invalid token
4013        // fails the entire DeleteObjects request, not per-object).
4014        if let Some(mgr) = self.mfa_delete.as_ref()
4015            && mgr.is_enabled(&req.input.bucket)
4016        {
4017            let header = req.input.mfa.as_deref();
4018            if let Err(e) =
4019                crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
4020            {
4021                crate::metrics::record_mfa_delete_denial(&req.input.bucket);
4022                return Err(mfa_error_to_s3(e));
4023            }
4024        }
4025        // v0.8.11 CRIT-3 fix: route every entry through the gated
4026        // per-object `delete_object` path so Object Lock, IAM policy,
4027        // versioning, tagging, sidecar cleanup and notification fan-
4028        // out all fire for batch DELETE. The previous
4029        // `self.backend.delete_objects(req).await` straight-through
4030        // bypassed every gate, so a `legal_hold=on` key listed inside
4031        // a DeleteObjects XML was happily removed.
4032        //
4033        // S3 spec note: DeleteObjects is "best-effort per object" —
4034        // a failure on one key surfaces as an `Errors` entry without
4035        // aborting the rest of the batch. Quiet-mode suppresses the
4036        // `Deleted` list (errors are still reported). We honour both.
4037        let bucket = req.input.bucket.clone();
4038        let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
4039        let mfa_header = req.input.mfa.clone();
4040        let quiet = req.input.delete.quiet.unwrap_or(false);
4041        let mut deleted: Vec<DeletedObject> = Vec::new();
4042        let mut errors: Vec<s3s::dto::Error> = Vec::new();
4043        for ident in req.input.delete.objects.iter() {
4044            let key = ident.key.clone();
4045            let version_id = ident.version_id.clone();
4046            let per_input = DeleteObjectInput {
4047                bucket: bucket.clone(),
4048                key: key.clone(),
4049                version_id: version_id.clone(),
4050                bypass_governance_retention: Some(bypass_governance),
4051                mfa: mfa_header.clone(),
4052                ..Default::default()
4053            };
4054            let per_uri = match safe_object_uri(&bucket, &key) {
4055                Ok(u) => u,
4056                Err(_) => {
4057                    errors.push(s3s::dto::Error {
4058                        code: Some("InvalidArgument".to_owned()),
4059                        key: Some(key),
4060                        message: Some("object key is not URI-encodable".to_owned()),
4061                        version_id,
4062                    });
4063                    continue;
4064                }
4065            };
4066            let per_req = S3Request {
4067                input: per_input,
4068                method: http::Method::DELETE,
4069                uri: per_uri,
4070                headers: req.headers.clone(),
4071                extensions: http::Extensions::new(),
4072                credentials: req.credentials.clone(),
4073                region: req.region.clone(),
4074                service: req.service.clone(),
4075                trailing_headers: None,
4076            };
4077            match self.delete_object(per_req).await {
4078                Ok(resp) => {
4079                    let out = resp.output;
4080                    // DeleteObjectOutput doesn't surface a separate
4081                    // `delete_marker_version_id`; the marker's version
4082                    // id is whatever `version_id` carries (when the
4083                    // versioning manager pushed a delete-marker, that
4084                    // field already holds the marker's vid).
4085                    let vid = out.version_id.clone().or(version_id);
4086                    deleted.push(DeletedObject {
4087                        key: Some(key),
4088                        version_id: vid.clone(),
4089                        delete_marker: out.delete_marker,
4090                        delete_marker_version_id: vid,
4091                    });
4092                }
4093                Err(e) => {
4094                    let code_str = e.code().as_str().to_owned();
4095                    let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
4096                    errors.push(s3s::dto::Error {
4097                        code: Some(code_str),
4098                        key: Some(key),
4099                        message: Some(msg),
4100                        version_id,
4101                    });
4102                }
4103            }
4104        }
4105        let output = DeleteObjectsOutput {
4106            deleted: if quiet || deleted.is_empty() {
4107                None
4108            } else {
4109                Some(deleted)
4110            },
4111            errors: if errors.is_empty() {
4112                None
4113            } else {
4114                Some(errors)
4115            },
4116            ..Default::default()
4117        };
4118        Ok(S3Response::new(output))
4119    }
4120    async fn copy_object(
4121        &self,
4122        mut req: S3Request<CopyObjectInput>,
4123    ) -> S3Result<S3Response<CopyObjectOutput>> {
4124        // copy is conceptually "GetObject src + PutObject dst" — enforce both.
4125        let dst_bucket = req.input.bucket.clone();
4126        let dst_key = req.input.key.clone();
4127        // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
4128        self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
4129        self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
4130        if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4131            // v0.8.17 G-2: source `<key>.s4index` would let
4132            // CopyObject expose the raw sidecar (frame layout +
4133            // source ETag) into a writable destination, bypassing
4134            // the F-13 GET reject. Same guard, Read mode (returns
4135            // NoSuchKey to match listing semantics).
4136            self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
4137            self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
4138        }
4139        // S4-aware copy: source object に s4-* metadata がある場合、それを
4140        // destination に確実に preserve する。
4141        //
4142        // - MetadataDirective::COPY (default): backend が source metadata を
4143        //   そのまま copy するので S4 metadata も自動で渡る。介入不要
4144        // - MetadataDirective::REPLACE: 客が指定した metadata で source を
4145        //   上書き → s4-* metadata が消えると destination は decompress 不能に
4146        //   なる (silent corruption)。S4 が source metadata を HEAD で取得し、
4147        //   s4-* fields を input.metadata に強制 merge する
4148        let needs_merge = req
4149            .input
4150            .metadata_directive
4151            .as_ref()
4152            .map(|d| d.as_str() == MetadataDirective::REPLACE)
4153            .unwrap_or(false);
4154        if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4155            // v0.8.16 F-8: strip the client-supplied `s4-*` keys
4156            // *unconditionally* — the v0.8.15 M-2 fix only ran the
4157            // strip inside the `if let Ok(head) = ...` block, so a
4158            // backend HEAD failure (transient 5xx, NoSuchKey on a
4159            // racing delete) left attacker-injected `s4-*` /
4160            // `S4-*` metadata intact on the destination. Now we
4161            // strip first, then re-populate from the source HEAD
4162            // when available — HEAD failure simply means the
4163            // destination loses the codec markers (correct: a
4164            // CopyObject without the source's codec metadata
4165            // produces an unreadable object, but doesn't allow
4166            // injection).
4167            let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4168            dest_meta.retain(|k, _| !k.to_ascii_lowercase().starts_with("s4-"));
4169            let head_input = HeadObjectInput {
4170                bucket: bucket.to_string(),
4171                key: key.to_string(),
4172                ..Default::default()
4173            };
4174            let head_req = S3Request {
4175                input: head_input,
4176                method: req.method.clone(),
4177                uri: req.uri.clone(),
4178                headers: req.headers.clone(),
4179                extensions: http::Extensions::new(),
4180                credentials: req.credentials.clone(),
4181                region: req.region.clone(),
4182                service: req.service.clone(),
4183                trailing_headers: None,
4184            };
4185            if let Ok(head) = self.backend.head_object(head_req).await
4186                && let Some(src_meta) = head.output.metadata.as_ref()
4187            {
4188                let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4189                for key in [
4190                    META_CODEC,
4191                    META_ORIGINAL_SIZE,
4192                    META_COMPRESSED_SIZE,
4193                    META_CRC32C,
4194                    META_MULTIPART,
4195                    META_FRAMED,
4196                ] {
4197                    if let Some(v) = src_meta.get(key) {
4198                        dest_meta.insert(key.to_string(), v.clone());
4199                    }
4200                }
4201                // SSE markers are equally reserved — propagate any
4202                // source flags so a copy of an encrypted object stays
4203                // marked as encrypted at the destination.
4204                for sse_key in [
4205                    "s4-encrypted",
4206                    "s4-sse-type",
4207                    "s4-sse-c-key-md5",
4208                    "s4-sse-kms-key-id",
4209                ] {
4210                    if let Some(v) = src_meta.get(sse_key) {
4211                        dest_meta.insert(sse_key.to_string(), v.clone());
4212                    }
4213                }
4214                debug!(
4215                    src_bucket = %bucket,
4216                    src_key = %key,
4217                    "S4 copy_object: replaced client s4-* metadata with source values across REPLACE directive (v0.8.15 M-2)"
4218                );
4219            }
4220        }
4221        self.backend.copy_object(req).await
4222    }
4223    async fn list_objects(
4224        &self,
4225        req: S3Request<ListObjectsInput>,
4226    ) -> S3Result<S3Response<ListObjectsOutput>> {
4227        self.enforce_rate_limit(&req, &req.input.bucket)?;
4228        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4229        let mut resp = self.backend.list_objects(req).await?;
4230        // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4231        // — v0.5 #34) を顧客から隠す。
4232        if let Some(contents) = resp.output.contents.as_mut() {
4233            contents.retain(|o| {
4234                o.key
4235                    .as_ref()
4236                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4237                    .unwrap_or(true)
4238            });
4239        }
4240        Ok(resp)
4241    }
4242    async fn list_objects_v2(
4243        &self,
4244        req: S3Request<ListObjectsV2Input>,
4245    ) -> S3Result<S3Response<ListObjectsV2Output>> {
4246        self.enforce_rate_limit(&req, &req.input.bucket)?;
4247        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4248        let mut resp = self.backend.list_objects_v2(req).await?;
4249        if let Some(contents) = resp.output.contents.as_mut() {
4250            let before = contents.len();
4251            contents.retain(|o| {
4252                o.key
4253                    .as_ref()
4254                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4255                    .unwrap_or(true)
4256            });
4257            // key_count も補正 (S3 spec compliance)
4258            if let Some(kc) = resp.output.key_count.as_mut() {
4259                *kc -= (before - contents.len()) as i32;
4260            }
4261        }
4262        Ok(resp)
4263    }
4264    /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4265    /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4266    /// attached AND the bucket is in a versioning-aware state, build
4267    /// the `Versions` / `DeleteMarkers` arrays directly from the
4268    /// in-memory chain (paginated + ordered the S3 way: key asc,
4269    /// version newest-first inside each key). Otherwise fall back to
4270    /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4271    async fn list_object_versions(
4272        &self,
4273        req: S3Request<ListObjectVersionsInput>,
4274    ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4275        self.enforce_rate_limit(&req, &req.input.bucket)?;
4276        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4277        // v0.5 #34: VersioningManager-owned path.
4278        if let Some(mgr) = self.versioning.as_ref()
4279            && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4280        {
4281            let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4282            let page = mgr.list_versions(
4283                &req.input.bucket,
4284                req.input.prefix.as_deref(),
4285                req.input.key_marker.as_deref(),
4286                req.input.version_id_marker.as_deref(),
4287                max_keys,
4288            );
4289            let versions: Vec<ObjectVersion> = page
4290                .versions
4291                .into_iter()
4292                .map(|e| ObjectVersion {
4293                    key: Some(e.key),
4294                    version_id: Some(e.version_id),
4295                    is_latest: Some(e.is_latest),
4296                    e_tag: Some(ETag::Strong(e.etag)),
4297                    size: Some(e.size as i64),
4298                    last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4299                    ..Default::default()
4300                })
4301                .collect();
4302            let delete_markers: Vec<DeleteMarkerEntry> = page
4303                .delete_markers
4304                .into_iter()
4305                .map(|e| DeleteMarkerEntry {
4306                    key: Some(e.key),
4307                    version_id: Some(e.version_id),
4308                    is_latest: Some(e.is_latest),
4309                    last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4310                    ..Default::default()
4311                })
4312                .collect();
4313            let output = ListObjectVersionsOutput {
4314                name: Some(req.input.bucket.clone()),
4315                prefix: req.input.prefix.clone(),
4316                key_marker: req.input.key_marker.clone(),
4317                version_id_marker: req.input.version_id_marker.clone(),
4318                max_keys: req.input.max_keys,
4319                versions: if versions.is_empty() {
4320                    None
4321                } else {
4322                    Some(versions)
4323                },
4324                delete_markers: if delete_markers.is_empty() {
4325                    None
4326                } else {
4327                    Some(delete_markers)
4328                },
4329                is_truncated: Some(page.is_truncated),
4330                next_key_marker: page.next_key_marker,
4331                next_version_id_marker: page.next_version_id_marker,
4332                ..Default::default()
4333            };
4334            return Ok(S3Response::new(output));
4335        }
4336        // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4337        let mut resp = self.backend.list_object_versions(req).await?;
4338        if let Some(versions) = resp.output.versions.as_mut() {
4339            versions.retain(|v| {
4340                v.key
4341                    .as_ref()
4342                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4343                    .unwrap_or(true)
4344            });
4345        }
4346        if let Some(markers) = resp.output.delete_markers.as_mut() {
4347            markers.retain(|m| {
4348                m.key
4349                    .as_ref()
4350                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4351                    .unwrap_or(true)
4352            });
4353        }
4354        Ok(resp)
4355    }
4356
4357    async fn create_multipart_upload(
4358        &self,
4359        mut req: S3Request<CreateMultipartUploadInput>,
4360    ) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
4361        // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
4362        // the destination is conceptually about to host a new object,
4363        // matching what `put_object` enforces L2078. Without this, a
4364        // bucket policy denying `s3:PutObject` was bypassable simply
4365        // by switching the client to the multipart wire path.
4366        let mp_bucket = req.input.bucket.clone();
4367        let mp_key = req.input.key.clone();
4368        // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
4369        self.check_not_reserved_key(&mp_key, ReservedKeyMode::Mutating)?;
4370        self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
4371        self.enforce_rate_limit(&req, &mp_bucket)?;
4372        // Multipart object は per-part 圧縮 + frame 形式で書く。GET 時に
4373        // frame parse を起動するため、object metadata に flag を立てる。
4374        // codec は dispatcher の default kind を採用 (per-part 別 codec は Phase 2)。
4375        let codec_kind = self.registry.default_kind();
4376        let meta = req.input.metadata.get_or_insert_with(Default::default);
4377        meta.insert(META_MULTIPART.into(), "true".into());
4378        meta.insert(META_CODEC.into(), codec_kind.as_str().into());
4379        // v0.8 #54 BUG-10 fix: take() the SSE request fields off
4380        // `req.input` so they are NOT forwarded to the backend on
4381        // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
4382        // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
4383        // SSE-KMS with "KMS not configured" when the headers reach it.
4384        // S4 owns the encrypt-then-store contract; we capture the
4385        // recipe in `multipart_state` here and apply it on Complete.
4386        let sse_c_alg = req.input.sse_customer_algorithm.take();
4387        let sse_c_key = req.input.sse_customer_key.take();
4388        let sse_c_md5 = req.input.sse_customer_key_md5.take();
4389        let sse_header = req.input.server_side_encryption.take();
4390        let sse_kms_key = req.input.ssekms_key_id.take();
4391        // Strip the encryption-context too — leaving it would make
4392        // MinIO try to validate it against a non-existent KMS key.
4393        let _ = req.input.ssekms_encryption_context.take();
4394        let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
4395        let kms_key_id = extract_kms_key_id(
4396            &sse_header,
4397            &sse_kms_key,
4398            self.kms_default_key_id.as_deref(),
4399        );
4400        // SSE-C / SSE-KMS exclusivity (mirrors put_object L1870).
4401        if sse_c_material.is_some() && kms_key_id.is_some() {
4402            return Err(S3Error::with_message(
4403                S3ErrorCode::InvalidArgument,
4404                "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
4405            ));
4406        }
4407        let sse_mode = if let Some(ref m) = sse_c_material {
4408            // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
4409            // 32-byte key in `Zeroizing` so abandoned uploads (or
4410            // normal Complete/Abort) wipe the key bytes on drop. The
4411            // `key_md5` is the public fingerprint and stays as a
4412            // bare `[u8; 16]`.
4413            crate::multipart_state::MultipartSseMode::SseC {
4414                key: zeroize::Zeroizing::new(m.key),
4415                key_md5: m.key_md5,
4416            }
4417        } else if let Some(ref kid) = kms_key_id {
4418            // KMS pre-flight: fail at Create rather than at Complete if
4419            // the gateway has no KMS backend wired (mirrors the
4420            // put_object L1879 check).
4421            if self.kms.is_none() {
4422                return Err(S3Error::with_message(
4423                    S3ErrorCode::InvalidRequest,
4424                    "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4425                ));
4426            }
4427            crate::multipart_state::MultipartSseMode::SseKms {
4428                key_id: kid.clone(),
4429            }
4430        } else if self.sse_keyring.is_some() {
4431            // SSE-S4: server-driven transparent encryption. Activates
4432            // whenever the gateway has a keyring configured AND the
4433            // client didn't pick a different SSE mode.
4434            crate::multipart_state::MultipartSseMode::SseS4
4435        } else {
4436            crate::multipart_state::MultipartSseMode::None
4437        };
4438        // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
4439        // single-PUT path does this on PutObject; the multipart path
4440        // captures it now and commits via TagManager on Complete.
4441        let request_tags: Option<crate::tagging::TagSet> = req
4442            .input
4443            .tagging
4444            .as_deref()
4445            .map(crate::tagging::parse_tagging_header)
4446            .transpose()
4447            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
4448        // Strip the `Tagging` field off the input so the backend
4449        // doesn't try to apply it (no-op on MinIO but keeps the wire
4450        // clean).
4451        let _ = req.input.tagging.take();
4452        // Object Lock recipe (BUG-7 — captured here, applied on Complete).
4453        let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
4454            .input
4455            .object_lock_mode
4456            .as_ref()
4457            .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
4458        let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
4459            .input
4460            .object_lock_retain_until_date
4461            .as_ref()
4462            .and_then(timestamp_to_chrono_utc);
4463        let explicit_legal_hold_on: bool = req
4464            .input
4465            .object_lock_legal_hold_status
4466            .as_ref()
4467            .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
4468            .unwrap_or(false);
4469        let bucket = req.input.bucket.clone();
4470        let key = req.input.key.clone();
4471        debug!(
4472            bucket = %bucket,
4473            key = %key,
4474            codec = codec_kind.as_str(),
4475            sse = ?sse_mode,
4476            "S4 create_multipart_upload: marking object for per-part compression"
4477        );
4478        let mut resp = self.backend.create_multipart_upload(req).await?;
4479        // Stash the per-upload context only after the backend handed
4480        // us an upload_id (failed Creates leave nothing in the store).
4481        if let Some(upload_id) = resp.output.upload_id.as_ref() {
4482            self.multipart_state.put(
4483                upload_id,
4484                crate::multipart_state::MultipartUploadContext {
4485                    bucket,
4486                    key,
4487                    sse: sse_mode.clone(),
4488                    tags: request_tags,
4489                    object_lock_mode: explicit_lock_mode,
4490                    object_lock_retain_until: explicit_retain_until,
4491                    object_lock_legal_hold: explicit_legal_hold_on,
4492                },
4493            );
4494        }
4495        // SSE-C / SSE-KMS response echo (mirrors put_object L2036-L2050).
4496        match &sse_mode {
4497            crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
4498                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
4499                resp.output.sse_customer_key_md5 =
4500                    Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
4501            }
4502            crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
4503                resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
4504                    ServerSideEncryption::AWS_KMS,
4505                ));
4506                resp.output.ssekms_key_id = Some(key_id.clone());
4507            }
4508            _ => {}
4509        }
4510        Ok(resp)
4511    }
4512
4513    async fn upload_part(
4514        &self,
4515        mut req: S3Request<UploadPartInput>,
4516    ) -> S3Result<S3Response<UploadPartOutput>> {
4517        // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
4518        // `put_object` / `create_multipart_upload`. Even though
4519        // Create already passed the gate, a bucket policy that
4520        // *revokes* `s3:PutObject` mid-flight should stop further
4521        // parts (e.g. legal hold drops, retention shortened).
4522        let part_bucket = req.input.bucket.clone();
4523        let part_key = req.input.key.clone();
4524        self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
4525        self.enforce_rate_limit(&req, &part_bucket)?;
4526        // 各 part を圧縮して frame header 付きで forward。GET 時に
4527        // `decompress_multipart` が frame iter で順に解凍する。
4528        // **per-part codec dispatch**: dispatcher が body 先頭 sample から
4529        // codec を選ぶので、parquet 風の mixed-content multipart で part ごとに
4530        // 最適 codec を使える (整数列 part → Bitcomp、text 列 part → zstd 等)。
4531        //
4532        // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
4533        // context captured by `create_multipart_upload` and (a) strip
4534        // any SSE-C request headers off `req.input` so the backend
4535        // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
4536        // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
4537        // observe that an upload context exists for `upload_id`. The
4538        // actual encrypt happens once at `complete_multipart_upload`
4539        // time on the assembled body (the per-part-encrypt approach
4540        // would require a matching multi-segment decrypt path on GET;
4541        // encrypting the whole assembled body keeps the GET path's
4542        // `is_sse_encrypted` branch in get_object L2429 working
4543        // unchanged).
4544        let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
4545        // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
4546        // The AWS S3 spec requires the same SSE-C key headers on
4547        // every UploadPart and rejects mismatches with 400. Prior to
4548        // #62 we silently stripped the headers (BUG-10 fix) without
4549        // validating them, allowing a client to send part 1 under
4550        // key-A and part 2 under key-B; both got stored, then
4551        // re-encrypted with key-A on Complete — the client thinks
4552        // part 2 is under key-B but a GET with key-B would in fact
4553        // hit the part-1 ciphertext that was actually encrypted with
4554        // key-A. That would either decrypt successfully (silent
4555        // corruption: client lost track of which key encrypts what)
4556        // or fail in a confusing way. Validate the per-part headers
4557        // now and reject with 400 InvalidArgument on mismatch /
4558        // omission / partial supply, matching real-S3 behaviour.
4559        if let Some(ref ctx) = sse_ctx {
4560            if let crate::multipart_state::MultipartSseMode::SseC {
4561                key_md5: ctx_md5, ..
4562            } = &ctx.sse
4563            {
4564                let alg = req.input.sse_customer_algorithm.take();
4565                let key_b64 = req.input.sse_customer_key.take();
4566                let md5_b64 = req.input.sse_customer_key_md5.take();
4567                match (alg, key_b64, md5_b64) {
4568                    (Some(a), Some(k), Some(m)) => {
4569                        // Parse + validate; if the per-part headers
4570                        // are themselves malformed (algorithm not
4571                        // AES256, MD5 mismatch, key not 32 bytes)
4572                        // surface the same 400 the single-PUT path
4573                        // would. Then compare the parsed MD5 to the
4574                        // upload-context's MD5; mismatch is a
4575                        // different-key UploadPart and must reject.
4576                        let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
4577                            .map_err(sse_c_error_to_s3)?;
4578                        if part_material.key_md5 != *ctx_md5 {
4579                            return Err(S3Error::with_message(
4580                                S3ErrorCode::InvalidArgument,
4581                                "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
4582                            ));
4583                        }
4584                        // OK — same key as Create. Headers are
4585                        // already taken off `req.input` so the
4586                        // backend never sees them.
4587                    }
4588                    (None, None, None) => {
4589                        // AWS S3 spec: SSE-C headers MUST be replayed
4590                        // on every UploadPart of an SSE-C multipart.
4591                        // Real-S3 returns 400 InvalidRequest in this
4592                        // case; mirror that.
4593                        return Err(S3Error::with_message(
4594                            S3ErrorCode::InvalidRequest,
4595                            "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
4596                        ));
4597                    }
4598                    _ => {
4599                        // Partial header set (e.g. algorithm + key
4600                        // but no MD5) — same handling as the
4601                        // single-PUT `extract_sse_c_material` helper.
4602                        return Err(S3Error::with_message(
4603                            S3ErrorCode::InvalidRequest,
4604                            "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
4605                        ));
4606                    }
4607                }
4608            } else {
4609                // CreateMultipartUpload was non-SSE-C (None / SseS4 /
4610                // SseKms). A part that arrives carrying SSE-C headers
4611                // is either a confused client or an attempt to
4612                // smuggle SSE-C around the gateway-internal SSE
4613                // recipe. Reject with 400 InvalidRequest rather than
4614                // silently strip — the strip would let the client
4615                // believe the part was encrypted under their key
4616                // when in fact the upload's encryption recipe is
4617                // whatever the Create captured.
4618                if req.input.sse_customer_algorithm.is_some()
4619                    || req.input.sse_customer_key.is_some()
4620                    || req.input.sse_customer_key_md5.is_some()
4621                {
4622                    return Err(S3Error::with_message(
4623                        S3ErrorCode::InvalidRequest,
4624                        "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
4625                    ));
4626                }
4627            }
4628        } else {
4629            // No upload context registered (gateway crashed between
4630            // Create and Part, or pre-#62 abandoned-upload restore).
4631            // We can't check key consistency in this case — strip
4632            // the headers and let the request through unchanged so
4633            // the backend's `NoSuchUpload` reply (or whatever it
4634            // chooses to do) flows back to the client.
4635            let _ = req.input.sse_customer_algorithm.take();
4636            let _ = req.input.sse_customer_key.take();
4637            let _ = req.input.sse_customer_key_md5.take();
4638        }
4639        let _sse_ctx = sse_ctx;
4640        if let Some(blob) = req.input.body.take() {
4641            let bytes = collect_blob(blob, self.max_body_bytes)
4642                .await
4643                .map_err(internal("collect upload_part body"))?;
4644            // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
4645            // checksum algorithms against the received part body.
4646            verify_client_body_checksums(
4647                &bytes,
4648                req.input.content_md5.as_deref(),
4649                req.input.checksum_crc32.as_deref(),
4650                req.input.checksum_crc32c.as_deref(),
4651                req.input.checksum_sha1.as_deref(),
4652                req.input.checksum_sha256.as_deref(),
4653                req.input.checksum_crc64nvme.as_deref(),
4654            )?;
4655            let sample_len = bytes.len().min(SAMPLE_BYTES);
4656            // v0.8 #56: full part body is already in memory here; use its
4657            // length as the size hint so the dispatcher can promote to GPU
4658            // if it's big enough.
4659            let codec_kind = self
4660                .dispatcher
4661                .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
4662                .await;
4663            let original_size = bytes.len() as u64;
4664            // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
4665            let (compress_res, tel) = self
4666                .registry
4667                .compress_with_telemetry(bytes, codec_kind)
4668                .await;
4669            stamp_gpu_compress_telemetry(&tel);
4670            let (compressed, manifest) =
4671                compress_res.map_err(internal("registry compress part"))?;
4672            let header = FrameHeader {
4673                codec: codec_kind,
4674                original_size,
4675                compressed_size: compressed.len() as u64,
4676                crc32c: manifest.crc32c,
4677            };
4678            let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
4679            write_frame(&mut framed, header, &compressed);
4680            // v0.2 #5: heuristic-based padding skip for likely-final parts.
4681            //
4682            // AWS SDK / aws-cli / boto3 always send the final (and only the
4683            // final) part below the configured part_size. So if the raw user
4684            // part is already smaller than S3's 5 MiB multipart minimum, this
4685            // is overwhelmingly likely to be the final part — and the final
4686            // part is exempt from S3's size constraint. Skipping padding here
4687            // saves up to ~5 MiB per object on highly compressible workloads.
4688            //
4689            // If a misbehaving client sends a tiny **non-final** part, S3
4690            // itself rejects with EntityTooSmall at CompleteMultipartUpload —
4691            // identical outcome to a vanilla S3 PUT, just earlier than
4692            // padding-then-complete would catch it.
4693            let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
4694            if !likely_final {
4695                pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
4696            }
4697            let framed_bytes = framed.freeze();
4698            let new_len = framed_bytes.len() as i64;
4699            // 同じ wire 互換問題が multipart にもある (content-length / checksum)
4700            req.input.content_length = Some(new_len);
4701            req.input.checksum_algorithm = None;
4702            req.input.checksum_crc32 = None;
4703            req.input.checksum_crc32c = None;
4704            req.input.checksum_crc64nvme = None;
4705            req.input.checksum_sha1 = None;
4706            req.input.checksum_sha256 = None;
4707            req.input.content_md5 = None;
4708            req.input.body = Some(bytes_to_blob(framed_bytes));
4709            debug!(
4710                part_number = ?req.input.part_number,
4711                upload_id = ?req.input.upload_id,
4712                original_size,
4713                framed_size = new_len,
4714                "S4 upload_part: framed compressed payload"
4715            );
4716        }
4717        self.backend.upload_part(req).await
4718    }
4719    async fn complete_multipart_upload(
4720        &self,
4721        mut req: S3Request<CompleteMultipartUploadInput>,
4722    ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
4723        let bucket = req.input.bucket.clone();
4724        let key = req.input.key.clone();
4725        let upload_id = req.input.upload_id.clone();
4726        // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
4727        // commit point for the multipart-assembled object).
4728        self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
4729        self.enforce_rate_limit(&req, &bucket)?;
4730        // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
4731        // at Complete time. Without this an attacker with PutObject
4732        // permission could `CreateMultipartUpload` against a key
4733        // that's currently under retention / legal hold and silently
4734        // overwrite it on Complete (the single-PUT path runs the
4735        // same check at L2007). Compliance retention is never
4736        // bypassable; Governance only with explicit IAM permission
4737        // (HIGH-7 gate below).
4738        if let Some(mgr) = self.object_lock.as_ref()
4739            && let Some(state) = mgr.get(&bucket, &key)
4740        {
4741            // CompleteMultipartUpload doesn't carry the bypass header
4742            // (the s3s DTO matches AWS' wire schema). A locked key
4743            // therefore cannot be overwritten by Complete regardless
4744            // of caller permission — operators who need to break a
4745            // Governance lock do it via PutObjectRetention before
4746            // calling Complete.
4747            let now = chrono::Utc::now();
4748            if !state.can_delete(now, false) {
4749                crate::metrics::record_policy_denial("s3:PutObject", &bucket);
4750                return Err(S3Error::with_message(
4751                    S3ErrorCode::AccessDenied,
4752                    "Access Denied because target key is protected by object lock",
4753                ));
4754            }
4755        }
4756        // v0.8.1 #59: serialise concurrent Complete invocations on the
4757        // same `(bucket, key)`. The race window the lock closes is the
4758        // GET-assembled-body → encrypt → PUT-encrypted-body triple
4759        // below (BUG-5 fix); without serialisation, two Completes for
4760        // different `upload_id` but the same logical key could each
4761        // read the other's plaintext assembled body and overwrite the
4762        // peer's encrypted result. The guard is held to function exit
4763        // (drop on `Ok` / `Err`), covering version-id mint, object-
4764        // lock apply, tagging persist, and replication enqueue too.
4765        let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
4766        let _completion_guard = completion_lock.lock().await;
4767        // v0.8 #54 — fetch the per-upload context captured on Create.
4768        // `None` means an abandoned / unknown upload_id (gateway
4769        // crashed between Create and Complete, or pre-v0.8 state
4770        // restore); we still let the backend do its thing for
4771        // transparency, but we can't apply any SSE / version / lock /
4772        // tag / replication post-processing because we never captured
4773        // the recipe.
4774        let ctx = self.multipart_state.get(upload_id.as_str());
4775        // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
4776        // — some clients (boto3 / aws-sdk-cpp older versions) replay
4777        // the SSE-C triple on Complete too, and MinIO will choke if
4778        // they reach the backend.
4779        let _ = req.input.sse_customer_algorithm.take();
4780        let _ = req.input.sse_customer_key.take();
4781        let _ = req.input.sse_customer_key_md5.take();
4782        let mut resp = self.backend.complete_multipart_upload(req).await?;
4783        // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
4784        // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
4785        // partial fetch path が利用可能になる (Range request の帯域節約)。
4786        // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
4787        // できれば爆速になるので 1 回の cost は payback される
4788        //
4789        // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
4790        // the SSE encrypt re-PUT + versioning shadow-key rewrite +
4791        // replication source-bytes capture, so we GET once and reuse
4792        // the bytes for every post-processing step.
4793        let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
4794            let get_input = GetObjectInput {
4795                bucket: bucket.clone(),
4796                key: key.clone(),
4797                ..Default::default()
4798            };
4799            let get_req = S3Request {
4800                input: get_input,
4801                method: http::Method::GET,
4802                uri,
4803                headers: http::HeaderMap::new(),
4804                extensions: http::Extensions::new(),
4805                credentials: None,
4806                region: None,
4807                service: None,
4808                trailing_headers: None,
4809            };
4810            match self.backend.get_object(get_req).await {
4811                Ok(get_resp) => match get_resp.output.body {
4812                    Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
4813                    None => None,
4814                },
4815                Err(e) => {
4816                    // v0.8.4 #71 (C-1 audit fix): a silent
4817                    // `Err(_) => None` here is a SSE plaintext
4818                    // leak. The post-processing block below only
4819                    // runs the SSE re-encrypt branch when
4820                    // `assembled_body.is_some()`, so swallowing a
4821                    // backend error skipped the encrypt step and
4822                    // left the multipart object on disk as
4823                    // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
4824                    // configured buckets. Same root-cause family
4825                    // as v0.8 BUG-5; this branch closes the
4826                    // remaining read-side window.
4827                    //
4828                    // We distinguish two cases:
4829                    //  - `NoSuchKey`: the object is genuinely
4830                    //    missing post-Complete. This is rare and
4831                    //    typically races with a concurrent
4832                    //    DeleteObject; there is nothing to re-
4833                    //    encrypt and no SSE markers to honour, so
4834                    //    falling through to the legacy
4835                    //    `assembled_body = None` path is safe.
4836                    //  - everything else (5xx, network, auth,
4837                    //    etc.): we must FAIL the Complete so the
4838                    //    client can retry. Returning Ok with
4839                    //    `assembled_body = None` would silently
4840                    //    skip the SSE re-encrypt and leave the
4841                    //    backend bytes plaintext.
4842                    if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
4843                        tracing::warn!(
4844                            bucket = %bucket,
4845                            key = %key,
4846                            "multipart Complete: backend GET returned NoSuchKey; \
4847                             skipping post-processing (object likely raced with DeleteObject)"
4848                        );
4849                        None
4850                    } else {
4851                        tracing::error!(
4852                            bucket = %bucket,
4853                            key = %key,
4854                            error = %e,
4855                            "multipart Complete: backend GET failed; failing the Complete \
4856                             so the client retries (silent fall-through would skip SSE \
4857                             re-encrypt and store plaintext)"
4858                        );
4859                        return Err(internal("multipart Complete: backend body fetch failed")(e));
4860                    }
4861                }
4862            }
4863        } else {
4864            None
4865        };
4866        // Sidecar build (existing behaviour, gated on assembled body).
4867        //
4868        // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
4869        // going to SSE-encrypt the assembled body before re-PUT (the
4870        // single-PUT path applies the same suppression at L2271).
4871        // Stale offsets into the pre-encrypt body would break Range
4872        // GET on the encrypted on-disk bytes. `ctx.sse != None`
4873        // covers all three SSE modes captured at Create time.
4874        let mp_will_encrypt = ctx
4875            .as_ref()
4876            .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
4877            .unwrap_or(false);
4878        // v0.8.16 F-7: versioned multipart writes the assembled body
4879        // under `versioned_shadow_key(&key, vid)` *after* this
4880        // sidecar block, then deletes the original `<key>`. Stamping
4881        // the sidecar against the to-be-deleted `<key>` (which is
4882        // what H-g did) leaves an orphan `<key>.s4index` whose
4883        // source-ETag binding can never match the live shadow body
4884        // — the Range GET fast-path's stale-sidecar check then
4885        // falls through to a full read on every request, silently
4886        // disabling partial fetch. Skip the sidecar build entirely
4887        // for versioned buckets; a follow-up issue tracks writing
4888        // the sidecar under the shadow key with the shadow's ETag.
4889        let mp_skip_sidecar_for_versioning = self
4890            .versioning
4891            .as_ref()
4892            .map(|mgr| mgr.state(&bucket))
4893            .map(|state| state == crate::versioning::VersioningState::Enabled)
4894            .unwrap_or(false);
4895        if let Some(ref body) = assembled_body
4896            && !mp_will_encrypt
4897            && !mp_skip_sidecar_for_versioning
4898            && let Ok(mut index) = build_index_from_body(body)
4899        {
4900            // v0.8.15 H-g: stamp the source-ETag / source-compressed-size
4901            // binding on the multipart sidecar. The single-PUT path
4902            // does this at L2519-L2521 via the backend's PUT response,
4903            // but Complete returns its own ETag (an opaque manifest
4904            // hash) so we have to HEAD the freshly-completed object
4905            // to pick up what backend actually wrote, then bind the
4906            // sidecar to those values. Without the binding, a
4907            // subsequent backend-side mutation (lifecycle rewrite,
4908            // out-of-band CopyObject) wouldn't trip the staleness
4909            // check on the next Range GET — the GET would happily
4910            // slice the new bytes at the old sidecar offsets, with
4911            // silent data corruption.
4912            if let Ok(uri) = safe_object_uri(&bucket, &key) {
4913                let head_req = S3Request {
4914                    input: HeadObjectInput {
4915                        bucket: bucket.clone(),
4916                        key: key.clone(),
4917                        ..Default::default()
4918                    },
4919                    method: http::Method::HEAD,
4920                    uri,
4921                    headers: http::HeaderMap::new(),
4922                    extensions: http::Extensions::new(),
4923                    credentials: None,
4924                    region: None,
4925                    service: None,
4926                    trailing_headers: None,
4927                };
4928                if let Ok(head) = self.backend.head_object(head_req).await {
4929                    index.source_etag = head.output.e_tag.as_ref().map(|t| t.value().to_string());
4930                    index.source_compressed_size = head
4931                        .output
4932                        .content_length
4933                        .and_then(|n| u64::try_from(n).ok());
4934                }
4935                // HEAD failure is non-fatal — the sidecar still works
4936                // as a v1-style best-effort fast path; the Range GET
4937                // simply falls back to a full read on any consistency
4938                // signal.
4939            }
4940            self.write_sidecar(&bucket, &key, &index).await;
4941        }
4942        // From here on, post-processing depends on the context —
4943        // short-circuit when the upload had no captured recipe
4944        // (legacy / crashed-Create / pre-v0.8 state restore).
4945        if let Some(ctx) = ctx {
4946            // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
4947            // is versioning-Enabled. The single-PUT path does this in
4948            // `put_object` ~L1968; multipart was the missing branch.
4949            // We mint here (post-Complete, before any re-PUT) so the
4950            // same vid threads into both the shadow-key rewrite and
4951            // the VersionEntry the manager records.
4952            let pending_version: Option<crate::versioning::PutOutcome> = self
4953                .versioning
4954                .as_ref()
4955                .map(|mgr| mgr.state(&bucket))
4956                .map(|state| match state {
4957                    crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
4958                        version_id: crate::versioning::VersioningManager::new_version_id(),
4959                        versioned_response: true,
4960                    },
4961                    crate::versioning::VersioningState::Suspended
4962                    | crate::versioning::VersioningState::Unversioned => {
4963                        crate::versioning::PutOutcome {
4964                            version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
4965                            versioned_response: false,
4966                        }
4967                    }
4968                });
4969            // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
4970            // and re-PUT it to the backend so the on-disk bytes are
4971            // SSE-encrypted. The single-PUT path does this body-by-
4972            // body inside `put_object` (L1907-L1942); for multipart,
4973            // encrypt-per-part would require a multi-segment decrypt
4974            // path on GET — we instead do a single encrypt over the
4975            // assembled framed body so the existing GET decrypt
4976            // branch (`is_sse_encrypted` → `decrypt(body, source)` →
4977            // FrameIter) handles it unchanged.
4978            //
4979            // The cost is one extra round-trip per Complete for SSE-
4980            // enabled multipart (already-paid for the sidecar build).
4981            // For single-instance gateways pointing at a co-located
4982            // backend this is negligible; cross-region operators
4983            // would benefit from per-part encrypt + multi-segment
4984            // decrypt as a follow-up.
4985            let needs_re_put = matches!(
4986                ctx.sse,
4987                crate::multipart_state::MultipartSseMode::SseS4
4988                    | crate::multipart_state::MultipartSseMode::SseC { .. }
4989                    | crate::multipart_state::MultipartSseMode::SseKms { .. }
4990            ) || pending_version
4991                .as_ref()
4992                .map(|pv| pv.versioned_response)
4993                .unwrap_or(false);
4994            // v0.8.11 CRIT-2 fix: seed the replication body with the
4995            // pre-encrypt assembled bytes, but overwrite it with the
4996            // post-encrypt `new_body` once the re-PUT branch lands.
4997            // The previous "snapshot in advance" pattern shipped the
4998            // *plaintext* framed body to the destination bucket even
4999            // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
5000            // the destination would then fail to decrypt (or, worse,
5001            // succeed in handing out plaintext that the source had
5002            // promised was encrypted at rest). When `needs_re_put`
5003            // is false (no SSE, no versioning), the backend still
5004            // holds the original plaintext-framed bytes, and the
5005            // seed value is what the destination should receive.
5006            let mut replication_body = assembled_body.clone();
5007            let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
5008            if needs_re_put && let Some(body) = assembled_body {
5009                // v0.8.1 #58: same Zeroizing pattern as put_object's
5010                // single-PUT KMS branch — DEK plaintext lives in
5011                // `Zeroizing<[u8; 32]>` for the lifetime of this
5012                // Complete handler, then is wiped on drop.
5013                let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
5014                    if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
5015                    {
5016                        let kms = self.kms.as_ref().ok_or_else(|| {
5017                        S3Error::with_message(
5018                            S3ErrorCode::InvalidRequest,
5019                            "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
5020                        )
5021                    })?;
5022                        let (dek, wrapped) =
5023                            kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
5024                        if dek.len() != 32 {
5025                            return Err(S3Error::with_message(
5026                                S3ErrorCode::InternalError,
5027                                format!(
5028                                    "KMS backend returned a DEK of {} bytes (expected 32)",
5029                                    dek.len()
5030                                ),
5031                            ));
5032                        }
5033                        let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
5034                            zeroize::Zeroizing::new([0u8; 32]);
5035                        dek_arr.copy_from_slice(&dek);
5036                        // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
5037                        Some((dek_arr, wrapped))
5038                    } else {
5039                        None
5040                    };
5041                // Build the new metadata map: re-fetch via HEAD so
5042                // the multipart / codec markers the backend stamped
5043                // on Create flow through unchanged, then layer the
5044                // SSE markers on top.
5045                let head_req = S3Request {
5046                    input: HeadObjectInput {
5047                        bucket: bucket.clone(),
5048                        key: key.clone(),
5049                        ..Default::default()
5050                    },
5051                    method: http::Method::HEAD,
5052                    uri: safe_object_uri(&bucket, &key)?,
5053                    headers: http::HeaderMap::new(),
5054                    extensions: http::Extensions::new(),
5055                    credentials: None,
5056                    region: None,
5057                    service: None,
5058                    trailing_headers: None,
5059                };
5060                let mut new_metadata: std::collections::HashMap<String, String> =
5061                    match self.backend.head_object(head_req).await {
5062                        Ok(h) => h.output.metadata.unwrap_or_default(),
5063                        Err(_) => std::collections::HashMap::new(),
5064                    };
5065                let new_body = match &ctx.sse {
5066                    crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
5067                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5068                        new_metadata.insert("s4-sse-type".into(), "AES256".into());
5069                        new_metadata.insert(
5070                            "s4-sse-c-key-md5".into(),
5071                            base64::engine::general_purpose::STANDARD.encode(key_md5),
5072                        );
5073                        // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
5074                        // auto-deref through one explicit binding so
5075                        // `SseSource::CustomerKey` gets the `&[u8; 32]`
5076                        // it expects (mirrors the SSE-KMS DEK shape
5077                        // a few lines down).
5078                        let key_ref: &[u8; 32] = key;
5079                        crate::sse::encrypt_with_source(
5080                            &body,
5081                            crate::sse::SseSource::CustomerKey {
5082                                key: key_ref,
5083                                key_md5,
5084                            },
5085                        )
5086                    }
5087                    crate::multipart_state::MultipartSseMode::SseKms { .. } => {
5088                        let (dek, wrapped) = kms_wrap
5089                            .as_ref()
5090                            .expect("SseKms branch implies kms_wrap is Some");
5091                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5092                        new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
5093                        new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
5094                        // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
5095                        // to `&[u8; 32]` (same shape as the put_object
5096                        // single-PUT branch).
5097                        let dek_ref: &[u8; 32] = dek;
5098                        crate::sse::encrypt_with_source(
5099                            &body,
5100                            crate::sse::SseSource::Kms {
5101                                dek: dek_ref,
5102                                wrapped,
5103                            },
5104                        )
5105                    }
5106                    crate::multipart_state::MultipartSseMode::SseS4 => {
5107                        let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
5108                            S3Error::with_message(
5109                                S3ErrorCode::InternalError,
5110                                "SSE-S4 captured at Create but keyring missing at Complete",
5111                            )
5112                        })?;
5113                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5114                        // SSE-S4 deliberately omits `s4-sse-type` so
5115                        // HEAD doesn't falsely advertise AWS-style
5116                        // SSE-S3 (matches the put_object L1929-L1939
5117                        // comment).
5118                        // v0.8 #52: same chunk_size dispatch as the
5119                        // single-PUT branch — multipart Complete
5120                        // re-encrypts the assembled body, so honoring
5121                        // the chunked path here is required to keep
5122                        // GET streaming on multipart-uploaded objects.
5123                        if self.sse_chunk_size > 0 {
5124                            crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
5125                                .map_err(|e| {
5126                                    S3Error::with_message(
5127                                        S3ErrorCode::InternalError,
5128                                        format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
5129                                    )
5130                                })?
5131                        } else {
5132                            crate::sse::encrypt_v2(&body, keyring)
5133                        }
5134                    }
5135                    crate::multipart_state::MultipartSseMode::None => body.clone(),
5136                };
5137                // v0.8 #54 BUG-6 fix: write the re-PUT under the
5138                // shadow key so the version chain doesn't overwrite
5139                // the previous version on a versioned bucket. The
5140                // original (unshadowed) key was assembled by the
5141                // backend on Complete; we delete it after the shadow
5142                // PUT lands.
5143                let put_target_key = if let Some(pv) = pending_version.as_ref() {
5144                    if pv.versioned_response {
5145                        versioned_shadow_key(&key, &pv.version_id)
5146                    } else {
5147                        key.clone()
5148                    }
5149                } else {
5150                    key.clone()
5151                };
5152                let new_body_len = new_body.len() as i64;
5153                let put_req = S3Request {
5154                    input: PutObjectInput {
5155                        bucket: bucket.clone(),
5156                        key: put_target_key.clone(),
5157                        body: Some(bytes_to_blob(new_body.clone())),
5158                        metadata: Some(new_metadata.clone()),
5159                        content_length: Some(new_body_len),
5160                        ..Default::default()
5161                    },
5162                    method: http::Method::PUT,
5163                    uri: safe_object_uri(&bucket, &put_target_key)?,
5164                    headers: http::HeaderMap::new(),
5165                    extensions: http::Extensions::new(),
5166                    credentials: None,
5167                    region: None,
5168                    service: None,
5169                    trailing_headers: None,
5170                };
5171                self.backend.put_object(put_req).await?;
5172                // v0.8.11 CRIT-2 fix: refresh the replication snapshot
5173                // with the bytes that were actually persisted to the
5174                // backend (post-SSE-encrypt for SSE modes; identical to
5175                // `body` for `MultipartSseMode::None` + versioning-only
5176                // re-PUT). The destination then sees the same on-disk
5177                // shape the source does, and a destination GET decrypts
5178                // correctly when SSE is on.
5179                replication_body = Some(new_body.clone());
5180                // If we rewrote the storage key (versioning shadow),
5181                // we must drop the original (unshadowed) Complete-
5182                // assembled bytes so subsequent listings don't see a
5183                // duplicate.
5184                if put_target_key != key {
5185                    let del_req = S3Request {
5186                        input: DeleteObjectInput {
5187                            bucket: bucket.clone(),
5188                            key: key.clone(),
5189                            ..Default::default()
5190                        },
5191                        method: http::Method::DELETE,
5192                        uri: safe_object_uri(&bucket, &key)?,
5193                        headers: http::HeaderMap::new(),
5194                        extensions: http::Extensions::new(),
5195                        credentials: None,
5196                        region: None,
5197                        service: None,
5198                        trailing_headers: None,
5199                    };
5200                    let _ = self.backend.delete_object(del_req).await;
5201                }
5202                applied_metadata = Some(new_metadata);
5203            }
5204            // v0.8 #54 BUG-6 commit: register the new version with
5205            // the VersioningManager so list_object_versions /
5206            // GET ?versionId= see it.
5207            if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
5208                let etag = resp
5209                    .output
5210                    .e_tag
5211                    .clone()
5212                    .map(ETag::into_value)
5213                    .unwrap_or_default();
5214                let now = chrono::Utc::now();
5215                mgr.commit_put_with_version(
5216                    &bucket,
5217                    &key,
5218                    crate::versioning::VersionEntry {
5219                        version_id: pv.version_id.clone(),
5220                        etag,
5221                        size: replication_body
5222                            .as_ref()
5223                            .map(|b| b.len() as u64)
5224                            .unwrap_or(0),
5225                        is_delete_marker: false,
5226                        created_at: now,
5227                    },
5228                );
5229                if pv.versioned_response {
5230                    resp.output.version_id = Some(pv.version_id.clone());
5231                }
5232            }
5233            // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
5234            // recipe + auto-apply the bucket default. Mirrors the
5235            // put_object L2057-L2074 block.
5236            if let Some(mgr) = self.object_lock.as_ref() {
5237                if ctx.object_lock_mode.is_some()
5238                    || ctx.object_lock_retain_until.is_some()
5239                    || ctx.object_lock_legal_hold
5240                {
5241                    let mut state = mgr.get(&bucket, &key).unwrap_or_default();
5242                    if let Some(m) = ctx.object_lock_mode {
5243                        state.mode = Some(m);
5244                    }
5245                    if let Some(u) = ctx.object_lock_retain_until {
5246                        state.retain_until = Some(u);
5247                    }
5248                    if ctx.object_lock_legal_hold {
5249                        state.legal_hold_on = true;
5250                    }
5251                    mgr.set(&bucket, &key, state);
5252                }
5253                mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5254            }
5255            // v0.8 #54 BUG-9 fix: persist the captured tags via the
5256            // TagManager so GetObjectTagging returns them.
5257            if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5258                mgr.put_object_tags(&bucket, &key, tags.clone());
5259            }
5260            // SSE-C / SSE-KMS response echo. The
5261            // CompleteMultipartUploadOutput only exposes
5262            // `server_side_encryption` + `ssekms_key_id` (no
5263            // sse_customer_* — those round-tripped on Create / parts).
5264            match &ctx.sse {
5265                crate::multipart_state::MultipartSseMode::SseC { .. } => {
5266                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5267                        ServerSideEncryption::AES256,
5268                    ));
5269                }
5270                crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5271                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5272                        ServerSideEncryption::AWS_KMS,
5273                    ));
5274                    resp.output.ssekms_key_id = Some(key_id.clone());
5275                }
5276                _ => {}
5277            }
5278            // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5279            // like put_object L2165 does. We hand the dispatcher the
5280            // assembled body bytes (post-encrypt where applicable, so
5281            // the destination ends up byte-identical to the source's
5282            // on-disk shape) plus the metadata that was actually
5283            // committed.
5284            let replication_body_bytes = replication_body.unwrap_or_default();
5285            // v0.8.2 #61: thread the multipart-Complete `pending_version`
5286            // through so a versioning-Enabled source's destination
5287            // receives the same shadow-key path (mirror of the
5288            // single-PUT branch above).
5289            self.spawn_replication_if_matched(
5290                &bucket,
5291                &key,
5292                &ctx.tags,
5293                &replication_body_bytes,
5294                &applied_metadata,
5295                true,
5296                pending_version.as_ref(),
5297            );
5298            self.multipart_state.remove(upload_id.as_str());
5299        }
5300        // v0.8.1 #59 janitor: best-effort sweep of stale completion
5301        // locks while we are still on the critical path of a single
5302        // Complete (so steady-state workloads of unique keys don't
5303        // accumulate `DashMap` entries). The sweep only retires
5304        // entries whose `Arc::strong_count == 1`, so any other in-
5305        // flight Complete on a different key keeps its lock alive.
5306        // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5307        // alive across this call; it's reaped on the next Complete or
5308        // the next caller-driven prune.
5309        self.multipart_state.prune_completion_locks();
5310        Ok(resp)
5311    }
5312    async fn abort_multipart_upload(
5313        &self,
5314        req: S3Request<AbortMultipartUploadInput>,
5315    ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5316        // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5317        // — the AWS-spec action verb for this operation. Without the
5318        // gate, anyone who could guess an upload_id could throw away
5319        // someone else's in-flight multipart upload.
5320        let abort_bucket = req.input.bucket.clone();
5321        let abort_key = req.input.key.clone();
5322        self.enforce_policy(
5323            &req,
5324            "s3:AbortMultipartUpload",
5325            &abort_bucket,
5326            Some(&abort_key),
5327        )?;
5328        // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5329        // set) promptly so an aborted upload doesn't leak the
5330        // customer's key into a long-running gateway's RSS.
5331        //
5332        // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5333        // FIRST, then drop in-process state ONLY on success. The
5334        // previous order ("remove → call backend") meant a transient
5335        // backend abort failure (5xx, network) wiped the SSE-C key
5336        // bytes locally while leaving the parts on the backend, so a
5337        // client retry would have to re-validate the SSE-C key against
5338        // a context the gateway no longer has — and the retried abort
5339        // would still hit the unaborted backend parts. Calling the
5340        // backend first lets the failure propagate to the client with
5341        // state intact for a clean retry; only on success do we wipe
5342        // the local state.
5343        let upload_id = req.input.upload_id.as_str().to_owned();
5344        let resp = self.backend.abort_multipart_upload(req).await?;
5345        self.multipart_state.remove(&upload_id);
5346        Ok(resp)
5347    }
5348    async fn list_multipart_uploads(
5349        &self,
5350        req: S3Request<ListMultipartUploadsInput>,
5351    ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5352        self.backend.list_multipart_uploads(req).await
5353    }
5354    async fn list_parts(
5355        &self,
5356        req: S3Request<ListPartsInput>,
5357    ) -> S3Result<S3Response<ListPartsOutput>> {
5358        self.backend.list_parts(req).await
5359    }
5360
5361    // =========================================================================
5362    // Phase 2 — pure passthrough delegations。S4 はこれらに対して圧縮 hook を
5363    // 持たないので、backend (= AWS S3) の動作と完全に同一。
5364    //
5365    // 既知の制限事項:
5366    // - copy_object / upload_part_copy: source object が S4-compressed の場合、
5367    //   backend が bytes を copy するだけなので metadata (s4-codec etc) も一緒に
5368    //   copied される (AWS S3 default = MetadataDirective COPY)。GET は manifest
5369    //   経由で正しく decompress できる。MetadataDirective REPLACE で上書き
5370    //   されると圧縮 metadata が消えて壊れる — 顧客側の運用で注意
5371    // - list_object_versions: versioning enabled bucket では各 version も S4
5372    //   metadata を維持する。古い version も S4 経由で正しく GET できる。
5373    // =========================================================================
5374
5375    // ---- Object ACL / tagging / attributes ----
5376    async fn get_object_acl(
5377        &self,
5378        req: S3Request<GetObjectAclInput>,
5379    ) -> S3Result<S3Response<GetObjectAclOutput>> {
5380        // v0.8.17 G-2: reserved-name guard. Without it a hostile
5381        // client can `GetObjectAcl(<key>.s4index)` to confirm the
5382        // sidecar exists, an information leak the F-13 GET reject
5383        // closed for the same object.
5384        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5385        self.backend.get_object_acl(req).await
5386    }
5387    async fn put_object_acl(
5388        &self,
5389        req: S3Request<PutObjectAclInput>,
5390    ) -> S3Result<S3Response<PutObjectAclOutput>> {
5391        // v0.8.17 G-2: reserved-name guard. `put-object-acl
5392        // --acl public-read` against `<key>.s4index` would grant
5393        // external read access to the internal sidecar, bypassing
5394        // the F-13 GET reject via the backend's public-URL path.
5395        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5396        self.backend.put_object_acl(req).await
5397    }
5398    // v0.6 #39: object tagging — when a `TagManager` is attached the
5399    // configuration / per-(bucket, key) state lives in the manager and
5400    // these handlers serve directly from it; when no manager is
5401    // attached they fall back to the backend (legacy passthrough so
5402    // v0.5 deployments are unaffected).
5403    async fn get_object_tagging(
5404        &self,
5405        req: S3Request<GetObjectTaggingInput>,
5406    ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5407        // v0.8.17 G-2: reserved-name guard.
5408        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5409        let Some(mgr) = self.tagging.as_ref() else {
5410            return self.backend.get_object_tagging(req).await;
5411        };
5412        let tags = mgr
5413            .get_object_tags(&req.input.bucket, &req.input.key)
5414            .unwrap_or_default();
5415        Ok(S3Response::new(GetObjectTaggingOutput {
5416            tag_set: tagset_to_aws(&tags),
5417            ..Default::default()
5418        }))
5419    }
5420    async fn put_object_tagging(
5421        &self,
5422        req: S3Request<PutObjectTaggingInput>,
5423    ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
5424        // v0.8.17 G-2: reserved-name guard.
5425        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5426        let Some(mgr) = self.tagging.as_ref() else {
5427            return self.backend.put_object_tagging(req).await;
5428        };
5429        let bucket = req.input.bucket.clone();
5430        let key = req.input.key.clone();
5431        let parsed = aws_to_tagset(&req.input.tagging.tag_set)
5432            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5433        // v0.6 #39: gate via IAM policy with both the request tags
5434        // (`s3:RequestObjectTag/<key>`) and any existing tags on the
5435        // target object (`s3:ExistingObjectTag/<key>`).
5436        let existing = mgr.get_object_tags(&bucket, &key);
5437        self.enforce_policy_with_extra(
5438            &req,
5439            "s3:PutObjectTagging",
5440            &bucket,
5441            Some(&key),
5442            Some(&parsed),
5443            existing.as_ref(),
5444        )?;
5445        mgr.put_object_tags(&bucket, &key, parsed);
5446        Ok(S3Response::new(PutObjectTaggingOutput::default()))
5447    }
5448    async fn delete_object_tagging(
5449        &self,
5450        req: S3Request<DeleteObjectTaggingInput>,
5451    ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
5452        // v0.8.17 G-2: reserved-name guard.
5453        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5454        let Some(mgr) = self.tagging.as_ref() else {
5455            return self.backend.delete_object_tagging(req).await;
5456        };
5457        let bucket = req.input.bucket.clone();
5458        let key = req.input.key.clone();
5459        let existing = mgr.get_object_tags(&bucket, &key);
5460        self.enforce_policy_with_extra(
5461            &req,
5462            "s3:DeleteObjectTagging",
5463            &bucket,
5464            Some(&key),
5465            None,
5466            existing.as_ref(),
5467        )?;
5468        mgr.delete_object_tags(&bucket, &key);
5469        Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
5470    }
5471    async fn get_object_attributes(
5472        &self,
5473        req: S3Request<GetObjectAttributesInput>,
5474    ) -> S3Result<S3Response<GetObjectAttributesOutput>> {
5475        // v0.8.17 G-2: reserved-name guard. Attributes leak the
5476        // sidecar's size + ETag, same shape as F-13's GET concern.
5477        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5478        self.backend.get_object_attributes(req).await
5479    }
5480    async fn restore_object(
5481        &self,
5482        req: S3Request<RestoreObjectInput>,
5483    ) -> S3Result<S3Response<RestoreObjectOutput>> {
5484        // v0.8.17 G-2: reserved-name guard.
5485        self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5486        self.backend.restore_object(req).await
5487    }
5488    async fn upload_part_copy(
5489        &self,
5490        req: S3Request<UploadPartCopyInput>,
5491    ) -> S3Result<S3Response<UploadPartCopyOutput>> {
5492        // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
5493        // destination PUT + source GET.
5494        let dst_bucket = req.input.bucket.clone();
5495        let dst_key = req.input.key.clone();
5496        // v0.8.17 G-2: reserved-name guard on both destination
5497        // and source. Mirrors what `copy_object` enforces.
5498        self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
5499        if let CopySource::Bucket { key, .. } = &req.input.copy_source {
5500            self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
5501        }
5502        self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
5503        if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
5504            self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
5505        }
5506        self.enforce_rate_limit(&req, &dst_bucket)?;
5507        // v0.2 #6: byte-range aware copy when the source is S4-framed.
5508        //
5509        // For a framed source (multipart upload OR single-PUT framed-v2),
5510        // a naive byte-range passthrough would copy compressed bytes that
5511        // don't align with S4 frame boundaries — silently corrupting the
5512        // result. Instead we GET the source through S4 (which handles
5513        // decompression + Range), re-compress + re-frame as a new part,
5514        // and forward as upload_part. For non-framed sources (S4-untouched
5515        // raw objects), passthrough is correct and we keep the original
5516        // (cheaper) code path.
5517        // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
5518        // copy-source header. Without this, a versioned source bucket
5519        // copy that pins a specific old version would silently fall
5520        // back to "latest", assembling wrong bytes into the destination
5521        // multipart object (silent data corruption).
5522        let CopySource::Bucket {
5523            bucket: src_bucket,
5524            key: src_key,
5525            version_id: src_version_id,
5526        } = &req.input.copy_source
5527        else {
5528            return self.backend.upload_part_copy(req).await;
5529        };
5530        let src_bucket = src_bucket.to_string();
5531        let src_key = src_key.to_string();
5532        let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);
5533
5534        // Probe metadata to decide whether the source needs S4-aware copy.
5535        let head_input = HeadObjectInput {
5536            bucket: src_bucket.clone(),
5537            key: src_key.clone(),
5538            version_id: src_version_id.clone(),
5539            ..Default::default()
5540        };
5541        let head_req = S3Request {
5542            input: head_input,
5543            method: http::Method::HEAD,
5544            uri: req.uri.clone(),
5545            headers: req.headers.clone(),
5546            extensions: http::Extensions::new(),
5547            credentials: req.credentials.clone(),
5548            region: req.region.clone(),
5549            service: req.service.clone(),
5550            trailing_headers: None,
5551        };
5552        let needs_s4_copy = match self.backend.head_object(head_req).await {
5553            Ok(h) => {
5554                is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
5555            }
5556            Err(_) => false,
5557        };
5558        if !needs_s4_copy {
5559            return self.backend.upload_part_copy(req).await;
5560        }
5561
5562        // Resolve the optional source byte range to pass to GET.
5563        let source_range = req
5564            .input
5565            .copy_source_range
5566            .as_ref()
5567            .map(|r| parse_copy_source_range(r))
5568            .transpose()
5569            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
5570
5571        // GET source via S4 (handles decompression + sidecar partial fetch
5572        // when range is present). The result is the requested user-visible
5573        // byte range, fully decompressed. version_id is propagated so
5574        // pinned-version copies fetch the exact version requested.
5575        let mut get_input = GetObjectInput {
5576            bucket: src_bucket.clone(),
5577            key: src_key.clone(),
5578            version_id: src_version_id.clone(),
5579            ..Default::default()
5580        };
5581        get_input.range = source_range;
5582        let get_req = S3Request {
5583            input: get_input,
5584            method: http::Method::GET,
5585            uri: req.uri.clone(),
5586            headers: req.headers.clone(),
5587            extensions: http::Extensions::new(),
5588            credentials: req.credentials.clone(),
5589            region: req.region.clone(),
5590            service: req.service.clone(),
5591            trailing_headers: None,
5592        };
5593        let get_resp = self.get_object(get_req).await?;
5594        let blob = get_resp.output.body.ok_or_else(|| {
5595            S3Error::with_message(
5596                S3ErrorCode::InternalError,
5597                "upload_part_copy: empty body from source GET",
5598            )
5599        })?;
5600        let bytes = collect_blob(blob, self.max_body_bytes)
5601            .await
5602            .map_err(internal("collect upload_part_copy source body"))?;
5603
5604        // Compress + frame as a fresh part (mirrors upload_part path).
5605        let sample_len = bytes.len().min(SAMPLE_BYTES);
5606        // v0.8 #56: same size-hint promotion as the upload_part path.
5607        let codec_kind = self
5608            .dispatcher
5609            .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
5610            .await;
5611        let original_size = bytes.len() as u64;
5612        // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
5613        let (compress_res, tel) = self
5614            .registry
5615            .compress_with_telemetry(bytes, codec_kind)
5616            .await;
5617        stamp_gpu_compress_telemetry(&tel);
5618        let (compressed, manifest) =
5619            compress_res.map_err(internal("registry compress upload_part_copy"))?;
5620        let header = FrameHeader {
5621            codec: codec_kind,
5622            original_size,
5623            compressed_size: compressed.len() as u64,
5624            crc32c: manifest.crc32c,
5625        };
5626        let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
5627        write_frame(&mut framed, header, &compressed);
5628        let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
5629        if !likely_final {
5630            pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
5631        }
5632        let framed_bytes = framed.freeze();
5633        let framed_len = framed_bytes.len() as i64;
5634
5635        // Forward as upload_part to the destination multipart upload.
5636        let part_input = UploadPartInput {
5637            bucket: req.input.bucket.clone(),
5638            key: req.input.key.clone(),
5639            part_number: req.input.part_number,
5640            upload_id: req.input.upload_id.clone(),
5641            body: Some(bytes_to_blob(framed_bytes)),
5642            content_length: Some(framed_len),
5643            ..Default::default()
5644        };
5645        let part_req = S3Request {
5646            input: part_input,
5647            method: http::Method::PUT,
5648            uri: req.uri.clone(),
5649            headers: req.headers.clone(),
5650            extensions: http::Extensions::new(),
5651            credentials: req.credentials.clone(),
5652            region: req.region.clone(),
5653            service: req.service.clone(),
5654            trailing_headers: None,
5655        };
5656        let upload_resp = self.backend.upload_part(part_req).await?;
5657
5658        let copy_output = UploadPartCopyOutput {
5659            copy_part_result: Some(CopyPartResult {
5660                e_tag: upload_resp.output.e_tag.clone(),
5661                ..Default::default()
5662            }),
5663            ..Default::default()
5664        };
5665        Ok(S3Response::new(copy_output))
5666    }
5667
5668    // ---- Object lock / retention / legal hold (v0.5 #30) ----
5669    //
5670    // When an `ObjectLockManager` is attached the configuration / per-object
5671    // state lives in the manager and these handlers serve directly from it;
5672    // when no manager is attached they fall back to the backend (legacy
5673    // passthrough so v0.4 deployments are unaffected).
5674    async fn get_object_lock_configuration(
5675        &self,
5676        req: S3Request<GetObjectLockConfigurationInput>,
5677    ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
5678        self.enforce_policy(
5679            &req,
5680            "s3:GetBucketObjectLockConfiguration",
5681            &req.input.bucket,
5682            None,
5683        )?;
5684        if let Some(mgr) = self.object_lock.as_ref() {
5685            let cfg = mgr
5686                .bucket_default(&req.input.bucket)
5687                .map(|d| ObjectLockConfiguration {
5688                    object_lock_enabled: Some(ObjectLockEnabled::from_static(
5689                        ObjectLockEnabled::ENABLED,
5690                    )),
5691                    rule: Some(ObjectLockRule {
5692                        default_retention: Some(DefaultRetention {
5693                            days: Some(d.retention_days as i32),
5694                            mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
5695                                crate::object_lock::LockMode::Governance => {
5696                                    ObjectLockRetentionMode::GOVERNANCE
5697                                }
5698                                crate::object_lock::LockMode::Compliance => {
5699                                    ObjectLockRetentionMode::COMPLIANCE
5700                                }
5701                            })),
5702                            years: None,
5703                        }),
5704                    }),
5705                });
5706            let output = GetObjectLockConfigurationOutput {
5707                object_lock_configuration: cfg,
5708            };
5709            return Ok(S3Response::new(output));
5710        }
5711        self.backend.get_object_lock_configuration(req).await
5712    }
5713    async fn put_object_lock_configuration(
5714        &self,
5715        req: S3Request<PutObjectLockConfigurationInput>,
5716    ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
5717        self.enforce_policy(
5718            &req,
5719            "s3:PutBucketObjectLockConfiguration",
5720            &req.input.bucket,
5721            None,
5722        )?;
5723        if let Some(mgr) = self.object_lock.as_ref() {
5724            let bucket = req.input.bucket.clone();
5725            if let Some(cfg) = req.input.object_lock_configuration.as_ref()
5726                && let Some(rule) = cfg.rule.as_ref()
5727                && let Some(d) = rule.default_retention.as_ref()
5728            {
5729                let mode = d
5730                    .mode
5731                    .as_ref()
5732                    .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
5733                    .ok_or_else(|| {
5734                        S3Error::with_message(
5735                            S3ErrorCode::InvalidRequest,
5736                            "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
5737                        )
5738                    })?;
5739                // S3 spec: exactly one of Days / Years (we accept Days
5740                // outright and convert Years → Days for storage; Years
5741                // is just a UX shorthand on the wire).
5742                let days: u32 = match (d.days, d.years) {
5743                    (Some(d), None) if d > 0 => d as u32,
5744                    (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
5745                    _ => {
5746                        return Err(S3Error::with_message(
5747                            S3ErrorCode::InvalidRequest,
5748                            "Object Lock default retention requires exactly one of Days or Years (positive integer)",
5749                        ));
5750                    }
5751                };
5752                mgr.set_bucket_default(
5753                    &bucket,
5754                    crate::object_lock::BucketObjectLockDefault {
5755                        mode,
5756                        retention_days: days,
5757                    },
5758                );
5759            }
5760            return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
5761        }
5762        self.backend.put_object_lock_configuration(req).await
5763    }
5764    async fn get_object_legal_hold(
5765        &self,
5766        req: S3Request<GetObjectLegalHoldInput>,
5767    ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
5768        let key = req.input.key.clone();
5769        self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
5770        if let Some(mgr) = self.object_lock.as_ref() {
5771            let on = mgr
5772                .get(&req.input.bucket, &req.input.key)
5773                .map(|s| s.legal_hold_on)
5774                .unwrap_or(false);
5775            let status = ObjectLockLegalHoldStatus::from_static(if on {
5776                ObjectLockLegalHoldStatus::ON
5777            } else {
5778                ObjectLockLegalHoldStatus::OFF
5779            });
5780            let output = GetObjectLegalHoldOutput {
5781                legal_hold: Some(ObjectLockLegalHold {
5782                    status: Some(status),
5783                }),
5784            };
5785            return Ok(S3Response::new(output));
5786        }
5787        self.backend.get_object_legal_hold(req).await
5788    }
5789    async fn put_object_legal_hold(
5790        &self,
5791        req: S3Request<PutObjectLegalHoldInput>,
5792    ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
5793        let key = req.input.key.clone();
5794        self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
5795        if let Some(mgr) = self.object_lock.as_ref() {
5796            let on = req
5797                .input
5798                .legal_hold
5799                .as_ref()
5800                .and_then(|h| h.status.as_ref())
5801                .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
5802                .unwrap_or(false);
5803            mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
5804            return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
5805        }
5806        self.backend.put_object_legal_hold(req).await
5807    }
5808    async fn get_object_retention(
5809        &self,
5810        req: S3Request<GetObjectRetentionInput>,
5811    ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
5812        let key = req.input.key.clone();
5813        self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
5814        if let Some(mgr) = self.object_lock.as_ref() {
5815            let retention = mgr
5816                .get(&req.input.bucket, &req.input.key)
5817                .filter(|s| s.mode.is_some() || s.retain_until.is_some())
5818                .map(|s| {
5819                    let mode = s.mode.map(|m| {
5820                        ObjectLockRetentionMode::from_static(match m {
5821                            crate::object_lock::LockMode::Governance => {
5822                                ObjectLockRetentionMode::GOVERNANCE
5823                            }
5824                            crate::object_lock::LockMode::Compliance => {
5825                                ObjectLockRetentionMode::COMPLIANCE
5826                            }
5827                        })
5828                    });
5829                    let until = s.retain_until.map(chrono_utc_to_timestamp);
5830                    ObjectLockRetention {
5831                        mode,
5832                        retain_until_date: until,
5833                    }
5834                });
5835            let output = GetObjectRetentionOutput { retention };
5836            return Ok(S3Response::new(output));
5837        }
5838        self.backend.get_object_retention(req).await
5839    }
5840    async fn put_object_retention(
5841        &self,
5842        req: S3Request<PutObjectRetentionInput>,
5843    ) -> S3Result<S3Response<PutObjectRetentionOutput>> {
5844        let key = req.input.key.clone();
5845        self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
5846        if let Some(mgr) = self.object_lock.as_ref() {
5847            let bucket = req.input.bucket.clone();
5848            let key = req.input.key.clone();
5849            // v0.8.12 HIGH-7 fix: the bypass header gates Governance
5850            // shortening only when the caller has the matching IAM
5851            // action explicitly allowed; otherwise it's silently
5852            // dropped to `false` and the "shortening Governance
5853            // requires bypass" branch below rejects.
5854            let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
5855            let bypass = if bypass_header {
5856                self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
5857                    .is_ok()
5858            } else {
5859                false
5860            };
5861            let retention = req.input.retention.as_ref().ok_or_else(|| {
5862                S3Error::with_message(
5863                    S3ErrorCode::InvalidRequest,
5864                    "PutObjectRetention requires a Retention element",
5865                )
5866            })?;
5867            let new_mode = retention
5868                .mode
5869                .as_ref()
5870                .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
5871            let new_until = retention
5872                .retain_until_date
5873                .as_ref()
5874                .map(timestamp_to_chrono_utc)
5875                .unwrap_or(None);
5876            let now = chrono::Utc::now();
5877            let existing = mgr.get(&bucket, &key).unwrap_or_default();
5878            // S3 immutability rules:
5879            //   - Compliance is one-way: once set, mode cannot move to
5880            //     Governance, and retain-until cannot be shortened.
5881            //   - Governance can be lengthened freely; shortened only
5882            //     with bypass=true.
5883            if let Some(existing_mode) = existing.mode
5884                && existing_mode == crate::object_lock::LockMode::Compliance
5885                && existing.is_locked(now)
5886            {
5887                if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
5888                    return Err(S3Error::with_message(
5889                        S3ErrorCode::AccessDenied,
5890                        "Cannot downgrade Compliance retention to Governance while lock is active",
5891                    ));
5892                }
5893                if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5894                    && next < prev
5895                {
5896                    return Err(S3Error::with_message(
5897                        S3ErrorCode::AccessDenied,
5898                        "Cannot shorten Compliance retention while lock is active",
5899                    ));
5900                }
5901            }
5902            if let Some(existing_mode) = existing.mode
5903                && existing_mode == crate::object_lock::LockMode::Governance
5904                && existing.is_locked(now)
5905                && !bypass
5906                && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5907                && next < prev
5908            {
5909                return Err(S3Error::with_message(
5910                    S3ErrorCode::AccessDenied,
5911                    "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
5912                ));
5913            }
5914            let mut state = existing;
5915            if new_mode.is_some() {
5916                state.mode = new_mode;
5917            }
5918            if new_until.is_some() {
5919                state.retain_until = new_until;
5920            }
5921            mgr.set(&bucket, &key, state);
5922            return Ok(S3Response::new(PutObjectRetentionOutput::default()));
5923        }
5924        self.backend.put_object_retention(req).await
5925    }
5926
5927    // ---- Versioning ----
5928    // list_object_versions is implemented above in the compression-hook
5929    // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
5930    // VersioningManager is attached (v0.5 #34), serves chains directly
5931    // from the in-memory index.
5932    async fn get_bucket_versioning(
5933        &self,
5934        req: S3Request<GetBucketVersioningInput>,
5935    ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
5936        // v0.5 #34: when a VersioningManager is attached, the bucket's
5937        // versioning state lives in the manager (= S4-server's
5938        // authoritative source). Pass-through hits the backend only
5939        // when no manager is configured (legacy v0.4 behaviour).
5940        if let Some(mgr) = self.versioning.as_ref() {
5941            let output = match mgr.state(&req.input.bucket).as_aws_status() {
5942                Some(s) => GetBucketVersioningOutput {
5943                    status: Some(BucketVersioningStatus::from(s.to_owned())),
5944                    ..Default::default()
5945                },
5946                None => GetBucketVersioningOutput::default(),
5947            };
5948            return Ok(S3Response::new(output));
5949        }
5950        self.backend.get_bucket_versioning(req).await
5951    }
    /// Applies a `PutBucketVersioning` request.
    ///
    /// Two independent, optional managers take part:
    ///
    /// 1. MFA-delete manager: consulted only when the request body carries an
    ///    `MfaDelete` element. The `x-amz-mfa` header must then parse, match
    ///    the bucket's registered serial and pass TOTP verification; on
    ///    success the bucket's MFA-delete flag is updated.
    /// 2. Versioning manager: records the requested `Status` and returns
    ///    success. Only when no versioning manager is attached is the request
    ///    handed to the backend.
    ///
    /// # Errors
    ///
    /// When the MFA gate applies and verification fails, a denial metric is
    /// recorded and the error is `MfaError::Missing` (no header) or
    /// `MfaError::InvalidCode` (header present; this also covers a bucket with
    /// no registered secret), converted via `mfa_error_to_s3`.
    async fn put_bucket_versioning(
        &self,
        req: S3Request<PutBucketVersioningInput>,
    ) -> S3Result<S3Response<PutBucketVersioningOutput>> {
        // v0.6 #42: MFA gating on the `PutBucketVersioning` request
        // itself. S3 spec: when the request body carries an
        // `MfaDelete` element (either `Enabled` or `Disabled`), the
        // request must include a valid `x-amz-mfa` token — both for
        // the *first* enable (so the operator can't quietly side-step
        // the gate by never enabling it) and for any subsequent
        // change (so a leaked credential alone can't disable MFA
        // Delete to bypass it on subsequent DELETEs). Requests that
        // omit the `MfaDelete` element entirely (i.e. they flip only
        // `Status`) skip this gate, matching AWS.
        if let Some(mgr) = self.mfa_delete.as_ref()
            && let Some(target_enabled) = req
                .input
                .versioning_configuration
                .mfa_delete
                .as_ref()
                .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
        {
            let bucket = req.input.bucket.clone();
            let header = req.input.mfa.as_deref();
            let secret = mgr.lookup_secret(&bucket);
            // Verification needs BOTH a header and a registered secret; any
            // other combination, or an unparsable header, counts as failure.
            let verified = match (header, secret.as_ref()) {
                (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
                    Ok((serial, code)) => {
                        serial == s.serial
                            && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
                    }
                    Err(_) => false,
                },
                _ => false,
            };
            if !verified {
                crate::metrics::record_mfa_delete_denial(&bucket);
                let err = if header.is_none() {
                    crate::mfa::MfaError::Missing
                } else {
                    crate::mfa::MfaError::InvalidCode
                };
                return Err(mfa_error_to_s3(err));
            }
            // Only reached after a successful verification.
            mgr.set_bucket_state(&bucket, target_enabled);
        }
        // v0.5 #34: when a VersioningManager is attached, record the new
        // state in the manager and return success; the backend is NOT
        // called on this path. A missing or unrecognised `Status` is
        // stored as `Unversioned`.
        // NOTE(review): this comment previously said the request is also
        // forwarded to the backend (soft-failing on rejection); the code
        // returns before the backend call — confirm which is intended.
        if let Some(mgr) = self.versioning.as_ref() {
            let new_state = match req
                .input
                .versioning_configuration
                .status
                .as_ref()
                .map(|s| s.as_str())
            {
                Some(s) if s.eq_ignore_ascii_case("Enabled") => {
                    crate::versioning::VersioningState::Enabled
                }
                Some(s) if s.eq_ignore_ascii_case("Suspended") => {
                    crate::versioning::VersioningState::Suspended
                }
                _ => crate::versioning::VersioningState::Unversioned,
            };
            mgr.set_state(&req.input.bucket, new_state);
            return Ok(S3Response::new(PutBucketVersioningOutput::default()));
        }
        // No versioning manager: legacy pass-through to the backend.
        self.backend.put_bucket_versioning(req).await
    }
6024
6025    // ---- Bucket location ----
6026    async fn get_bucket_location(
6027        &self,
6028        req: S3Request<GetBucketLocationInput>,
6029    ) -> S3Result<S3Response<GetBucketLocationOutput>> {
6030        self.backend.get_bucket_location(req).await
6031    }
6032
6033    // ---- Bucket policy ----
6034    async fn get_bucket_policy(
6035        &self,
6036        req: S3Request<GetBucketPolicyInput>,
6037    ) -> S3Result<S3Response<GetBucketPolicyOutput>> {
6038        self.backend.get_bucket_policy(req).await
6039    }
6040    async fn put_bucket_policy(
6041        &self,
6042        req: S3Request<PutBucketPolicyInput>,
6043    ) -> S3Result<S3Response<PutBucketPolicyOutput>> {
6044        self.backend.put_bucket_policy(req).await
6045    }
6046    async fn delete_bucket_policy(
6047        &self,
6048        req: S3Request<DeleteBucketPolicyInput>,
6049    ) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
6050        self.backend.delete_bucket_policy(req).await
6051    }
6052    async fn get_bucket_policy_status(
6053        &self,
6054        req: S3Request<GetBucketPolicyStatusInput>,
6055    ) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
6056        self.backend.get_bucket_policy_status(req).await
6057    }
6058
6059    // ---- Bucket ACL ----
6060    async fn get_bucket_acl(
6061        &self,
6062        req: S3Request<GetBucketAclInput>,
6063    ) -> S3Result<S3Response<GetBucketAclOutput>> {
6064        self.backend.get_bucket_acl(req).await
6065    }
6066    async fn put_bucket_acl(
6067        &self,
6068        req: S3Request<PutBucketAclInput>,
6069    ) -> S3Result<S3Response<PutBucketAclOutput>> {
6070        self.backend.put_bucket_acl(req).await
6071    }
6072
6073    // ---- Bucket CORS (v0.6 #38) ----
6074    async fn get_bucket_cors(
6075        &self,
6076        req: S3Request<GetBucketCorsInput>,
6077    ) -> S3Result<S3Response<GetBucketCorsOutput>> {
6078        if let Some(mgr) = self.cors.as_ref() {
6079            let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6080                S3Error::with_message(
6081                    S3ErrorCode::NoSuchCORSConfiguration,
6082                    "The CORS configuration does not exist".to_string(),
6083                )
6084            })?;
6085            let rules: Vec<CORSRule> = cfg
6086                .rules
6087                .into_iter()
6088                .map(|r| CORSRule {
6089                    allowed_headers: if r.allowed_headers.is_empty() {
6090                        None
6091                    } else {
6092                        Some(r.allowed_headers)
6093                    },
6094                    allowed_methods: r.allowed_methods,
6095                    allowed_origins: r.allowed_origins,
6096                    expose_headers: if r.expose_headers.is_empty() {
6097                        None
6098                    } else {
6099                        Some(r.expose_headers)
6100                    },
6101                    id: r.id,
6102                    max_age_seconds: r.max_age_seconds.map(|s| s as i32),
6103                })
6104                .collect();
6105            return Ok(S3Response::new(GetBucketCorsOutput {
6106                cors_rules: Some(rules),
6107            }));
6108        }
6109        self.backend.get_bucket_cors(req).await
6110    }
6111    async fn put_bucket_cors(
6112        &self,
6113        req: S3Request<PutBucketCorsInput>,
6114    ) -> S3Result<S3Response<PutBucketCorsOutput>> {
6115        if let Some(mgr) = self.cors.as_ref() {
6116            let cfg = crate::cors::CorsConfig {
6117                rules: req
6118                    .input
6119                    .cors_configuration
6120                    .cors_rules
6121                    .into_iter()
6122                    .map(|r| crate::cors::CorsRule {
6123                        allowed_origins: r.allowed_origins,
6124                        allowed_methods: r.allowed_methods,
6125                        allowed_headers: r.allowed_headers.unwrap_or_default(),
6126                        expose_headers: r.expose_headers.unwrap_or_default(),
6127                        max_age_seconds: r
6128                            .max_age_seconds
6129                            .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
6130                        id: r.id,
6131                    })
6132                    .collect(),
6133            };
6134            // v0.8.15 M-3: AWS S3 rejects `AllowedMethods` outside
6135            // the canonical {GET,PUT,POST,DELETE,HEAD} set (including
6136            // the `*` wildcard). Validate at PutBucketCors time so
6137            // operators see the misconfiguration in the API response
6138            // instead of having silently-broken preflights at the
6139            // browser later.
6140            if let Err(e) = crate::cors::CorsManager::validate(&cfg) {
6141                return Err(S3Error::with_message(
6142                    S3ErrorCode::InvalidArgument,
6143                    e.to_string(),
6144                ));
6145            }
6146            mgr.put(&req.input.bucket, cfg);
6147            return Ok(S3Response::new(PutBucketCorsOutput::default()));
6148        }
6149        self.backend.put_bucket_cors(req).await
6150    }
6151    async fn delete_bucket_cors(
6152        &self,
6153        req: S3Request<DeleteBucketCorsInput>,
6154    ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
6155        if let Some(mgr) = self.cors.as_ref() {
6156            mgr.delete(&req.input.bucket);
6157            return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
6158        }
6159        self.backend.delete_bucket_cors(req).await
6160    }
6161
6162    // ---- Bucket lifecycle (v0.6 #37) ----
6163    async fn get_bucket_lifecycle_configuration(
6164        &self,
6165        req: S3Request<GetBucketLifecycleConfigurationInput>,
6166    ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
6167        if let Some(mgr) = self.lifecycle.as_ref() {
6168            let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6169                S3Error::with_message(
6170                    S3ErrorCode::NoSuchLifecycleConfiguration,
6171                    "The lifecycle configuration does not exist".to_string(),
6172                )
6173            })?;
6174            let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
6175            return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
6176                rules: Some(rules),
6177                transition_default_minimum_object_size: None,
6178            }));
6179        }
6180        self.backend.get_bucket_lifecycle_configuration(req).await
6181    }
6182    async fn put_bucket_lifecycle_configuration(
6183        &self,
6184        req: S3Request<PutBucketLifecycleConfigurationInput>,
6185    ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
6186        if let Some(mgr) = self.lifecycle.as_ref() {
6187            let bucket = req.input.bucket.clone();
6188            let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
6189            let cfg = dto_lifecycle_to_internal(&dto_cfg);
6190            mgr.put(&bucket, cfg);
6191            return Ok(S3Response::new(
6192                PutBucketLifecycleConfigurationOutput::default(),
6193            ));
6194        }
6195        self.backend.put_bucket_lifecycle_configuration(req).await
6196    }
6197    async fn delete_bucket_lifecycle(
6198        &self,
6199        req: S3Request<DeleteBucketLifecycleInput>,
6200    ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
6201        if let Some(mgr) = self.lifecycle.as_ref() {
6202            mgr.delete(&req.input.bucket);
6203            return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
6204        }
6205        self.backend.delete_bucket_lifecycle(req).await
6206    }
6207
6208    // ---- Bucket tagging (v0.6 #39) ----
6209    async fn get_bucket_tagging(
6210        &self,
6211        req: S3Request<GetBucketTaggingInput>,
6212    ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
6213        let Some(mgr) = self.tagging.as_ref() else {
6214            return self.backend.get_bucket_tagging(req).await;
6215        };
6216        let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
6217        Ok(S3Response::new(GetBucketTaggingOutput {
6218            tag_set: tagset_to_aws(&tags),
6219        }))
6220    }
6221    async fn put_bucket_tagging(
6222        &self,
6223        req: S3Request<PutBucketTaggingInput>,
6224    ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
6225        let Some(mgr) = self.tagging.as_ref() else {
6226            return self.backend.put_bucket_tagging(req).await;
6227        };
6228        let bucket = req.input.bucket.clone();
6229        let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6230            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6231        self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6232        mgr.put_bucket_tags(&bucket, parsed);
6233        Ok(S3Response::new(PutBucketTaggingOutput::default()))
6234    }
6235    async fn delete_bucket_tagging(
6236        &self,
6237        req: S3Request<DeleteBucketTaggingInput>,
6238    ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
6239        let Some(mgr) = self.tagging.as_ref() else {
6240            return self.backend.delete_bucket_tagging(req).await;
6241        };
6242        let bucket = req.input.bucket.clone();
6243        self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6244        mgr.delete_bucket_tags(&bucket);
6245        Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
6246    }
6247
6248    // ---- Bucket encryption ----
6249    async fn get_bucket_encryption(
6250        &self,
6251        req: S3Request<GetBucketEncryptionInput>,
6252    ) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
6253        self.backend.get_bucket_encryption(req).await
6254    }
6255    async fn put_bucket_encryption(
6256        &self,
6257        req: S3Request<PutBucketEncryptionInput>,
6258    ) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
6259        self.backend.put_bucket_encryption(req).await
6260    }
6261    async fn delete_bucket_encryption(
6262        &self,
6263        req: S3Request<DeleteBucketEncryptionInput>,
6264    ) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
6265        self.backend.delete_bucket_encryption(req).await
6266    }
6267
6268    // ---- Bucket logging ----
6269    async fn get_bucket_logging(
6270        &self,
6271        req: S3Request<GetBucketLoggingInput>,
6272    ) -> S3Result<S3Response<GetBucketLoggingOutput>> {
6273        self.backend.get_bucket_logging(req).await
6274    }
6275    async fn put_bucket_logging(
6276        &self,
6277        req: S3Request<PutBucketLoggingInput>,
6278    ) -> S3Result<S3Response<PutBucketLoggingOutput>> {
6279        self.backend.put_bucket_logging(req).await
6280    }
6281
6282    // ---- Bucket notification (v0.6 #35) ----
6283    //
6284    // When a `NotificationManager` is attached, S4 itself owns per-bucket
6285    // notification configurations and the PUT / GET handlers route through
6286    // the manager. The wire DTO's queue / topic configurations map onto
6287    // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6288    // EventBridge configurations are accepted on PUT but silently dropped
6289    // (out of scope for v0.6 #35). When no manager is attached the legacy
6290    // backend-passthrough behaviour applies.
6291    async fn get_bucket_notification_configuration(
6292        &self,
6293        req: S3Request<GetBucketNotificationConfigurationInput>,
6294    ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6295        if let Some(mgr) = self.notifications.as_ref() {
6296            let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6297            let dto = notif_to_dto(&cfg);
6298            return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6299                event_bridge_configuration: dto.event_bridge_configuration,
6300                lambda_function_configurations: dto.lambda_function_configurations,
6301                queue_configurations: dto.queue_configurations,
6302                topic_configurations: dto.topic_configurations,
6303            }));
6304        }
6305        self.backend
6306            .get_bucket_notification_configuration(req)
6307            .await
6308    }
6309    async fn put_bucket_notification_configuration(
6310        &self,
6311        req: S3Request<PutBucketNotificationConfigurationInput>,
6312    ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6313        if let Some(mgr) = self.notifications.as_ref() {
6314            let cfg = notif_from_dto(&req.input.notification_configuration);
6315            mgr.put(&req.input.bucket, cfg);
6316            return Ok(S3Response::new(
6317                PutBucketNotificationConfigurationOutput::default(),
6318            ));
6319        }
6320        self.backend
6321            .put_bucket_notification_configuration(req)
6322            .await
6323    }
6324
6325    // ---- Bucket request payment ----
6326    async fn get_bucket_request_payment(
6327        &self,
6328        req: S3Request<GetBucketRequestPaymentInput>,
6329    ) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
6330        self.backend.get_bucket_request_payment(req).await
6331    }
6332    async fn put_bucket_request_payment(
6333        &self,
6334        req: S3Request<PutBucketRequestPaymentInput>,
6335    ) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
6336        self.backend.put_bucket_request_payment(req).await
6337    }
6338
6339    // ---- Bucket website ----
6340    async fn get_bucket_website(
6341        &self,
6342        req: S3Request<GetBucketWebsiteInput>,
6343    ) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
6344        self.backend.get_bucket_website(req).await
6345    }
6346    async fn put_bucket_website(
6347        &self,
6348        req: S3Request<PutBucketWebsiteInput>,
6349    ) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
6350        self.backend.put_bucket_website(req).await
6351    }
6352    async fn delete_bucket_website(
6353        &self,
6354        req: S3Request<DeleteBucketWebsiteInput>,
6355    ) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
6356        self.backend.delete_bucket_website(req).await
6357    }
6358
6359    // ---- Bucket replication (v0.6 #40) ----
6360    async fn get_bucket_replication(
6361        &self,
6362        req: S3Request<GetBucketReplicationInput>,
6363    ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6364        if let Some(mgr) = self.replication.as_ref() {
6365            return match mgr.get(&req.input.bucket) {
6366                Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6367                    replication_configuration: Some(replication_to_dto(&cfg)),
6368                })),
6369                None => Err(S3Error::with_message(
6370                    S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6371                    format!(
6372                        "no replication configuration on bucket {}",
6373                        req.input.bucket
6374                    ),
6375                )),
6376            };
6377        }
6378        self.backend.get_bucket_replication(req).await
6379    }
6380    async fn put_bucket_replication(
6381        &self,
6382        req: S3Request<PutBucketReplicationInput>,
6383    ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6384        if let Some(mgr) = self.replication.as_ref() {
6385            let cfg = replication_from_dto(&req.input.replication_configuration);
6386            mgr.put(&req.input.bucket, cfg);
6387            return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6388        }
6389        self.backend.put_bucket_replication(req).await
6390    }
6391    async fn delete_bucket_replication(
6392        &self,
6393        req: S3Request<DeleteBucketReplicationInput>,
6394    ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6395        if let Some(mgr) = self.replication.as_ref() {
6396            mgr.delete(&req.input.bucket);
6397            return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6398        }
6399        self.backend.delete_bucket_replication(req).await
6400    }
6401
6402    // ---- Bucket accelerate ----
6403    async fn get_bucket_accelerate_configuration(
6404        &self,
6405        req: S3Request<GetBucketAccelerateConfigurationInput>,
6406    ) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
6407        self.backend.get_bucket_accelerate_configuration(req).await
6408    }
6409    async fn put_bucket_accelerate_configuration(
6410        &self,
6411        req: S3Request<PutBucketAccelerateConfigurationInput>,
6412    ) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
6413        self.backend.put_bucket_accelerate_configuration(req).await
6414    }
6415
6416    // ---- Bucket ownership controls ----
6417    async fn get_bucket_ownership_controls(
6418        &self,
6419        req: S3Request<GetBucketOwnershipControlsInput>,
6420    ) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
6421        self.backend.get_bucket_ownership_controls(req).await
6422    }
6423    async fn put_bucket_ownership_controls(
6424        &self,
6425        req: S3Request<PutBucketOwnershipControlsInput>,
6426    ) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
6427        self.backend.put_bucket_ownership_controls(req).await
6428    }
6429    async fn delete_bucket_ownership_controls(
6430        &self,
6431        req: S3Request<DeleteBucketOwnershipControlsInput>,
6432    ) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
6433        self.backend.delete_bucket_ownership_controls(req).await
6434    }
6435
6436    // ---- Public access block ----
6437    async fn get_public_access_block(
6438        &self,
6439        req: S3Request<GetPublicAccessBlockInput>,
6440    ) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
6441        self.backend.get_public_access_block(req).await
6442    }
6443    async fn put_public_access_block(
6444        &self,
6445        req: S3Request<PutPublicAccessBlockInput>,
6446    ) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
6447        self.backend.put_public_access_block(req).await
6448    }
6449    async fn delete_public_access_block(
6450        &self,
6451        req: S3Request<DeletePublicAccessBlockInput>,
6452    ) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
6453        self.backend.delete_public_access_block(req).await
6454    }
6455
6456    // ====================================================================
6457    // v0.6 #41: S3 Select — server-side SQL filter on object body.
6458    //
6459    // Fetch the object via the regular `get_object` path (so SSE-C /
6460    // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
6461    // run a small SQL subset (CSV + JSON Lines, equality / inequality /
6462    // LIKE / AND / OR / NOT) over the in-memory body, and stream the
6463    // matched rows back as AWS event-stream `Records` + `Stats` + `End`
6464    // frames.
6465    //
6466    // Limitations (deliberate, documented):
6467    //   - Parquet input is rejected with NotImplemented.
6468    //   - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
6469    //     parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
6470    //     domain-specific `InvalidSqlExpression` code).
6471    //   - The body is fully buffered before SQL evaluation (S3 Select
6472    //     streaming-during-evaluation is v0.7 scope).
6473    //   - GPU-accelerated WHERE evaluation is stubbed out (always None).
6474    async fn select_object_content(
6475        &self,
6476        req: S3Request<SelectObjectContentInput>,
6477    ) -> S3Result<S3Response<SelectObjectContentOutput>> {
6478        use crate::select::{
6479            EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
6480            run_select_jsonlines,
6481        };
6482
6483        let select_bucket = req.input.bucket.clone();
6484        let select_key = req.input.key.clone();
6485        self.enforce_rate_limit(&req, &select_bucket)?;
6486        self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
6487
6488        let request = req.input.request;
6489        let sql = request.expression.clone();
6490        if request.expression_type.as_str() != "SQL" {
6491            return Err(S3Error::with_message(
6492                S3ErrorCode::InvalidExpressionType,
6493                format!(
6494                    "ExpressionType must be SQL, got: {}",
6495                    request.expression_type.as_str()
6496                ),
6497            ));
6498        }
6499
6500        let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
6501            SelectInputFormat::JsonLines
6502        } else if let Some(csv) = request.input_serialization.csv.as_ref() {
6503            let has_header = csv
6504                .file_header_info
6505                .as_ref()
6506                .map(|h| {
6507                    let s = h.as_str();
6508                    s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
6509                })
6510                .unwrap_or(false);
6511            let delim = csv
6512                .field_delimiter
6513                .as_deref()
6514                .and_then(|s| s.chars().next())
6515                .unwrap_or(',');
6516            SelectInputFormat::Csv {
6517                has_header,
6518                delimiter: delim,
6519            }
6520        } else if request.input_serialization.parquet.is_some() {
6521            return Err(S3Error::with_message(
6522                S3ErrorCode::NotImplemented,
6523                "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
6524            ));
6525        } else {
6526            return Err(S3Error::with_message(
6527                S3ErrorCode::InvalidRequest,
6528                "InputSerialization requires exactly one of CSV / JSON / Parquet",
6529            ));
6530        };
6531        if let Some(ct) = request.input_serialization.compression_type.as_ref()
6532            && !ct.as_str().eq_ignore_ascii_case("NONE")
6533        {
6534            return Err(S3Error::with_message(
6535                S3ErrorCode::NotImplemented,
6536                format!(
6537                    "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
6538                    ct.as_str()
6539                ),
6540            ));
6541        }
6542
6543        let output_format = if request.output_serialization.json.is_some() {
6544            SelectOutputFormat::Json
6545        } else if request.output_serialization.csv.is_some() {
6546            SelectOutputFormat::Csv
6547        } else {
6548            return Err(S3Error::with_message(
6549                S3ErrorCode::InvalidRequest,
6550                "OutputSerialization requires exactly one of CSV / JSON",
6551            ));
6552        };
6553
6554        let get_input = GetObjectInput {
6555            bucket: select_bucket.clone(),
6556            key: select_key.clone(),
6557            sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
6558            sse_customer_key: req.input.sse_customer_key.clone(),
6559            sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
6560            ..Default::default()
6561        };
6562        let get_req = S3Request {
6563            input: get_input,
6564            method: http::Method::GET,
6565            uri: format!("/{}/{}", select_bucket, select_key)
6566                .parse()
6567                .map_err(|e| {
6568                    S3Error::with_message(
6569                        S3ErrorCode::InternalError,
6570                        format!("constructing inner GET URI: {e}"),
6571                    )
6572                })?,
6573            headers: http::HeaderMap::new(),
6574            extensions: http::Extensions::new(),
6575            credentials: req.credentials.clone(),
6576            region: req.region.clone(),
6577            service: req.service.clone(),
6578            trailing_headers: None,
6579        };
6580        let mut get_resp = self.get_object(get_req).await?;
6581        let blob = get_resp.output.body.take().ok_or_else(|| {
6582            S3Error::with_message(
6583                S3ErrorCode::InternalError,
6584                "Select: object body was empty after GET",
6585            )
6586        })?;
6587        let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
6588            .await
6589            .map_err(internal("collect Select body"))?;
6590        let scanned = body_bytes.len() as u64;
6591
6592        let matched_payload = match input_format {
6593            SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
6594                .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
6595            SelectInputFormat::Csv { .. } => {
6596                run_select_csv(&sql, &body_bytes, input_format, output_format)
6597                    .map_err(|e| select_error_to_s3(e, "CSV"))?
6598            }
6599        };
6600
6601        let returned = matched_payload.len() as u64;
6602        let processed = scanned;
6603        let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
6604        if !matched_payload.is_empty() {
6605            events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
6606                payload: Some(bytes::Bytes::from(matched_payload)),
6607            })));
6608        }
6609        events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
6610            details: Some(Stats {
6611                bytes_scanned: Some(scanned as i64),
6612                bytes_processed: Some(processed as i64),
6613                bytes_returned: Some(returned as i64),
6614            }),
6615        })));
6616        events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
6617        // Touch EventStreamWriter so the public API stays linked into the
6618        // build (the actual wire framing is delegated to s3s).
6619        let _writer = EventStreamWriter::new();
6620
6621        let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
6622        let output = SelectObjectContentOutput {
6623            payload: Some(stream),
6624        };
6625        Ok(S3Response::new(output))
6626    }
6627
6628    // ---- Bucket Inventory configuration (v0.6 #36) ----
6629    //
6630    // When an `InventoryManager` is attached, S4-server owns the
6631    // configuration store and these handlers no longer pass through to
6632    // the backend. The mapping between the s3s-typed
6633    // `InventoryConfiguration` and the inventory module's internal
6634    // `InventoryConfig` is intentionally lossy: only the fields S4
6635    // actually uses for periodic CSV emission survive the round trip
6636    // (id, source bucket, destination bucket / prefix, format, included
    // versions, schedule frequency). Optional fields, encryption, and
    // filter prefixes are accepted on PUT but not stored; GET renders
    // them in their default (absent) shape so the client still
    // receives a well-formed `InventoryConfiguration`.
6641    async fn put_bucket_inventory_configuration(
6642        &self,
6643        req: S3Request<PutBucketInventoryConfigurationInput>,
6644    ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
6645        if let Some(mgr) = self.inventory.as_ref() {
6646            let cfg = inv_from_dto(
6647                &req.input.bucket,
6648                &req.input.id,
6649                &req.input.inventory_configuration,
6650            );
6651            mgr.put(cfg);
6652            return Ok(S3Response::new(
6653                PutBucketInventoryConfigurationOutput::default(),
6654            ));
6655        }
6656        self.backend.put_bucket_inventory_configuration(req).await
6657    }
6658
6659    async fn get_bucket_inventory_configuration(
6660        &self,
6661        req: S3Request<GetBucketInventoryConfigurationInput>,
6662    ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
6663        if let Some(mgr) = self.inventory.as_ref() {
6664            let cfg = mgr.get(&req.input.bucket, &req.input.id);
6665            if let Some(cfg) = cfg {
6666                let out = GetBucketInventoryConfigurationOutput {
6667                    inventory_configuration: Some(inv_to_dto(&cfg)),
6668                };
6669                return Ok(S3Response::new(out));
6670            }
6671            // AWS returns `NoSuchConfiguration` (404) when the id has no
6672            // matching inventory configuration on the bucket. The
6673            // generated `S3ErrorCode` enum doesn't expose a typed variant
6674            // for this code, so we round-trip through `from_bytes` which
6675            // wraps unknown codes as `Custom(...)` (= the AWS-canonical
6676            // error-code string survives into the XML response envelope).
6677            let code =
6678                S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
6679            return Err(S3Error::with_message(
6680                code,
6681                format!(
6682                    "no inventory configuration with id={} on bucket={}",
6683                    req.input.id, req.input.bucket
6684                ),
6685            ));
6686        }
6687        self.backend.get_bucket_inventory_configuration(req).await
6688    }
6689
6690    async fn list_bucket_inventory_configurations(
6691        &self,
6692        req: S3Request<ListBucketInventoryConfigurationsInput>,
6693    ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
6694        if let Some(mgr) = self.inventory.as_ref() {
6695            let list = mgr.list_for_bucket(&req.input.bucket);
6696            let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
6697            let out = ListBucketInventoryConfigurationsOutput {
6698                continuation_token: req.input.continuation_token.clone(),
6699                inventory_configuration_list: if dto_list.is_empty() {
6700                    None
6701                } else {
6702                    Some(dto_list)
6703                },
6704                is_truncated: Some(false),
6705                next_continuation_token: None,
6706            };
6707            return Ok(S3Response::new(out));
6708        }
6709        self.backend.list_bucket_inventory_configurations(req).await
6710    }
6711
6712    async fn delete_bucket_inventory_configuration(
6713        &self,
6714        req: S3Request<DeleteBucketInventoryConfigurationInput>,
6715    ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
6716        if let Some(mgr) = self.inventory.as_ref() {
6717            mgr.delete(&req.input.bucket, &req.input.id);
6718            return Ok(S3Response::new(
6719                DeleteBucketInventoryConfigurationOutput::default(),
6720            ));
6721        }
6722        self.backend
6723            .delete_bucket_inventory_configuration(req)
6724            .await
6725    }
6726}
6727
6728// ---------------------------------------------------------------------------
6729// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
6730// surface) and our internal `crate::inventory::InventoryConfig`. Only the
6731// fields S4 actually uses for CSV emission survive the round trip; the
6732// missing fields (filter prefix, optional fields, encryption) are dropped on
6733// PUT and re-rendered as the AWS-default shape on GET so the client sees a
6734// well-formed `InventoryConfiguration`.
6735// ---------------------------------------------------------------------------
6736
6737fn inv_from_dto(
6738    bucket: &str,
6739    id: &str,
6740    dto: &InventoryConfiguration,
6741) -> crate::inventory::InventoryConfig {
6742    let frequency_hours = match dto.schedule.frequency.as_str() {
6743        "Weekly" => 24 * 7,
6744        // Daily is the default; anything S4 doesn't recognise (incl.
6745        // empty, which is the s3s-default) maps to Daily so the
6746        // operator's PUT doesn't silently turn into a no-op cadence.
6747        _ => 24,
6748    };
6749    // Parquet/ORC are not supported (issue #36 scope); we still accept
6750    // the PUT so callers don't fail-loud, but we record CSV and rely on
6751    // the operator catching the discrepancy on GET.
6752    let format = crate::inventory::InventoryFormat::Csv;
6753    crate::inventory::InventoryConfig {
6754        id: id.to_owned(),
6755        bucket: bucket.to_owned(),
6756        destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
6757        destination_prefix: dto
6758            .destination
6759            .s3_bucket_destination
6760            .prefix
6761            .clone()
6762            .unwrap_or_default(),
6763        frequency_hours,
6764        format,
6765        included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
6766            dto.included_object_versions.as_str(),
6767        ),
6768    }
6769}
6770
6771fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
6772    InventoryConfiguration {
6773        id: cfg.id.clone(),
6774        is_enabled: true,
6775        included_object_versions: InventoryIncludedObjectVersions::from(
6776            cfg.included_object_versions.as_aws_str().to_owned(),
6777        ),
6778        destination: InventoryDestination {
6779            s3_bucket_destination: InventoryS3BucketDestination {
6780                account_id: None,
6781                bucket: cfg.destination_bucket.clone(),
6782                encryption: None,
6783                format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
6784                prefix: if cfg.destination_prefix.is_empty() {
6785                    None
6786                } else {
6787                    Some(cfg.destination_prefix.clone())
6788                },
6789            },
6790        },
6791        schedule: InventorySchedule {
6792            // `frequency_hours == 168` -> Weekly; everything else maps to
6793            // Daily for the wire response (the manager keeps the precise
6794            // hour count internally for due-checking).
6795            frequency: InventoryFrequency::from(
6796                if cfg.frequency_hours == 24 * 7 {
6797                    "Weekly"
6798                } else {
6799                    "Daily"
6800                }
6801                .to_owned(),
6802            ),
6803        },
6804        filter: None,
6805        optional_fields: None,
6806    }
6807}
6808
6809// ---------------------------------------------------------------------------
6810// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
6811// wire surface) and our internal `crate::notifications::NotificationConfig`.
6812//
6813// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
6814// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
6815// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
6816// surfaces topic / queue rules.
6817//
6818// The webhook destination has no AWS-native wire form: operators configure
6819// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
6820// poking `NotificationManager::put` directly from a custom binary. This
6821// keeps the wire surface AWS-compatible while still letting the always-
6822// available `Webhook` destination be reachable.
6823// ---------------------------------------------------------------------------
6824
6825fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
6826    let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
6827    if let Some(topics) = dto.topic_configurations.as_ref() {
6828        for (idx, t) in topics.iter().enumerate() {
6829            let events = events_from_dto(&t.events);
6830            let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
6831            rules.push(crate::notifications::NotificationRule {
6832                id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
6833                events,
6834                destination: crate::notifications::Destination::Sns {
6835                    topic_arn: t.topic_arn.clone(),
6836                },
6837                filter_prefix: prefix,
6838                filter_suffix: suffix,
6839            });
6840        }
6841    }
6842    if let Some(queues) = dto.queue_configurations.as_ref() {
6843        for (idx, q) in queues.iter().enumerate() {
6844            let events = events_from_dto(&q.events);
6845            let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
6846            rules.push(crate::notifications::NotificationRule {
6847                id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
6848                events,
6849                destination: crate::notifications::Destination::Sqs {
6850                    queue_arn: q.queue_arn.clone(),
6851                },
6852                filter_prefix: prefix,
6853                filter_suffix: suffix,
6854            });
6855        }
6856    }
6857    crate::notifications::NotificationConfig { rules }
6858}
6859
6860fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
6861    let mut topics: Vec<TopicConfiguration> = Vec::new();
6862    let mut queues: Vec<QueueConfiguration> = Vec::new();
6863    for rule in &cfg.rules {
6864        let events: Vec<Event> = rule
6865            .events
6866            .iter()
6867            .map(|e| Event::from(e.as_aws_str().to_owned()))
6868            .collect();
6869        let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
6870        match &rule.destination {
6871            crate::notifications::Destination::Sns { topic_arn } => {
6872                topics.push(TopicConfiguration {
6873                    events,
6874                    filter,
6875                    id: Some(rule.id.clone()),
6876                    topic_arn: topic_arn.clone(),
6877                });
6878            }
6879            crate::notifications::Destination::Sqs { queue_arn } => {
6880                queues.push(QueueConfiguration {
6881                    events,
6882                    filter,
6883                    id: Some(rule.id.clone()),
6884                    queue_arn: queue_arn.clone(),
6885                });
6886            }
6887            // Webhook destinations have no AWS wire equivalent — they
6888            // round-trip through the JSON snapshot only. Skip them on the
6889            // GET surface (an SDK consumer wouldn't know what to do with
6890            // them anyway).
6891            crate::notifications::Destination::Webhook { .. } => {}
6892        }
6893    }
6894    NotificationConfiguration {
6895        event_bridge_configuration: None,
6896        lambda_function_configurations: None,
6897        queue_configurations: if queues.is_empty() {
6898            None
6899        } else {
6900            Some(queues)
6901        },
6902        topic_configurations: if topics.is_empty() {
6903            None
6904        } else {
6905            Some(topics)
6906        },
6907    }
6908}
6909
6910fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
6911    events
6912        .iter()
6913        .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
6914        .collect()
6915}
6916
6917fn filter_from_dto(
6918    f: Option<&NotificationConfigurationFilter>,
6919) -> (Option<String>, Option<String>) {
6920    let Some(f) = f else {
6921        return (None, None);
6922    };
6923    let Some(key) = f.key.as_ref() else {
6924        return (None, None);
6925    };
6926    let Some(rules) = key.filter_rules.as_ref() else {
6927        return (None, None);
6928    };
6929    let mut prefix = None;
6930    let mut suffix = None;
6931    for r in rules {
6932        let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
6933        let value = r.value.clone();
6934        match name.as_deref() {
6935            Some("prefix") => prefix = value,
6936            Some("suffix") => suffix = value,
6937            _ => {}
6938        }
6939    }
6940    (prefix, suffix)
6941}
6942
6943fn filter_to_dto(
6944    prefix: Option<&str>,
6945    suffix: Option<&str>,
6946) -> Option<NotificationConfigurationFilter> {
6947    if prefix.is_none() && suffix.is_none() {
6948        return None;
6949    }
6950    let mut rules: Vec<FilterRule> = Vec::new();
6951    if let Some(p) = prefix {
6952        rules.push(FilterRule {
6953            name: Some(FilterRuleName::from("prefix".to_owned())),
6954            value: Some(p.to_owned()),
6955        });
6956    }
6957    if let Some(s) = suffix {
6958        rules.push(FilterRule {
6959            name: Some(FilterRuleName::from("suffix".to_owned())),
6960            value: Some(s.to_owned()),
6961        });
6962    }
6963    Some(NotificationConfigurationFilter {
6964        key: Some(S3KeyFilter {
6965            filter_rules: Some(rules),
6966        }),
6967    })
6968}
6969
6970// ---------------------------------------------------------------------------
6971// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
6972// wire surface) and our internal `crate::replication::ReplicationConfig`.
6973// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
6974// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
6975// the matcher needs. Sub-blocks v0.6 #40 does not implement
6976// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
6977// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
6978// who set them on PUT see them silently dropped, mirroring "feature not
6979// supported in this release" semantics.
6980// ---------------------------------------------------------------------------
6981
6982fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
6983    let rules = dto
6984        .rules
6985        .iter()
6986        .enumerate()
6987        .map(|(idx, r)| {
6988            let id =
6989                r.id.as_ref()
6990                    .map(|s| s.as_str().to_owned())
6991                    .unwrap_or_else(|| format!("rule-{idx}"));
6992            let priority = r.priority.unwrap_or(0).max(0) as u32;
6993            let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
6994            let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
6995            let destination_bucket = r.destination.bucket.clone();
6996            let destination_storage_class = r
6997                .destination
6998                .storage_class
6999                .as_ref()
7000                .map(|s| s.as_str().to_owned());
7001            crate::replication::ReplicationRule {
7002                id,
7003                priority,
7004                status_enabled,
7005                filter,
7006                destination_bucket,
7007                destination_storage_class,
7008            }
7009        })
7010        .collect();
7011    crate::replication::ReplicationConfig {
7012        role: dto.role.clone(),
7013        rules,
7014    }
7015}
7016
7017fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
7018    let rules = cfg
7019        .rules
7020        .iter()
7021        .map(|r| {
7022            let status = if r.status_enabled {
7023                ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
7024            } else {
7025                ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
7026            };
7027            let destination = Destination {
7028                access_control_translation: None,
7029                account: None,
7030                bucket: r.destination_bucket.clone(),
7031                encryption_configuration: None,
7032                metrics: None,
7033                replication_time: None,
7034                storage_class: r
7035                    .destination_storage_class
7036                    .as_ref()
7037                    .map(|s| StorageClass::from(s.clone())),
7038            };
7039            let filter = Some(replication_filter_to_dto(&r.filter));
7040            ReplicationRule {
7041                delete_marker_replication: None,
7042                destination,
7043                existing_object_replication: None,
7044                filter,
7045                id: Some(r.id.clone()),
7046                prefix: None,
7047                priority: Some(r.priority as i32),
7048                source_selection_criteria: None,
7049                status,
7050            }
7051        })
7052        .collect();
7053    ReplicationConfiguration {
7054        role: cfg.role.clone(),
7055        rules,
7056    }
7057}
7058
7059fn replication_filter_from_dto(
7060    f: Option<&ReplicationRuleFilter>,
7061    rule_level_prefix: Option<&str>,
7062) -> crate::replication::ReplicationFilter {
7063    let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
7064    let mut tags: Vec<(String, String)> = Vec::new();
7065    if let Some(f) = f {
7066        if let Some(p) = f.prefix.as_ref()
7067            && prefix.is_none()
7068        {
7069            prefix = Some(p.clone());
7070        }
7071        if let Some(t) = f.tag.as_ref()
7072            && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7073        {
7074            tags.push((k.clone(), v.clone()));
7075        }
7076        if let Some(and) = f.and.as_ref() {
7077            if let Some(p) = and.prefix.as_ref()
7078                && prefix.is_none()
7079            {
7080                prefix = Some(p.clone());
7081            }
7082            if let Some(ts) = and.tags.as_ref() {
7083                for t in ts {
7084                    if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7085                        tags.push((k.clone(), v.clone()));
7086                    }
7087                }
7088            }
7089        }
7090    }
7091    crate::replication::ReplicationFilter { prefix, tags }
7092}
7093
7094fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
7095    if f.tags.is_empty() {
7096        ReplicationRuleFilter {
7097            and: None,
7098            prefix: f.prefix.clone(),
7099            tag: None,
7100        }
7101    } else if f.tags.len() == 1 && f.prefix.is_none() {
7102        let (k, v) = &f.tags[0];
7103        ReplicationRuleFilter {
7104            and: None,
7105            prefix: None,
7106            tag: Some(Tag {
7107                key: Some(k.clone()),
7108                value: Some(v.clone()),
7109            }),
7110        }
7111    } else {
7112        let tags: Vec<Tag> = f
7113            .tags
7114            .iter()
7115            .map(|(k, v)| Tag {
7116                key: Some(k.clone()),
7117                value: Some(v.clone()),
7118            })
7119            .collect();
7120        ReplicationRuleFilter {
7121            and: Some(ReplicationRuleAndOperator {
7122                prefix: f.prefix.clone(),
7123                tags: Some(tags),
7124            }),
7125            prefix: None,
7126            tag: None,
7127        }
7128    }
7129}
7130
7131// ---------------------------------------------------------------------------
7132// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
7133// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
7134// The internal representation flattens AWS's "Filter | And" disjunction
7135// into a single `LifecycleFilter` struct of optional fields plus a tag
7136// vector. Fields S4's evaluator does not consume
7137// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
7138// `transition_default_minimum_object_size`, the storage class on the
7139// noncurrent expiration) are dropped on PUT and re-rendered as their
7140// AWS-default shape on GET so the client always sees a well-formed
7141// configuration.
7142// ---------------------------------------------------------------------------
7143
7144fn dto_lifecycle_to_internal(
7145    dto: &BucketLifecycleConfiguration,
7146) -> crate::lifecycle::LifecycleConfig {
7147    crate::lifecycle::LifecycleConfig {
7148        rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
7149    }
7150}
7151
7152fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
7153    let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
7154    let filter = rule
7155        .filter
7156        .as_ref()
7157        .map(dto_filter_to_internal)
7158        .unwrap_or_default();
7159    let expiration_days = rule
7160        .expiration
7161        .as_ref()
7162        .and_then(|e| e.days)
7163        .and_then(|d| u32::try_from(d).ok());
7164    let expiration_date = rule
7165        .expiration
7166        .as_ref()
7167        .and_then(|e| e.date.as_ref())
7168        .and_then(timestamp_to_chrono_utc);
7169    let transitions: Vec<crate::lifecycle::TransitionRule> = rule
7170        .transitions
7171        .as_ref()
7172        .map(|ts| {
7173            ts.iter()
7174                .filter_map(|t| {
7175                    let days = u32::try_from(t.days?).ok()?;
7176                    let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
7177                    Some(crate::lifecycle::TransitionRule {
7178                        days,
7179                        storage_class,
7180                    })
7181                })
7182                .collect()
7183        })
7184        .unwrap_or_default();
7185    let noncurrent_version_expiration_days = rule
7186        .noncurrent_version_expiration
7187        .as_ref()
7188        .and_then(|n| n.noncurrent_days)
7189        .and_then(|d| u32::try_from(d).ok());
7190    let abort_incomplete_multipart_upload_days = rule
7191        .abort_incomplete_multipart_upload
7192        .as_ref()
7193        .and_then(|a| a.days_after_initiation)
7194        .and_then(|d| u32::try_from(d).ok());
7195    crate::lifecycle::LifecycleRule {
7196        id: rule.id.clone().unwrap_or_default(),
7197        status,
7198        filter,
7199        expiration_days,
7200        expiration_date,
7201        transitions,
7202        noncurrent_version_expiration_days,
7203        abort_incomplete_multipart_upload_days,
7204    }
7205}
7206
7207fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
7208    let mut prefix = filter.prefix.clone();
7209    let mut tags: Vec<(String, String)> = Vec::new();
7210    let mut size_gt: Option<u64> = filter
7211        .object_size_greater_than
7212        .and_then(|n| u64::try_from(n).ok());
7213    let mut size_lt: Option<u64> = filter
7214        .object_size_less_than
7215        .and_then(|n| u64::try_from(n).ok());
7216    if let Some(t) = &filter.tag
7217        && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7218    {
7219        tags.push((k.clone(), v.clone()));
7220    }
7221    if let Some(and) = &filter.and {
7222        if prefix.is_none() {
7223            prefix = and.prefix.clone();
7224        }
7225        if size_gt.is_none() {
7226            size_gt = and
7227                .object_size_greater_than
7228                .and_then(|n| u64::try_from(n).ok());
7229        }
7230        if size_lt.is_none() {
7231            size_lt = and
7232                .object_size_less_than
7233                .and_then(|n| u64::try_from(n).ok());
7234        }
7235        if let Some(ts) = &and.tags {
7236            for t in ts {
7237                if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7238                    tags.push((k.clone(), v.clone()));
7239                }
7240            }
7241        }
7242    }
7243    crate::lifecycle::LifecycleFilter {
7244        prefix,
7245        tags,
7246        object_size_greater_than: size_gt,
7247        object_size_less_than: size_lt,
7248    }
7249}
7250
7251fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
7252    let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
7253        Some(LifecycleExpiration {
7254            date: rule.expiration_date.map(chrono_utc_to_timestamp),
7255            days: rule.expiration_days.map(|d| d as i32),
7256            expired_object_delete_marker: None,
7257        })
7258    } else {
7259        None
7260    };
7261    let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
7262        None
7263    } else {
7264        Some(
7265            rule.transitions
7266                .iter()
7267                .map(|t| Transition {
7268                    date: None,
7269                    days: Some(t.days as i32),
7270                    storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
7271                })
7272                .collect(),
7273        )
7274    };
7275    let noncurrent_version_expiration =
7276        rule.noncurrent_version_expiration_days
7277            .map(|d| NoncurrentVersionExpiration {
7278                newer_noncurrent_versions: None,
7279                noncurrent_days: Some(d as i32),
7280            });
7281    let abort_incomplete_multipart_upload =
7282        rule.abort_incomplete_multipart_upload_days
7283            .map(|d| AbortIncompleteMultipartUpload {
7284                days_after_initiation: Some(d as i32),
7285            });
7286    let filter = if rule.filter.tags.is_empty()
7287        && rule.filter.object_size_greater_than.is_none()
7288        && rule.filter.object_size_less_than.is_none()
7289    {
7290        rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7291            and: None,
7292            object_size_greater_than: None,
7293            object_size_less_than: None,
7294            prefix: Some(p.clone()),
7295            tag: None,
7296        })
7297    } else if rule.filter.tags.len() == 1
7298        && rule.filter.prefix.is_none()
7299        && rule.filter.object_size_greater_than.is_none()
7300        && rule.filter.object_size_less_than.is_none()
7301    {
7302        let (k, v) = rule.filter.tags[0].clone();
7303        Some(LifecycleRuleFilter {
7304            and: None,
7305            object_size_greater_than: None,
7306            object_size_less_than: None,
7307            prefix: None,
7308            tag: Some(Tag {
7309                key: Some(k),
7310                value: Some(v),
7311            }),
7312        })
7313    } else {
7314        let tags = if rule.filter.tags.is_empty() {
7315            None
7316        } else {
7317            Some(
7318                rule.filter
7319                    .tags
7320                    .iter()
7321                    .map(|(k, v)| Tag {
7322                        key: Some(k.clone()),
7323                        value: Some(v.clone()),
7324                    })
7325                    .collect(),
7326            )
7327        };
7328        Some(LifecycleRuleFilter {
7329            and: Some(LifecycleRuleAndOperator {
7330                object_size_greater_than: rule
7331                    .filter
7332                    .object_size_greater_than
7333                    .and_then(|n| i64::try_from(n).ok()),
7334                object_size_less_than: rule
7335                    .filter
7336                    .object_size_less_than
7337                    .and_then(|n| i64::try_from(n).ok()),
7338                prefix: rule.filter.prefix.clone(),
7339                tags,
7340            }),
7341            object_size_greater_than: None,
7342            object_size_less_than: None,
7343            prefix: None,
7344            tag: None,
7345        })
7346    };
7347    LifecycleRule {
7348        abort_incomplete_multipart_upload,
7349        expiration,
7350        filter,
7351        id: if rule.id.is_empty() {
7352            None
7353        } else {
7354            Some(rule.id.clone())
7355        },
7356        noncurrent_version_expiration,
7357        noncurrent_version_transitions: None,
7358        prefix: None,
7359        status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7360        transitions,
7361    }
7362}
7363
7364// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7365// `chrono_utc_to_timestamp` are defined earlier in this file for the
7366// tagging/notifications work; the lifecycle DTO converters reuse them.)
7367
7368// ---------------------------------------------------------------------------
7369// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7370//
7371// Kept as a self-contained block at the bottom of the file so it doesn't
7372// touch the existing `S4Service` struct, `new()`, or any of the per-op
7373// handlers above. The hook is wired in by the binary at server-build time
7374// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7375//
7376// Lifecycle:
7377//   1. `SigV4aGate::new(store)` is constructed once at boot from the
7378//      operator-supplied credential directory.
7379//   2. For each incoming request, `SigV4aGate::pre_route(&req,
7380//      &requested_region, &canonical_request_bytes)` is invoked BEFORE
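//      the request hits the S3 framework. If the request claims SigV4a
//      and verifies, control returns to the framework. Otherwise a
//      `SigV4aGateError` is returned; it carries the AWS error code and
//      HTTP status to respond with (403 `SignatureDoesNotMatch` in the
//      common case, see `s3_error_code` / `http_status`).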
7384//   3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7385// ---------------------------------------------------------------------------
7386
7387/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
7388///
7389/// Wraps a [`crate::sigv4a::SigV4aCredentialStore`] and exposes a single
7390/// `pre_route` entry point that returns `Ok(())` for both
7391/// "request is plain SigV4 — pass through" and "request is SigV4a and
7392/// verified", and an `Err(...)` containing a 403-equivalent diagnostic
7393/// otherwise. Cheap to clone (the inner store is `Arc`-backed).
7394///
7395/// v0.8.4 #76 (audit H-6): the gate now enforces an `x-amz-date`
7396/// freshness window (default 15 min, AWS-spec) and a strict credential
7397/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`), shutting the
7398/// captured-request replay vector — previously a stolen valid SigV4a
7399/// signature could be replayed indefinitely (including DELETE).
7400#[derive(Debug, Clone)]
7401pub struct SigV4aGate {
7402    store: crate::sigv4a::SharedSigV4aCredentialStore,
7403    /// v0.8.4 #76: how far the request's `x-amz-date` may drift from
7404    /// the server's clock before being rejected with 403
7405    /// `RequestTimeTooSkewed`. Matches the AWS S3 spec default of
7406    /// 15 min when constructed via [`SigV4aGate::new`]; the operator
7407    /// can override via [`SigV4aGate::with_skew_tolerance`] (CLI flag
7408    /// `--sigv4a-skew-tolerance-seconds`).
7409    skew_tolerance: chrono::Duration,
7410}
7411
7412impl SigV4aGate {
7413    /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7414    pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
7415
7416    #[must_use]
7417    pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
7418        Self {
7419            store,
7420            skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
7421        }
7422    }
7423
7424    /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
7425    /// 15 min). Operators can widen this for high-clock-drift
7426    /// environments or tighten it for compliance regimes that demand
7427    /// stricter freshness.
7428    #[must_use]
7429    pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
7430        self.skew_tolerance = skew;
7431        self
7432    }
7433
7434    /// Read the configured skew tolerance — exposed mostly for test +
7435    /// observability use.
7436    #[must_use]
7437    pub fn skew_tolerance(&self) -> chrono::Duration {
7438        self.skew_tolerance
7439    }
7440
7441    /// Inspect an incoming HTTP request. Behaviour:
7442    ///
7443    /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
7444    ///   prefix) → returns `Ok(())`; the framework's existing SigV4
7445    ///   path handles the request.
7446    /// - SigV4a + valid signature + region match + fresh x-amz-date
7447    ///   → `Ok(())`.
7448    /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
7449    /// - SigV4a + bad signature / region mismatch → `Err` with
7450    ///   `SignatureDoesNotMatch`.
7451    /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
7452    ///   the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
7453    ///   et al.).
7454    ///
7455    /// `canonical_request_bytes` is the SigV4a string-to-sign (or
7456    /// canonical-request bytes; the caller decides) that the framework
7457    /// has already produced for this request. Keeping it as a parameter
7458    /// instead of rebuilding it inside the hook avoids duplicating the
7459    /// canonicalisation logic.
7460    pub fn pre_route<B>(
7461        &self,
7462        req: &http::Request<B>,
7463        requested_region: &str,
7464        canonical_request_bytes: &[u8],
7465    ) -> Result<(), SigV4aGateError> {
7466        self.pre_route_at(
7467            req,
7468            requested_region,
7469            canonical_request_bytes,
7470            chrono::Utc::now(),
7471        )
7472    }
7473
7474    /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
7475    /// tests that need to pin the freshness clock. Production callers
7476    /// use `pre_route` (which calls `chrono::Utc::now()`).
7477    pub fn pre_route_at<B>(
7478        &self,
7479        req: &http::Request<B>,
7480        requested_region: &str,
7481        canonical_request_bytes: &[u8],
7482        now: chrono::DateTime<chrono::Utc>,
7483    ) -> Result<(), SigV4aGateError> {
7484        if !crate::sigv4a::detect(req) {
7485            return Ok(());
7486        }
7487        let auth_hdr = req
7488            .headers()
7489            .get(http::header::AUTHORIZATION)
7490            .and_then(|v| v.to_str().ok())
7491            .ok_or(SigV4aGateError::MissingAuthorization)?;
7492        let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
7493            .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
7494        let region_set = req
7495            .headers()
7496            .get(crate::sigv4a::REGION_SET_HEADER)
7497            .and_then(|v| v.to_str().ok())
7498            .unwrap_or("*");
7499        let key = self
7500            .store
7501            .get(&parsed.access_key_id)
7502            .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
7503        // v0.8.4 #76: snapshot the request headers into a
7504        // lowercase-keyed flat map so `verify_request` can do the
7505        // x-amz-date freshness checks without taking a generic
7506        // `HeaderMap` dep. Cheap because the headers list is tiny.
7507        //
7508        // v0.8.5 #84 (audit H-4): detect duplicate header names while
7509        // we flatten — `HashMap::insert` would silently overwrite the
7510        // first value with the second, mirroring the auth-confusion
7511        // vector the canonical-request builder also defends against.
7512        // Reject upfront so the rest of the gate (freshness check,
7513        // ECDSA verify) never sees a half-truncated header set. We
7514        // detect by checking `contains_key` *before* insertion rather
7515        // than by counting via `headers().get_all`, because the
7516        // upstream `HeaderMap` iteration yields each duplicate entry
7517        // as its own (name, value) pair — the second-seen entry is
7518        // exactly what `contains_key` traps.
7519        let mut header_map: std::collections::HashMap<String, String> =
7520            std::collections::HashMap::with_capacity(req.headers().len());
7521        for (name, value) in req.headers() {
7522            if let Ok(v) = value.to_str() {
7523                let lower = name.as_str().to_ascii_lowercase();
7524                if header_map.contains_key(&lower) {
7525                    return Err(SigV4aGateError::Verify(
7526                        crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
7527                    ));
7528                }
7529                header_map.insert(lower, v.to_string());
7530            }
7531        }
7532        crate::sigv4a::verify_request(
7533            &parsed,
7534            &header_map,
7535            canonical_request_bytes,
7536            key,
7537            region_set,
7538            requested_region,
7539            now,
7540            self.skew_tolerance,
7541        )
7542        .map_err(SigV4aGateError::Verify)?;
7543        Ok(())
7544    }
7545}
7546
7547/// Failure modes from [`SigV4aGate::pre_route`]. All variants map to
7548/// HTTP 403 with one of the two AWS-standard error codes
7549/// (`InvalidAccessKeyId` / `SignatureDoesNotMatch` / `RequestTimeTooSkewed`)
7550/// — see [`SigV4aGateError::s3_error_code`].
7551#[derive(Debug, thiserror::Error)]
7552pub enum SigV4aGateError {
7553    #[error("missing Authorization header")]
7554    MissingAuthorization,
7555    #[error("malformed SigV4a Authorization header")]
7556    MalformedAuthorization,
7557    #[error("unknown SigV4a access-key-id: {0}")]
7558    UnknownAccessKey(String),
7559    #[error("SigV4a verification failed: {0}")]
7560    Verify(#[source] crate::sigv4a::SigV4aError),
7561}
7562
7563impl SigV4aGateError {
7564    /// AWS S3 error code that should accompany the response.
7565    ///
7566    /// v0.8.4 #76 (audit H-6): the freshness check surfaces
7567    /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
7568    /// failures surface as `InvalidRequest` (400); other failures stay
7569    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
7570    /// surface stays AWS-compatible.
7571    #[must_use]
7572    pub fn s3_error_code(&self) -> &'static str {
7573        match self {
7574            Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
7575            Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
7576                "RequestTimeTooSkewed"
7577            }
7578            Self::Verify(
7579                crate::sigv4a::SigV4aError::MissingXAmzDate
7580                | crate::sigv4a::SigV4aError::InvalidDateFormat
7581                | crate::sigv4a::SigV4aError::DateScopeMismatch
7582                | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7583                | crate::sigv4a::SigV4aError::InvalidTerminator
7584                | crate::sigv4a::SigV4aError::WrongService { .. }
7585                | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7586            ) => "InvalidRequest",
7587            _ => "SignatureDoesNotMatch",
7588        }
7589    }
7590
7591    /// HTTP status code to accompany the response. v0.8.4 #76: format
7592    /// errors that are clearly client mistakes (missing / malformed
7593    /// `x-amz-date`, malformed credential scope, wrong service) are
7594    /// surfaced as 400 InvalidRequest; the rest stay 403.
7595    #[must_use]
7596    pub fn http_status(&self) -> http::StatusCode {
7597        match self {
7598            Self::Verify(
7599                crate::sigv4a::SigV4aError::MissingXAmzDate
7600                | crate::sigv4a::SigV4aError::InvalidDateFormat
7601                | crate::sigv4a::SigV4aError::DateScopeMismatch
7602                | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7603                | crate::sigv4a::SigV4aError::InvalidTerminator
7604                | crate::sigv4a::SigV4aError::WrongService { .. }
7605                | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7606            ) => http::StatusCode::BAD_REQUEST,
7607            _ => http::StatusCode::FORBIDDEN,
7608        }
7609    }
7610}
7611
7612#[cfg(test)]
7613mod tests {
7614    use super::*;
7615
7616    #[test]
7617    fn manifest_roundtrip_via_metadata() {
7618        let original = ChunkManifest {
7619            codec: CodecKind::CpuZstd,
7620            original_size: 1234,
7621            compressed_size: 567,
7622            crc32c: 0xdead_beef,
7623        };
7624        let mut meta: Option<Metadata> = None;
7625        write_manifest(&mut meta, &original);
7626        let extracted = extract_manifest(&meta).expect("manifest must round-trip");
7627        assert_eq!(extracted.codec, original.codec);
7628        assert_eq!(extracted.original_size, original.original_size);
7629        assert_eq!(extracted.compressed_size, original.compressed_size);
7630        assert_eq!(extracted.crc32c, original.crc32c);
7631    }
7632
7633    #[test]
7634    fn missing_metadata_yields_none() {
7635        let meta: Option<Metadata> = None;
7636        assert!(extract_manifest(&meta).is_none());
7637    }
7638
7639    #[test]
7640    fn partial_metadata_yields_none() {
7641        let mut meta = Metadata::new();
7642        meta.insert(META_CODEC.into(), "cpu-zstd".into());
7643        let opt = Some(meta);
7644        assert!(extract_manifest(&opt).is_none());
7645    }
7646
7647    #[test]
7648    fn parse_copy_source_range_basic() {
7649        let r = parse_copy_source_range("bytes=10-20").unwrap();
7650        match r {
7651            s3s::dto::Range::Int { first, last } => {
7652                assert_eq!(first, 10);
7653                assert_eq!(last, Some(20));
7654            }
7655            _ => panic!("expected Int range"),
7656        }
7657    }
7658
7659    #[test]
7660    fn parse_copy_source_range_rejects_inverted() {
7661        let err = parse_copy_source_range("bytes=20-10").unwrap_err();
7662        assert!(err.contains("last < first"));
7663    }
7664
7665    #[test]
7666    fn parse_copy_source_range_rejects_missing_prefix() {
7667        let err = parse_copy_source_range("10-20").unwrap_err();
7668        assert!(err.contains("must start with 'bytes='"));
7669    }
7670
7671    #[test]
7672    fn parse_copy_source_range_rejects_open_ended() {
7673        // S3 upload_part_copy spec requires N-M (closed); suffix and
7674        // open-ended forms are not allowed for this header.
7675        assert!(parse_copy_source_range("bytes=10-").is_err());
7676        assert!(parse_copy_source_range("bytes=-10").is_err());
7677    }
7678
7679    // v0.7 #49: safe_object_uri must round-trip every legal S3 key
7680    // (which includes spaces, slashes, control chars, raw UTF-8) into
7681    // a parseable `http::Uri` instead of panicking like the previous
7682    // `format!(...).parse().unwrap()` call sites did.
7683
7684    #[test]
7685    fn safe_object_uri_basic_ascii() {
7686        let uri = safe_object_uri("bucket", "key").expect("ascii must be safe");
7687        assert_eq!(uri.path(), "/bucket/key");
7688    }
7689
7690    #[test]
7691    fn safe_object_uri_encodes_spaces() {
7692        let uri = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
7693        // RFC 3986 path-segment encoding turns ' ' into %20.
7694        assert!(
7695            uri.path().contains("%20"),
7696            "expected percent-encoded space, got {}",
7697            uri.path()
7698        );
7699        assert!(uri.path().starts_with("/bucket/"));
7700    }
7701
7702    #[test]
7703    fn safe_object_uri_preserves_slashes() {
7704        // S3 keys legally contain '/' as a logical path separator —
7705        // the helper must NOT escape it (otherwise the synthetic URI
7706        // changes the perceived hierarchy).
7707        let uri = safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
7708        assert_eq!(uri.path(), "/bucket/key/with/slashes");
7709    }
7710
7711    #[test]
7712    fn safe_object_uri_handles_newline_without_panic() {
7713        // Newlines are control chars in URIs; whether the result is
7714        // Ok (encoded as %0A) or Err (parse rejects), the helper
7715        // MUST NOT panic. Either outcome is acceptable.
7716        let _ = safe_object_uri("bucket", "key\n");
7717    }
7718
7719    #[test]
7720    fn safe_object_uri_handles_null_byte_without_panic() {
7721        let _ = safe_object_uri("bucket", "key\0bad");
7722    }
7723
7724    #[test]
7725    fn safe_object_uri_handles_unicode_without_panic() {
7726        // RTL override, BOM, plain Japanese — none should panic.
7727        let _ = safe_object_uri("bucket", "rtl\u{202E}override");
7728        let _ = safe_object_uri("bucket", "\u{FEFF}bom-key");
7729        let _ = safe_object_uri("bucket", "日本語キー");
7730    }
7731
7732    #[test]
7733    fn safe_object_uri_no_panic_for_every_byte() {
7734        // Exhaustive byte coverage: 0x00..=0xFF as a 1-byte key.
7735        // None of these may panic. (0x80..=0xFF are not valid UTF-8
7736        // by themselves; we go through `String::from_utf8_lossy` so
7737        // the helper sees a real `&str` regardless of the raw byte.)
7738        for b in 0u8..=255 {
7739            let s = String::from_utf8_lossy(&[b]).into_owned();
7740            let _ = safe_object_uri("bucket", &s);
7741        }
7742    }
7743
7744    /// v0.8.1 #58: smoke test for the DEK-handling shape used by the
7745    /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
7746    /// Mirrors the call pattern (generate_dek → length check → copy
7747    /// into stack `[u8; 32]` → reborrow as `&[u8; 32]` for `SseSource`)
7748    /// without spinning up a full `S4Service`.
7749    ///
7750    /// The real assertion this guards against is a regression where
7751    /// the `Zeroizing` wrapper is accidentally dropped before the
7752    /// stack copy lands (e.g. someone refactors to use
7753    /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
7754    /// or where `&**dek` is rewritten in a way that doesn't compile.
7755    #[tokio::test]
7756    async fn kms_dek_lifetime_within_function_scope() {
7757        use crate::kms::{KmsBackend, LocalKms};
7758        use std::collections::HashMap;
7759        use std::path::PathBuf;
7760        use zeroize::Zeroizing;
7761
7762        let mut keks = HashMap::new();
7763        keks.insert("scope".to_string(), [33u8; 32]);
7764        let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);
7765
7766        // Mirror the put_object KMS branch shape exactly.
7767        let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
7768        assert_eq!(dek.len(), 32);
7769        let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
7770        dek_arr.copy_from_slice(&dek);
7771
7772        // The reborrow used at the SseSource construction site —
7773        // mirrors the call-site pattern where `let dek_ref: &[u8; 32]`
7774        // auto-derefs from a `Zeroizing<[u8; 32]>` reference.
7775        let dek_ref: &[u8; 32] = &dek_arr;
7776        // Sanity: the reborrow points at the same bytes.
7777        assert_eq!(dek_ref, &*dek_arr);
7778        // Wrapped key id flows through unchanged.
7779        assert_eq!(wrapped.key_id, "scope");
7780
7781        // At end of scope, both `dek` (Zeroizing<Vec<u8>>) and
7782        // `dek_arr` (Zeroizing<[u8; 32]>) are dropped, wiping the
7783        // backing memory. Cannot directly assert the wipe (would be
7784        // UB to read freed memory), so this test instead enforces
7785        // that the call shape compiles and executes; the wipe itself
7786        // is exercised by the `zeroize` crate's own test suite.
7787    }
7788
7789    /// v0.8.5 #86 (audit M-2): the replication dispatcher must
7790    /// `acquire_owned()` a permit from `replication_semaphore` before
7791    /// kicking off the destination PUT, so a saturated semaphore
7792    /// back-pressures the in-flight queue depth instead of letting it
7793    /// grow without bound. We exercise the field directly (initial
7794    /// permit count, override via `with_replication_max_concurrent`,
7795    /// permit drop on `Drop`) — the full `spawn_replication_if_matched`
7796    /// integration is exercised by the existing replication tests in
7797    /// `tests/feature_e2e.rs` once a `ReplicationManager` is attached.
7798    #[tokio::test]
7799    async fn replication_semaphore_caps_concurrent_dispatchers() {
7800        // Build a minimal `S4Service` directly — no handler path is
7801        // exercised, only the constructor + setter + accessor shape.
7802        let registry = Arc::new(
7803            CodecRegistry::new(CodecKind::Passthrough)
7804                .with(Arc::new(s4_codec::passthrough::Passthrough)),
7805        );
7806        let dispatcher = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
7807            CodecKind::Passthrough,
7808        ));
7809        let s4 = S4Service::new(NoopBackend, registry, dispatcher);
7810
7811        // Default cap matches the documented constant.
7812        assert_eq!(
7813            s4.replication_semaphore().available_permits(),
7814            S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
7815            "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
7816        );
7817
7818        // Override via the builder — replaces the underlying `Semaphore`.
7819        let s4 = s4.with_replication_max_concurrent(2);
7820        assert_eq!(
7821            s4.replication_semaphore().available_permits(),
7822            2,
7823            "with_replication_max_concurrent(2) must expose exactly 2 permits"
7824        );
7825
7826        // Acquiring permits must reduce `available_permits()` and
7827        // dropping them must restore the count — this is the contract
7828        // `spawn_replication_if_matched` relies on for back-pressure.
7829        let sem = Arc::clone(s4.replication_semaphore());
7830        let p1 = sem.clone().acquire_owned().await.expect("permit 1");
7831        let p2 = sem.clone().acquire_owned().await.expect("permit 2");
7832        assert_eq!(
7833            sem.available_permits(),
7834            0,
7835            "two acquired permits must zero `available_permits()`"
7836        );
7837        // A third `try_acquire_owned` must fail — the cap is enforced
7838        // synchronously, no extra spawn slips through.
7839        assert!(
7840            sem.clone().try_acquire_owned().is_err(),
7841            "third acquire must back-pressure: cap was 2"
7842        );
7843        drop(p1);
7844        drop(p2);
7845        assert_eq!(
7846            sem.available_permits(),
7847            2,
7848            "dropping permits must restore cap"
7849        );
7850
7851        // Lower-bound clamp: a 0 cap would deadlock all dispatchers,
7852        // so the setter clamps it to 1 instead of accepting it
7853        // (callers are warned in the CLI doc).
7854        let s4 = s4.with_replication_max_concurrent(0);
7855        assert_eq!(
7856            s4.replication_semaphore().available_permits(),
7857            1,
7858            "cap=0 must be clamped to 1 to avoid total deadlock"
7859        );
7860    }
7861
7862    /// v0.8.5 #86 (audit M-1): the access-log flusher must return a
7863    /// `JoinHandle<()>` that the caller can `abort()` on shutdown
7864    /// without leaving a dangling task. The pre-#86 call site dropped
7865    /// the handle at end-of-block (silently detaching it); the fix is
7866    /// hoisting it into a process-lived `Vec` so the graceful-shutdown
7867    /// branch in `main.rs` can wait for clean exit. This test exercises
7868    /// the `JoinHandle.abort()` shape directly so a future refactor that
7869    /// stops returning the handle (or returns a non-abortable wrapper)
7870    /// trips this regression guard.
7871    #[tokio::test]
7872    async fn flusher_handle_can_be_aborted_cleanly() {
7873        // Stand up a minimal `AccessLog` pointing at a tmp dir so the
7874        // flusher's `create_dir_all` succeeds. The dir is cleaned up
7875        // by the OS / test harness; we don't assert on the contents.
7876        let tmp = std::env::temp_dir().join(format!(
7877            "s4-86-flusher-{}-{}",
7878            std::process::id(),
7879            std::time::SystemTime::now()
7880                .duration_since(std::time::UNIX_EPOCH)
7881                .map(|d| d.as_nanos())
7882                .unwrap_or(0)
7883        ));
7884        let dest = crate::access_log::AccessLogDest { dir: tmp.clone() };
7885        let log = crate::access_log::AccessLog::new(dest);
7886        let handle = log.spawn_flusher(None);
7887        assert!(
7888            !handle.is_finished(),
7889            "freshly-spawned flusher must not yet be finished"
7890        );
7891        handle.abort();
7892        // `await`-ing an aborted handle returns `Err(JoinError)` whose
7893        // `is_cancelled()` is true.
7894        let join_result = handle.await;
7895        assert!(
7896            join_result.is_err(),
7897            "aborted flusher must surface JoinError, got Ok"
7898        );
7899        assert!(
7900            join_result.unwrap_err().is_cancelled(),
7901            "JoinError must report .is_cancelled() = true after abort()"
7902        );
7903        let _ = std::fs::remove_dir_all(&tmp);
7904    }
7905
    /// Stub backend used by the unit tests in this module that need an
    /// `S4Service` instance — the constructor requires `B: S3`, but those
    /// tests only exercise builder / accessor shape and never call a
    /// handler. It carries no state.
    struct NoopBackend;

    // Intentionally empty impl: no `S3` method is overridden, so every
    // operation falls through to the trait's default, which `s3s`
    // provides and which answers `NotImplemented`.
    #[async_trait::async_trait]
    impl S3 for NoopBackend {}
7915
7916    /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the
7917    /// dispatcher spawn site must intercept a panicking inner future,
7918    /// log at ERROR, and bump the per-kind counter — instead of letting
7919    /// the panic propagate as a `JoinError` that no operator dashboard
7920    /// scrapes. We exercise the wrapper directly (rather than driving a
7921    /// full `spawn_replication_if_matched` end-to-end, which would
7922    /// require a full `S4Service` + backend) because the wrapper shape
7923    /// is the load-bearing piece — any inner-future swap would still
7924    /// route through the same `AssertUnwindSafe(...).catch_unwind()`
7925    /// closure we want to lock in here.
7926    #[tokio::test]
7927    async fn dispatcher_panic_caught_and_metric_bumped() {
7928        use futures::FutureExt as _;
7929
7930        let handle = crate::metrics::test_metrics_handle();
7931        let kind = "replication";
7932
7933        // Mirror the production wrapper shape verbatim — if the
7934        // production code ever stops using `AssertUnwindSafe.catch_unwind`
7935        // this test shouldn't keep passing on a hand-rolled copy that
7936        // diverged.
7937        let panicking = async {
7938            panic!("simulated dispatcher panic");
7939        };
7940        let result = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
7941        assert!(
7942            result.is_err(),
7943            "catch_unwind must surface the panic instead of swallowing it"
7944        );
7945        // Bump the production counter via the same helper the wrapper
7946        // calls so the rendered output gates on the production code
7947        // path, not a parallel bookkeeping copy.
7948        crate::metrics::record_dispatcher_panic(kind);
7949
7950        let rendered = handle.render();
7951        assert!(
7952            rendered.contains("s4_dispatcher_panics_total"),
7953            "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
7954        );
7955        assert!(
7956            rendered.contains("kind=\"replication\""),
7957            "expected kind=\"replication\" label in metrics output, got: {rendered}"
7958        );
7959    }
7960}
s4_server/service.rs

s4_server/
service.rs