Skip to main content

s4_server/
service.rs

//! `s3s::S3` implementation: delegates to `s3s_aws::Proxy` by default, while the
//! `put_object` / `get_object` paths call into `s4_codec::CodecRegistry`.
3//!
//! ## Coverage (Phase 1, month 2)
//!
//! - With compression hook: `put_object`, `get_object`
//! - Pure delegation (no compression): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
//!   `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
//!   `list_objects_v2`, `create_multipart_upload`, `upload_part`,
//!   `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
//!   `list_parts`
//! - Unsupported (NotImplemented by default): the remaining 80+ ops (Tagging / ACL / Lifecycle etc. are Phase 2)
//!
//! NOTE(review): this list looks stale — `S4Service` now carries optional
//! Tagging / Lifecycle / CORS / Inventory / Notification / Replication
//! managers. Confirm against the handlers and refresh.
13//!
//! ## Architecture
//!
//! - `S4Service<B>` holds the backend (B: S3), an `Arc<CodecRegistry>` and an
//!   `Arc<dyn CodecDispatcher>`. Because the `CodecRegistry` can hold several codecs, a
//!   single S4 instance can transparently GET objects that were written with different codecs.
//! - PUT: the dispatcher picks a codec from a sample of the head of the body, the registry
//!   compresses, the manifest is written into the S3 metadata, and the request is forwarded
//!   to the backend.
//! - GET: fetch from the backend → restore the manifest from the metadata → `registry.decompress`
//!   with the codec named by the manifest → return the original bytes.
23//!
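//! ## Known limitations
//!
//! - **Multipart Upload has no per-part compression yet**: `upload_part` is currently passed
//!   through untouched. Per-part compression plus manifest aggregation in
//!   `complete_multipart_upload` is planned for the second half of Phase 1, month 2.
//!   NOTE(review): this file imports the `s4_codec::multipart` frame helpers, so this
//!   item may be stale — confirm against the `upload_part` handler.
//! - **The PUT body is collected in memory**: bounded by `max_body_bytes` (default 5 GiB, the
//!   S3 single-PUT limit). Streaming-aware compression is Phase 2.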
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39    FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40    write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47    bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50    Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51    pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52    supports_streaming_decompress,
53};
54
/// Maximum number of bytes from the head of a PUT body that is passed along
/// when sampling the body.
const SAMPLE_BYTES: usize = 4096;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66    if let Some(secs) = tel.gpu_seconds {
67        crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68    }
69    if tel.oom {
70        crate::metrics::record_gpu_oom(tel.codec);
71    }
72}
73
74/// v0.7 #49: percent-encoding set covering everything that is **not** an
75/// `unreserved` character per RFC 3986 §2.3, **plus** we additionally
76/// encode the path-reserved sub-delims that `http::Uri` rejects in a
77/// path segment (`?`, `#`, `%`, control bytes, space, etc.). We
78/// deliberately keep `/` un-encoded because S3 keys legally use `/` as
79/// a logical separator and the rest of the synthetic URI relies on the
80/// path layout `/{bucket}/{key}` round-tripping byte-for-byte.
81const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
82    .add(b' ')
83    .add(b'"')
84    .add(b'#')
85    .add(b'<')
86    .add(b'>')
87    .add(b'?')
88    .add(b'`')
89    .add(b'{')
90    .add(b'}')
91    .add(b'|')
92    .add(b'\\')
93    .add(b'^')
94    .add(b'[')
95    .add(b']')
96    .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110    use percent_encoding::utf8_percent_encode;
111    let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112    let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113    let raw = format!("/{bucket_enc}/{key_enc}");
114    raw.parse::<http::Uri>().map_err(|e| {
115        // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116        // can't be represented in a request URI. The generated
117        // `S3ErrorCode` enum doesn't expose a typed variant for it,
118        // so we round-trip through `from_bytes` which preserves the
119        // canonical wire string while falling back to InvalidArgument
120        // if even that lookup fails (cannot happen at runtime — kept
121        // as a belt-and-suspenders branch so this helper never
122        // panics).
123        let code =
124            S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125        S3Error::with_message(
126            code,
127            format!("object key cannot be encoded as a request URI: {e}"),
128        )
129    })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150    body: &[u8],
151    content_md5_b64: Option<&str>,
152    checksum_crc32_b64: Option<&str>,
153    checksum_crc32c_b64: Option<&str>,
154    checksum_sha1_b64: Option<&str>,
155    checksum_sha256_b64: Option<&str>,
156    checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158    use base64::Engine as _;
159    use md5::Md5;
160    use sha2::Sha256;
161    // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162    // trait methods into scope. Bind anonymously so this `use` is
163    // never flagged as unused while still serving its real purpose.
164    use md5::Digest as _;
165    let b64 = base64::engine::general_purpose::STANDARD;
166    let bad = |what: &str| {
167        let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168        S3Error::with_message(
169            code,
170            format!("client-supplied {what} did not match the received body"),
171        )
172    };
173    if let Some(claimed) = content_md5_b64 {
174        let want = b64.decode(claimed).map_err(|_| {
175            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176        })?;
177        if want.len() != 16 {
178            return Err(S3Error::with_message(
179                S3ErrorCode::InvalidDigest,
180                "Content-MD5 must decode to 16 bytes",
181            ));
182        }
183        let mut h = Md5::new();
184        h.update(body);
185        let got = h.finalize();
186        // `subtle::ConstantTimeEq` would be ideal but the existing
187        // `constant_time_eq` helper in sse.rs is private; use a
188        // straightforward byte compare. The attacker doesn't get to
189        // choose the body retroactively, so a timing oracle here
190        // doesn't help them. `&got[..]` derefs the GenericArray
191        // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192        // generic-array 1.x; CI runs `-D warnings`).
193        if got[..] != *want.as_slice() {
194            return Err(bad("Content-MD5"));
195        }
196    }
197    if let Some(claimed) = checksum_crc32c_b64 {
198        let want = b64.decode(claimed).map_err(|_| {
199            S3Error::with_message(
200                S3ErrorCode::InvalidDigest,
201                "malformed x-amz-checksum-crc32c",
202            )
203        })?;
204        if want.len() != 4 {
205            return Err(S3Error::with_message(
206                S3ErrorCode::InvalidDigest,
207                "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208            ));
209        }
210        let got = crc32c::crc32c(body).to_be_bytes();
211        if got != want.as_slice() {
212            return Err(bad("x-amz-checksum-crc32c"));
213        }
214    }
215    if let Some(claimed) = checksum_sha256_b64 {
216        let want = b64.decode(claimed).map_err(|_| {
217            S3Error::with_message(
218                S3ErrorCode::InvalidDigest,
219                "malformed x-amz-checksum-sha256",
220            )
221        })?;
222        if want.len() != 32 {
223            return Err(S3Error::with_message(
224                S3ErrorCode::InvalidDigest,
225                "x-amz-checksum-sha256 must decode to 32 bytes",
226            ));
227        }
228        let mut h = Sha256::new();
229        h.update(body);
230        let got = h.finalize();
231        if got[..] != *want.as_slice() {
232            return Err(bad("x-amz-checksum-sha256"));
233        }
234    }
235    // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236    // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237    // big-endian value, base64-encoded.
238    if let Some(claimed) = checksum_crc32_b64 {
239        let want = b64.decode(claimed).map_err(|_| {
240            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241        })?;
242        if want.len() != 4 {
243            return Err(S3Error::with_message(
244                S3ErrorCode::InvalidDigest,
245                "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246            ));
247        }
248        let mut h = crc32fast::Hasher::new();
249        h.update(body);
250        let got = h.finalize().to_be_bytes();
251        if got != want.as_slice() {
252            return Err(bad("x-amz-checksum-crc32"));
253        }
254    }
255    // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256    if let Some(claimed) = checksum_sha1_b64 {
257        use sha1::Sha1;
258        let want = b64.decode(claimed).map_err(|_| {
259            S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260        })?;
261        if want.len() != 20 {
262            return Err(S3Error::with_message(
263                S3ErrorCode::InvalidDigest,
264                "x-amz-checksum-sha1 must decode to 20 bytes",
265            ));
266        }
267        let mut h = Sha1::new();
268        h.update(body);
269        let got = h.finalize();
270        if got[..] != *want.as_slice() {
271            return Err(bad("x-amz-checksum-sha1"));
272        }
273    }
274    // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275    // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276    // 0xffffffffffffffff, refin / refout true. The reflected
277    // polynomial + 256-entry lookup table are computed lazily on
278    // first call (small enough to inline rather than pull in a
279    // dedicated crc64 crate).
280    if let Some(claimed) = checksum_crc64nvme_b64 {
281        let want = b64.decode(claimed).map_err(|_| {
282            S3Error::with_message(
283                S3ErrorCode::InvalidDigest,
284                "malformed x-amz-checksum-crc64nvme",
285            )
286        })?;
287        if want.len() != 8 {
288            return Err(S3Error::with_message(
289                S3ErrorCode::InvalidDigest,
290                "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291            ));
292        }
293        let got = crc64_nvme(body).to_be_bytes();
294        if got != want.as_slice() {
295            return Err(bad("x-amz-checksum-crc64nvme"));
296        }
297    }
298    Ok(())
299}
300
/// v0.8.12 #128 (MED-C): CRC-64/NVME, the algorithm behind AWS S3's
/// `x-amz-checksum-crc64nvme`.
///
/// Parameters (NVMe spec): polynomial `0xad93d23594c93659`, initial value
/// and final XOR both `0xffffffffffffffff`, input and output reflected.
/// The computation is table-driven, one byte at a time; the 256-entry
/// table for the reflected polynomial is built on first use and cached in
/// a [`std::sync::OnceLock`], so later calls share it.
///
/// Returns the checksum of `bytes`; the empty input yields `0`.
fn crc64_nvme(bytes: &[u8]) -> u64 {
    use std::sync::OnceLock;

    /// Bit-reversal of the NVMe polynomial `0xad93d23594c93659`.
    const REFLECTED_POLY: u64 = 0x9a6c_9329_ac4b_c9b5;
    static LOOKUP: OnceLock<[u64; 256]> = OnceLock::new();

    let lookup = LOOKUP.get_or_init(|| {
        let mut table = [0u64; 256];
        for (byte, slot) in table.iter_mut().enumerate() {
            // Eight shift/conditional-XOR rounds per table entry.
            *slot = (0..8).fold(byte as u64, |acc, _| {
                let shifted = acc >> 1;
                if acc & 1 == 1 {
                    shifted ^ REFLECTED_POLY
                } else {
                    shifted
                }
            });
        }
        table
    });

    // Start from all-ones (init) and invert at the end (xorout).
    let state = bytes.iter().fold(u64::MAX, |acc, &byte| {
        let slot = usize::from(acc as u8 ^ byte);
        (acc >> 8) ^ lookup[slot]
    });
    !state
}
337
/// v0.4 #20: request details captured at the start of a handler, before
/// the request is consumed by the backend call, so the matching
/// `record_access` at end-of-request can fill in the structured access
/// log entry.
///
/// NOTE(review): the field notes below are inferred from the field names;
/// confirm against the code that populates this struct.
struct AccessLogPreamble {
    /// Presumably the client address, when one is known.
    remote_ip: Option<String>,
    /// Presumably the identity of the caller, when one is known.
    requester: Option<String>,
    /// Presumably the URI of the incoming request.
    request_uri: String,
    /// Presumably the `User-Agent` header value, when present.
    user_agent: Option<String>,
}
347
/// The S4 gateway service: wraps an S3 backend `B` together with the codec
/// registry / dispatcher and a set of optional feature managers.
///
/// Every `Option<...>` manager below is `None` after [`S4Service::new`]
/// and is opted into separately, so a freshly constructed service only
/// carries the backend, the codecs and the always-on multipart state
/// store.
pub struct S4Service<B: S3> {
    /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
    /// dispatcher can clone it into a detached `tokio::spawn` task
    /// (Arc::clone is cheap; backend trait methods take `&self` so no
    /// other handler is affected by the indirection).
    backend: Arc<B>,
    /// Shared codec registry. Per the module docs it performs the
    /// compress / decompress step and may hold several codecs at once.
    registry: Arc<CodecRegistry>,
    /// Shared codec dispatcher. Per the module docs it picks the codec
    /// for a PUT from a sample of the head of the body.
    dispatcher: Arc<dyn CodecDispatcher>,
    /// Upper bound on a PUT body (the module docs note the body is
    /// collected in memory). Initialised to
    /// [`S4Service::DEFAULT_MAX_BODY_BYTES`].
    max_body_bytes: usize,
    /// Optional shared policy handle; `None` by default.
    /// NOTE(review): presumably the policy evaluated per request by the
    /// handlers — confirm against the call sites.
    policy: Option<crate::policy::SharedPolicy>,
    /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
    /// to `true` when the listener is wrapped in TLS (or ACME), so policies
    /// gating "deny if not over TLS" can do their job. Defaults to `false`
    /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
    secure_transport: bool,
    /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
    rate_limits: Option<crate::rate_limit::SharedRateLimits>,
    /// v0.4 #20: optional S3-style access log emitter.
    access_log: Option<crate::access_log::SharedAccessLog>,
    /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
    /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
    /// (with the keyring's active key id) after the compress + framing
    /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
    /// frame parsing. A `with_sse_key(...)` call wraps the supplied
    /// key in a 1-slot keyring so single-key (v0.4) operators get the
    /// same behaviour they had before, just on the v2 frame.
    sse_keyring: Option<crate::sse::SharedSseKeyring>,
    /// v0.5 #34: optional first-class versioning state machine. When
    /// `Some(...)`, S4-server itself owns the per-bucket versioning
    /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
    /// list_object_versions / get_bucket_versioning /
    /// put_bucket_versioning handlers consult the manager instead of
    /// passing through. When `None` (default), the legacy
    /// backend-passthrough behaviour applies so existing v0.4
    /// deployments are unaffected until they explicitly call
    /// `with_versioning(...)`.
    versioning: Option<Arc<crate::versioning::VersioningManager>>,
    /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
    /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
    /// generate a fresh DEK via the backend, encrypt the body with it
    /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
    /// S4E4 unwrap the DEK through the same backend before decrypt.
    /// `kms_default_key_id` is used when the request omits an explicit
    /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
    /// bucket-default behaviour).
    kms: Option<Arc<dyn crate::kms::KmsBackend>>,
    /// Default KMS key id used with `kms` when a request does not name
    /// one explicitly; `None` by default.
    kms_default_key_id: Option<String>,
    /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
    /// `Some(...)`, `delete_object` and overwrite-style `put_object`
    /// consult the manager and refuse the operation with HTTP 403
    /// `AccessDenied` while the object is locked (Compliance until
    /// expiry, Governance unless the bypass header is set, or any time
    /// a legal hold is on). PUT also auto-applies the bucket-default
    /// retention to brand-new objects when configured. When `None`
    /// (default), the legacy backend-passthrough behaviour applies, so
    /// existing v0.4 deployments are unaffected until they explicitly
    /// call `with_object_lock(...)`.
    object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
    /// v0.6 #38: optional first-class CORS bucket configuration manager.
    /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
    /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
    /// consult the manager instead of passing through to the backend.
    /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
    /// style preflight matching through the same store; the actual HTTP
    /// OPTIONS routing wire-up at the listener level is a follow-up
    /// (s3s framework does not surface OPTIONS as a typed handler).
    cors: Option<Arc<crate::cors::CorsManager>>,
    /// v0.6 #36: optional first-class S3 Inventory manager. When
    /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
    /// configurations and `put_bucket_inventory_configuration` /
    /// `get_bucket_inventory_configuration` /
    /// `list_bucket_inventory_configurations` /
    /// `delete_bucket_inventory_configuration` consult the manager
    /// instead of passing through to the backend. The actual periodic
    /// CSV emission is driven by a tokio task in `main.rs` that calls
    /// `InventoryManager::run_once_for_test` on a fixed cadence; the
    /// service handlers below only deal with config-level CRUD.
    inventory: Option<Arc<crate::inventory::InventoryManager>>,
    /// v0.6 #35: optional first-class S3 bucket-notification manager.
    /// When `Some(...)`, S4-server itself owns per-bucket notification
    /// configurations and `put_bucket_notification_configuration` /
    /// `get_bucket_notification_configuration` consult the manager
    /// instead of passing through to the backend. Successful PUT /
    /// DELETE handlers fire matching destinations on a detached tokio
    /// task (best-effort; see `crate::notifications::dispatch_event`).
    notifications: Option<Arc<crate::notifications::NotificationManager>>,
    /// v0.6 #37: optional first-class S3 Lifecycle configuration
    /// manager. When `Some(...)`, S4-server itself owns per-bucket
    /// lifecycle rules and `put_bucket_lifecycle_configuration` /
    /// `get_bucket_lifecycle_configuration` /
    /// `delete_bucket_lifecycle` consult the manager instead of
    /// passing through to the backend. The actual background scanner
    /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
    /// rule) is a v0.7+ follow-up; the test path
    /// `S4Service::run_lifecycle_once_for_test` exercises the
    /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
    /// the configuration-management half without putting a
    /// half-wired bucket-walk in front of users.
    lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
    /// v0.6 #39: optional first-class object + bucket Tagging manager.
    /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
    /// per-bucket tag state — `PutObjectTagging` /
    /// `GetObjectTagging` / `DeleteObjectTagging` /
    /// `PutBucketTagging` / `GetBucketTagging` /
    /// `DeleteBucketTagging` route through the manager (replacing the
    /// previous backend-passthrough behaviour). `put_object` also
    /// pre-parses the `x-amz-tagging` header / `Tagging` input field
    /// so the IAM policy evaluator can gate on
    /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
    /// On a successful PUT the parsed tags are persisted; on a
    /// successful DELETE the matching tag entry is dropped.
    tagging: Option<Arc<crate::tagging::TagManager>>,
    /// v0.6 #40: optional first-class cross-bucket replication manager.
    /// When `Some(...)`, S4-server itself owns per-bucket replication
    /// rules; `PutBucketReplication` / `GetBucketReplication` /
    /// `DeleteBucketReplication` route through the manager (replacing
    /// the previous backend-passthrough behaviour). On every successful
    /// `put_object` the manager's rule list is consulted; the
    /// highest-priority matching enabled rule wins, the per-key status
    /// is recorded as `Pending`, and the source body and metadata are
    /// handed to a detached tokio task that PUTs to the destination
    /// bucket through the same backend. The replica is stamped with
    /// `x-amz-replication-status: REPLICA` in its metadata; the
    /// source-side status is updated to `Completed` on success or
    /// `Failed` after the 3-attempt retry budget is exhausted (drop
    /// counter bumps in either-side case so dashboards see the loss).
    /// `head_object` / `get_object` echo the recorded status back as
    /// `x-amz-replication-status` so consumers can poll progress.
    /// Limited to single-instance (same `S4Service`) replication; true
    /// cross-region (multi-instance) is a v0.7+ follow-up.
    replication: Option<Arc<crate::replication::ReplicationManager>>,
    /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
    /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
    /// request against a bucket whose MFA-Delete state is `Enabled`
    /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
    /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
    /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
    /// deployments are unaffected until they explicitly call
    /// `with_mfa_delete(...)`.
    mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
    /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
    /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
    /// or be matched against a configured server-managed keyring/KMS).
    /// Set by `--compliance-mode strict` after the boot-time
    /// prerequisite check passes.
    compliance_strict: bool,
    /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
    /// gate. When `Some(...)`, the listener-side middleware (see
    /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
    /// request and short-circuits SigV4a-signed ones — verifying the
    /// signature against the credential store and returning 403
    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
    /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
    /// `None`, the middleware is a no-op so the existing SigV4 path is
    /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
    sigv4a_gate: Option<Arc<SigV4aGate>>,
    /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
    /// SSE / Tagging / Object-Lock context captured at
    /// `CreateMultipartUpload` time through to `UploadPart` /
    /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
    /// store is gateway-internal and idle when no multipart is in
    /// flight. See [`crate::multipart_state`] for rationale.
    multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
    /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
    /// path. `0` (default) → use the legacy buffered S4E2 path
    /// (whole-body AES-GCM tag, GET buffers + verifies before
    /// emitting). Non-zero → use the chunked S4E5 frame so GET can
    /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
    /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
    /// (chunked variants tracked in a follow-up issue).
    sse_chunk_size: usize,
    /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
    /// replication dispatcher in [`Self::spawn_replication_if_matched`].
    /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
    /// rules × slow destination = O(10k) in-flight tokio tasks) could
    /// exhaust process memory before the destination drains. Each
    /// dispatcher spawn `acquire_owned`s one permit and holds it for the
    /// lifetime of the destination PUT + status stamp; once the cap is
    /// reached the dispatcher async-blocks on `acquire_owned()` so the
    /// listener path itself never stalls — only the in-flight replica
    /// queue depth is bounded. Default 1024 (operator-tunable via
    /// `--replication-max-concurrent`).
    replication_semaphore: Arc<tokio::sync::Semaphore>,
    /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
    /// `aws:SourceIp` Condition key only when the operator has
    /// explicitly opted in via `--trust-x-forwarded-for`. Default
    /// (`false`) makes the policy evaluator see `source_ip = None`
    /// for incoming requests, so a public-internet client can no
    /// longer spoof an internal CIDR by setting `X-Forwarded-For`
    /// themselves. Operators behind a trusted reverse proxy that
    /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
    /// listening directly on the public internet leave it off and
    /// gain a clear fail-closed default. A future release plumbs
    /// the TCP peer address through the s3s service trait so we can
    /// validate the forwarded header against a `--trusted-proxies`
    /// CIDR list; until then the boolean opt-in closes the immediate
    /// auth-bypass surface.
    trust_x_forwarded_for: bool,
}
547
548impl<B: S3> S4Service<B> {
    /// API limit of a single (non-multipart) AWS S3 PUT: 5 GiB
    /// (5 × 1024³ = 5 368 709 120 bytes).
    pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;

    /// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
    /// replication dispatcher tasks. See the `replication_semaphore`
    /// field doc for the rationale + override path.
    pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
556
557    pub fn new(
558        backend: B,
559        registry: Arc<CodecRegistry>,
560        dispatcher: Arc<dyn CodecDispatcher>,
561    ) -> Self {
562        Self {
563            backend: Arc::new(backend),
564            registry,
565            dispatcher,
566            max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
567            policy: None,
568            secure_transport: false,
569            rate_limits: None,
570            access_log: None,
571            sse_keyring: None,
572            versioning: None,
573            kms: None,
574            kms_default_key_id: None,
575            object_lock: None,
576            cors: None,
577            inventory: None,
578            notifications: None,
579            lifecycle: None,
580            tagging: None,
581            replication: None,
582            mfa_delete: None,
583            compliance_strict: false,
584            sigv4a_gate: None,
585            multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
586            // v0.8 #52: chunked SSE-S4 disabled by default — opt
587            // in via `S4Service::with_sse_chunk_size(...)` /
588            // `--sse-chunk-size <BYTES>`. Default keeps the legacy
589            // S4E2 buffered path so existing deployments are
590            // bit-for-bit unchanged.
591            sse_chunk_size: 0,
592            // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
593            // replication tasks. Picked to be (a) ample headroom over a
594            // typical steady-state replication rate (the v0.8.3 #66
595            // status-sweep doc cites 1k keys/hour as a "steady" rate, so
596            // even a 100x burst lands well under 1024), (b) small enough
597            // that the worst-case memory pinned by stalled dispatchers
598            // — body bytes + metadata — stays bounded (1024 × 5 MiB
599            // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
600            // wider cross-region fan-out can override via
601            // `--replication-max-concurrent`.
602            replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
603                Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
604            )),
605            // v0.8.11 CRIT-4: default fail-closed — ignore client-
606            // supplied `X-Forwarded-For` until the operator opts in
607            // through `with_trust_x_forwarded_for(true)`.
608            trust_x_forwarded_for: false,
609        }
610    }
611
    /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
    /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
    /// when the gateway sits behind a trusted reverse proxy that
    /// strips (or rewrites) any client-supplied value. When left
    /// off (default), the policy evaluator sees `source_ip = None`
    /// regardless of what the client sends — closing the
    /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
    /// bypass.
    ///
    /// Builder-style: takes `self` by value, stores `on`, and returns
    /// the updated service.
    #[must_use]
    pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
        self.trust_x_forwarded_for = on;
        self
    }
625
    /// v0.7 #47: attach the SigV4a verify gate. Once set, the
    /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
    /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
    /// verifying it against the supplied credential store and
    /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
    /// are unaffected. When the gate is unset (default), the
    /// middleware skips entirely so existing SigV4 deployments keep
    /// working.
    ///
    /// Builder-style: takes `self` by value, stores `gate` (replacing
    /// any previously attached gate), and returns the updated service.
    #[must_use]
    pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
        self.sigv4a_gate = Some(gate);
        self
    }
639
640    /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
641    /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
642    /// consumes the `S4Service` (the listener-side middleware needs
643    /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
644    /// algorithm tokens with "unknown algorithm" — match has to
645    /// happen at the hyper layer instead).
646    #[must_use]
647    pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
648        self.sigv4a_gate.as_ref()
649    }
650
651    /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
652    /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
653    /// the `S4Service`. The background `sweep_stale` task in `main.rs`
654    /// holds this `Arc` and ticks once an hour to drop abandoned
655    /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
656    #[must_use]
657    pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
658        &self.multipart_state
659    }
660
661    /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
662    /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
663    /// through the manager (instead of forwarding to the backend),
664    /// and `put_object`'s `x-amz-tagging` parse path becomes the
665    /// source of `s3:RequestObjectTag/<key>` for the IAM policy
666    /// evaluator. The manager itself is shared via `Arc`.
667    #[must_use]
668    pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
669        self.tagging = Some(mgr);
670        self
671    }
672
673    /// v0.6 #39: borrow the attached tagging manager (test /
674    /// introspection — the snapshotter in `main.rs`, when wired,
675    /// will keep its own `Arc` clone).
676    #[must_use]
677    pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
678        self.tagging.as_ref()
679    }
680
681    /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
682    /// `put_bucket_inventory_configuration` /
683    /// `get_bucket_inventory_configuration` /
684    /// `list_bucket_inventory_configurations` /
685    /// `delete_bucket_inventory_configuration` route through the
686    /// manager. The actual periodic CSV / manifest emission is
687    /// orchestrated by a tokio task started in `main.rs`; the manager
688    /// itself is shared between the handler and the scheduler via
689    /// `Arc`.
690    #[must_use]
691    pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
692        self.inventory = Some(mgr);
693        self
694    }
695
696    /// v0.6 #36: borrow the attached inventory manager (test /
697    /// introspection — the background scheduler in `main.rs` keeps its
698    /// own `Arc` clone, so this accessor is for the test path that
699    /// invokes `run_once_for_test` directly).
700    #[must_use]
701    pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
702        self.inventory.as_ref()
703    }
704
705    /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
706    /// manager. Once set, `put_bucket_lifecycle_configuration` /
707    /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
708    /// route through the manager (replacing the previous backend-
709    /// passthrough behaviour). The actual periodic scanner that walks
710    /// the source bucket and invokes Expiration / Transition /
711    /// NoncurrentExpiration actions is a v0.7+ follow-up — see
712    /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
713    /// path that exercises the evaluator end-to-end.
714    #[must_use]
715    pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
716        self.lifecycle = Some(mgr);
717        self
718    }
719
720    /// v0.6 #37: borrow the attached lifecycle manager (test /
721    /// introspection — the background scheduler in `main.rs` keeps its
722    /// own `Arc` clone, so this accessor is for the test path that
723    /// invokes the evaluator directly).
724    #[must_use]
725    pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
726        self.lifecycle.as_ref()
727    }
728
729    /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
730    /// against a caller-provided list of `(key, age, size, tags)` tuples
731    /// and returns the `(key, action)` pairs that should fire. The actual
732    /// backend invocation (S3.delete_object / metadata rewrite) is left
733    /// to the caller — the unit + E2E tests use this to verify the
734    /// evaluator without spawning the (deferred) background scanner.
735    /// Returns an empty `Vec` when no lifecycle manager is attached or
736    /// no rule matches.
737    #[must_use]
738    pub fn run_lifecycle_once_for_test(
739        &self,
740        bucket: &str,
741        objects: &[crate::lifecycle::EvaluateBatchEntry],
742    ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
743        let Some(mgr) = self.lifecycle.as_ref() else {
744            return Vec::new();
745        };
746        crate::lifecycle::evaluate_batch(mgr, bucket, objects)
747    }
748
749    /// v0.6 #35: attach the in-memory bucket-notification manager. Once
750    /// set, `put_bucket_notification_configuration` /
751    /// `get_bucket_notification_configuration` route through the manager
752    /// (replacing the previous backend-passthrough behaviour); successful
753    /// `put_object` / `delete_object` calls fire matching destinations
754    /// on a detached tokio task via
755    /// `crate::notifications::dispatch_event` (best-effort, fire-and-
756    /// forget — failures bump the manager's `dropped_total` counter and
757    /// log at warn but do NOT fail the originating S3 request).
758    #[must_use]
759    pub fn with_notifications(
760        mut self,
761        mgr: Arc<crate::notifications::NotificationManager>,
762    ) -> Self {
763        self.notifications = Some(mgr);
764        self
765    }
766
767    /// v0.6 #35: borrow the attached notifications manager (test /
768    /// introspection — used by the metrics layer to read
769    /// `dropped_total`).
770    #[must_use]
771    pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
772        self.notifications.as_ref()
773    }
774
775    /// v0.6 #35: internal helper used by the DELETE handlers to fire a
776    /// matching notification on a detached tokio task. No-op when no
777    /// manager is attached or no rule on the bucket matches the given
778    /// (event, key) tuple.
779    fn fire_delete_notification(
780        &self,
781        bucket: &str,
782        key: &str,
783        event: crate::notifications::EventType,
784        version_id: Option<String>,
785    ) {
786        let Some(mgr) = self.notifications.as_ref() else {
787            return;
788        };
789        let dests = mgr.match_destinations(bucket, &event, key);
790        if dests.is_empty() {
791            return;
792        }
793        tokio::spawn(crate::notifications::dispatch_event(
794            Arc::clone(mgr),
795            bucket.to_owned(),
796            key.to_owned(),
797            event,
798            None,
799            None,
800            version_id,
801            format!("S4-{}", uuid::Uuid::new_v4()),
802        ));
803    }
804
805    /// v0.6 #40: attach the in-memory cross-bucket replication manager.
806    /// Once set, `put_bucket_replication` / `get_bucket_replication` /
807    /// `delete_bucket_replication` route through the manager (replacing
808    /// the previous backend-passthrough behaviour); a successful
809    /// `put_object` whose key matches an enabled rule fires a detached
810    /// tokio task that PUTs the same body + metadata to the rule's
811    /// destination bucket, stamping the replica with
812    /// `x-amz-replication-status: REPLICA`. Failures after the retry
813    /// budget bump the manager's `dropped_total` counter and are
814    /// surfaced in the `s4_replication_dropped_total` Prometheus
815    /// counter; successes bump `s4_replication_replicated_total`.
816    #[must_use]
817    pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
818        self.replication = Some(mgr);
819        self
820    }
821
822    /// v0.6 #40: borrow the attached replication manager (test /
823    /// introspection — used by the metrics layer to read
824    /// `dropped_total`).
825    #[must_use]
826    pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
827        self.replication.as_ref()
828    }
829
    /// v0.6 #40: internal helper used by the PUT handlers to fire a
    /// detached cross-bucket replication task. No-op when no manager
    /// is attached, the source backend PUT failed, or no rule on the
    /// source bucket matches the (key, tags) tuple. The `body` is the
    /// post-compression / post-encryption `Bytes` that was sent to
    /// the source backend (refcount-cloned), and `metadata` is the
    /// metadata map that already includes the manifest /
    /// `s4-encrypted` markers — the replica decodes through the same
    /// path. The destination PUT runs through `Arc<B>::put_object`.
    ///
    /// ## Parameters
    ///
    /// - `source_bucket` / `source_key`: the logical source object the
    ///   PUT targeted; used for rule matching, status tracking and the
    ///   Object Lock lookup.
    /// - `request_tags`: tags carried by this PUT, used only for the
    ///   rule-match decision (`None` is treated as "no tags").
    /// - `body` / `metadata`: the post-codec payload and metadata map
    ///   forwarded to the destination PUT.
    /// - `backend_ok`: whether the source backend PUT succeeded; when
    ///   `false` the function returns immediately.
    /// - `pending_version`: the source-side version outcome, see the
    ///   next section.
    ///
    /// The function itself never awaits: all destination I/O happens
    /// inside the task handed to `tokio::spawn`.
    ///
    /// ## v0.8.2 #61: generation token + shadow-key destination
    ///
    /// `pending_version` is the source-side `PutOutcome` minted by the
    /// caller's versioning branch (or `None` for unversioned /
    /// suspended buckets). When `pending_version.versioned_response`
    /// is `true`, the dispatcher writes the destination under the same
    /// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
    /// destination's version chain receives the new version the same
    /// way `?versionId=` GET resolves it. Closes audit C-1.
    ///
    /// The dispatcher also mints a fresh `generation` token before
    /// spawning, threaded through to
    /// [`crate::replication::replicate_object`]. Closes audit C-3 — a
    /// stale retry of an older PUT can no longer overwrite the
    /// destination's newer bytes because the CAS guard sees the higher
    /// stored generation and drops its destination write.
    ///
    /// ## Asymmetric versioning policy (out of scope)
    ///
    /// We assume source + destination buckets share the same
    /// versioning policy (both Enabled or both Suspended /
    /// Unversioned). Cross-bucket policy queries would require a
    /// backend round-trip per replication, which is not worth it for
    /// the single-instance scope. Operators who configure asymmetric
    /// versioning will see destination-side `?versionId=` lookups
    /// miss — documented as out-of-scope until a future per-rule
    /// `destination_versioning_policy` knob lands.
    // 8 args is the post-#61 shape: replication needs the
    // source bucket+key, the canonical tag set for rule-matching,
    // the post-codec body+metadata for the destination PUT, the
    // backend-success gate, and the pending version-id for the
    // shadow-key destination override. A shape struct would just
    // split the (single) call site so opt for the inline form.
    #[allow(clippy::too_many_arguments)]
    fn spawn_replication_if_matched(
        &self,
        source_bucket: &str,
        source_key: &str,
        request_tags: &Option<crate::tagging::TagSet>,
        body: &bytes::Bytes,
        metadata: &Option<std::collections::HashMap<String, String>>,
        backend_ok: bool,
        pending_version: Option<&crate::versioning::PutOutcome>,
    ) where
        B: Send + Sync + 'static,
    {
        // Gate 1: never replicate an object the source backend did not
        // accept.
        if !backend_ok {
            return;
        }
        // Gate 2: replication is an opt-in feature; without a manager
        // there are no rules to match.
        let Some(mgr) = self.replication.as_ref() else {
            return;
        };
        // Pull the request's tags into the (k, v) shape the matcher
        // expects. The tagging manager would have the canonical
        // post-PUT view but at this point in the pipeline it's
        // already been written above; for the rule-match decision
        // the request's tags are sufficient (= the tags this PUT
        // applies, S3 PutObject is full-replace on tags).
        let object_tags: Vec<(String, String)> = request_tags
            .as_ref()
            .map(|ts| ts.iter().cloned().collect())
            .unwrap_or_default();
        // Gate 3: only keys selected by a rule on the source bucket
        // are replicated.
        let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
            return;
        };
        // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
        // Pending stamp so the stamp itself carries the right
        // generation (the CAS in `record_status_if_newer` would
        // otherwise see a `generation=0` Pending and accept any
        // stale retry).
        let generation = mgr.next_generation();
        // Eagerly mark the source key as Pending so a HEAD between
        // the source PUT returning and the spawned task completing
        // surfaces the in-flight state. CAS-guarded so a slower
        // older PUT can't downgrade a newer Completed back to Pending.
        // The return value is deliberately discarded: the replication
        // task is spawned regardless of whether the stamp was applied.
        let _ = mgr.record_status_if_newer(
            source_bucket,
            source_key,
            generation,
            crate::replication::ReplicationStatus::Pending,
        );
        // v0.8.2 #61: derive the destination storage key. For a
        // versioning-Enabled source the destination receives the
        // same shadow-key path so a `?versionId=<vid>` GET on the
        // destination resolves through the same lookup the source
        // uses. Suspended / Unversioned sources keep the logical
        // key (= `None` override = dispatcher uses `source_key`).
        let destination_key_override = pending_version
            .filter(|pv| pv.versioned_response)
            .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
        // v0.8.3 #68 (audit M-1): capture the source object's Object
        // Lock state so the dispatcher can decorate the destination
        // PUT with the matching AWS-wire lock headers. Without this,
        // a Compliance / Governance / legal-hold protected source
        // would replicate to a destination where DELETE succeeds
        // (the WORM posture would only hold on the source).
        // The lookup is keyed on the logical `source_key`, not on the
        // shadow key computed above.
        let source_lock_state = self
            .object_lock
            .as_ref()
            .and_then(|mgr| mgr.get(source_bucket, source_key));
        // v0.8.3 #68: hand the destination-side ObjectLockManager to
        // the dispatcher closure so we can persist the propagated
        // lock state on successful destination PUT (the destination
        // PUT below bypasses S4Service::put_object — we drive the
        // backend directly — so the explicit_lock_mode commit block
        // in put_object never fires for replicas. We replay it here
        // against the destination key.)
        let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
        // Everything the spawned task touches must be owned, so take
        // clones / owned copies of the borrowed inputs here. The lock
        // state is cloned once more because it is consumed twice: by
        // the `do_put` closure and by `replicate_object` itself.
        let mgr_cl = Arc::clone(mgr);
        let backend = Arc::clone(&self.backend);
        let body_cl = body.clone();
        let metadata_cl = metadata.clone();
        let source_bucket_cl = source_bucket.to_owned();
        let source_key_cl = source_key.to_owned();
        let source_lock_state_for_closure = source_lock_state.clone();
        let source_bucket_for_warn = source_bucket.to_owned();
        // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
        // depth. Acquire happens INSIDE the spawned task (not on the
        // listener path) so a saturated semaphore back-pressures the
        // dispatcher pool without stalling the source PUT response —
        // the source has already returned 200 to the client by the time
        // the spawn body runs. A failed `acquire_owned` only happens
        // when the semaphore is closed (we never close it, so the
        // logged-and-skipped fallback is unreachable in practice).
        let semaphore = Arc::clone(&self.replication_semaphore);
        tokio::spawn(async move {
            // `_permit` stays bound until the end of the task, so it
            // is held for the whole replication attempt.
            let _permit = match semaphore.acquire_owned().await {
                Ok(p) => p,
                Err(e) => {
                    // NOTE(review): this path only logs and returns;
                    // the Pending stamp recorded before the spawn is
                    // not revised here.
                    tracing::warn!(
                        bucket = %source_bucket_cl,
                        key = %source_key_cl,
                        "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
                    );
                    return;
                }
            };
            // Destination-PUT callback handed to `replicate_object`.
            // It re-clones its captures on every call and returns a
            // fresh future, so it can be invoked more than once.
            let do_put = move |dest_bucket: String,
                               dest_key: String,
                               dest_body: bytes::Bytes,
                               dest_meta: Option<std::collections::HashMap<String, String>>| {
                let backend = Arc::clone(&backend);
                let dest_lock_mgr = dest_lock_mgr.clone();
                let lock_state = source_lock_state_for_closure.clone();
                let warn_src = source_bucket_for_warn.clone();
                async move {
                    // Synthetic request handed straight to the backend:
                    // empty headers, no credentials, and a constant
                    // `/` URI (so the `unwrap` only guards a parse of
                    // fixed input).
                    let req = S3Request {
                        input: PutObjectInput {
                            bucket: dest_bucket.clone(),
                            key: dest_key.clone(),
                            body: Some(bytes_to_blob(dest_body)),
                            metadata: dest_meta,
                            ..Default::default()
                        },
                        method: http::Method::PUT,
                        uri: "/".parse().unwrap(),
                        headers: http::HeaderMap::new(),
                        extensions: http::Extensions::new(),
                        credentials: None,
                        region: None,
                        service: None,
                        trailing_headers: None,
                    };
                    // Collapse the backend response to `Result<(), String>`:
                    // the success payload is dropped and the error is
                    // rendered with a `destination put_object: ` prefix.
                    let put_result = backend
                        .put_object(req)
                        .await
                        .map(|_| ())
                        .map_err(|e| format!("destination put_object: {e}"));
                    // v0.8.3 #68: on successful destination PUT,
                    // persist the propagated lock state into the
                    // destination's ObjectLockManager so a subsequent
                    // DELETE on the destination is refused. Three cases:
                    //   - PUT failed     → skip (no replica to protect)
                    //   - lock_state None → nothing to propagate
                    //   - dest manager None (operator misconfig)
                    //                     → log warn-once + bump skip metric
                    if put_result.is_ok()
                        && let Some(state) = lock_state
                    {
                        match dest_lock_mgr {
                            Some(ref mgr) => {
                                mgr.set(&dest_bucket, &dest_key, state);
                            }
                            None => {
                                crate::replication::warn_lock_propagation_skipped(
                                    &warn_src,
                                    &dest_bucket,
                                );
                            }
                        }
                    }
                    // The PUT outcome is returned unchanged; the lock
                    // bookkeeping above never turns a success into an
                    // error.
                    put_result
                }
            };
            // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
            // `futures::FutureExt::catch_unwind` so a panic inside
            // `replicate_object` (or any of the user-supplied closures
            // it drives — `do_put`, the destination backend, the lock
            // manager) does NOT bubble out of the detached task as a
            // `JoinError` that no operator dashboard scrapes. Caught
            // panics bump `s4_dispatcher_panics_total{kind="replication"}`
            // + log at ERROR with the panic payload, so silent feature
            // degradation (= every replication PUT panicking and
            // dropping the replica without any visible signal) becomes
            // a first-class metric the operator can alert on.
            //
            // `AssertUnwindSafe` is required because the inner future
            // captures `Arc<...>` clones + a `do_put` closure that are
            // not `UnwindSafe` by default; the safety contract here is
            // "we don't continue using any of those captures after the
            // panic" which trivially holds (we drop them and return).
            use futures::FutureExt as _;
            let dispatcher_kind = "replication";
            let fut = crate::replication::replicate_object(
                rule,
                source_bucket_cl,
                source_key_cl,
                body_cl,
                metadata_cl,
                do_put,
                mgr_cl,
                generation,
                destination_key_override,
                source_lock_state,
            );
            if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
                // Panic payloads are `Box<dyn Any>`: try the two string
                // shapes (`&'static str`, then `String`) and fall back
                // to a fixed placeholder for anything else.
                let panic_msg = panic
                    .downcast_ref::<&'static str>()
                    .copied()
                    .map(str::to_owned)
                    .or_else(|| panic.downcast_ref::<String>().cloned())
                    .unwrap_or_else(|| "(non-string panic payload)".to_owned());
                tracing::error!(
                    kind = dispatcher_kind,
                    panic_payload = %panic_msg,
                    "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
                );
                crate::metrics::record_dispatcher_panic(dispatcher_kind);
            }
        });
    }
1081
1082    /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1083    /// Once set, every DELETE / DELETE-version / delete-marker /
1084    /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1085    /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1086    /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1087    /// where MFA-Delete is `Disabled` (S3 default).
1088    #[must_use]
1089    pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1090        self.mfa_delete = Some(mgr);
1091        self
1092    }
1093
1094    /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1095    /// introspection — used by the snapshot path in `main.rs` to call
1096    /// `to_json` for restart-recoverable state).
1097    #[must_use]
1098    pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1099        self.mfa_delete.as_ref()
1100    }
1101
1102    /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1103    /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1104    /// route through the manager instead of forwarding to the backend,
1105    /// and [`Self::handle_preflight`] becomes useful for the (future)
1106    /// listener-side OPTIONS interceptor.
1107    #[must_use]
1108    pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1109        self.cors = Some(mgr);
1110        self
1111    }
1112
1113    /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1114    #[must_use]
1115    pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1116        self.cors.as_ref()
1117    }
1118
1119    /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1120    /// configured rules and, if a rule matches, return the headers that
1121    /// the (future) listener-side OPTIONS interceptor must put on the
1122    /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1123    /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1124    /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1125    ///
1126    /// Returns `None` when no manager is attached, no config is
1127    /// registered for the bucket, or no rule matches the (origin,
1128    /// method, headers) triple. The caller is responsible for turning
1129    /// `None` into the appropriate 403 response.
1130    ///
1131    /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1132    /// into the hyper-util listener path) is a follow-up — s3s does not
1133    /// surface OPTIONS as a typed S3 handler, so this method is
1134    /// currently call-able only from inside other handlers and tests.
1135    #[must_use]
1136    pub fn handle_preflight(
1137        &self,
1138        bucket: &str,
1139        origin: &str,
1140        method: &str,
1141        request_headers: &[String],
1142    ) -> Option<std::collections::HashMap<String, String>> {
1143        let mgr = self.cors.as_ref()?;
1144        let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1145        let mut h = std::collections::HashMap::new();
1146        // Echo the matched origin back. If the rule used "*" we still
1147        // echo "*" (S3 spec — the spec does not require us to echo the
1148        // *requesting* origin when the wildcard matched).
1149        let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1150            "*".to_string()
1151        } else {
1152            origin.to_string()
1153        };
1154        h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1155        h.insert(
1156            "Access-Control-Allow-Methods".to_string(),
1157            rule.allowed_methods.join(", "),
1158        );
1159        if !rule.allowed_headers.is_empty() {
1160            // For the Allow-Headers response, echo back the rule's
1161            // pattern list verbatim (S3 echoes the configured list,
1162            // including "*" if present). Browsers honour exact-match
1163            // rules.
1164            h.insert(
1165                "Access-Control-Allow-Headers".to_string(),
1166                rule.allowed_headers.join(", "),
1167            );
1168        }
1169        if let Some(secs) = rule.max_age_seconds {
1170            h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1171        }
1172        if !rule.expose_headers.is_empty() {
1173            h.insert(
1174                "Access-Control-Expose-Headers".to_string(),
1175                rule.expose_headers.join(", "),
1176            );
1177        }
1178        Some(h)
1179    }
1180
1181    /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1182    /// SSE indicator (server-side encryption header or SSE-C customer
1183    /// key); requests without one are rejected with 400 InvalidRequest.
1184    /// Boot-time prerequisite checking lives in the binary
1185    /// (`validate_compliance_mode`) so this flag is purely the runtime
1186    /// switch.
1187    #[must_use]
1188    pub fn with_compliance_strict(mut self, on: bool) -> Self {
1189        self.compliance_strict = on;
1190        self
1191    }
1192
1193    /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1194    /// manager. Once set, `delete_object` and overwrite-path
1195    /// `put_object` refuse operations on locked keys with HTTP 403
1196    /// `AccessDenied`; new PUTs to a bucket with a default retention
1197    /// policy auto-create per-object lock state.
1198    #[must_use]
1199    pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1200        self.object_lock = Some(mgr);
1201        self
1202    }
1203
1204    /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1205    /// the lifecycle scanner uses this to skip currently-locked objects
1206    /// before issuing `delete_object`, since an Object Lock always wins
1207    /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1208    /// shape of [`Self::lifecycle_manager`] /
1209    /// [`Self::tag_manager`] — purely additive accessor, no handler
1210    /// behaviour change.
1211    #[must_use]
1212    pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1213        self.object_lock.as_ref()
1214    }
1215
1216    /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1217    /// when a PUT requests SSE-KMS without naming a specific KMS key
1218    /// (operators set this to mirror AWS S3's bucket-default key).
1219    #[must_use]
1220    pub fn with_kms_backend(
1221        mut self,
1222        kms: Arc<dyn crate::kms::KmsBackend>,
1223        default_key_id: Option<String>,
1224    ) -> Self {
1225        self.kms = Some(kms);
1226        self.kms_default_key_id = default_key_id;
1227        self
1228    }
1229
1230    /// v0.5 #34: attach the first-class versioning state machine. Once
1231    /// set, this `S4Service` owns the per-bucket versioning state +
1232    /// per-(bucket, key) version chain; `put_object` / `get_object` /
1233    /// `delete_object` / `list_object_versions` /
1234    /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1235    /// manager instead of passing through to the backend. The backend
1236    /// is still used as the byte store: Suspended / Unversioned buckets
1237    /// keep using `<key>` directly (legacy), Enabled buckets redirect
1238    /// each version's bytes to a shadow key
1239    /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1240    /// PUTs to the same logical key.
1241    #[must_use]
1242    pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1243        self.versioning = Some(mgr);
1244        self
1245    }
1246
1247    /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1248    /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1249    /// in-memory state to the operator's `--versioning-state-file`
1250    /// without restarting the gateway. Mirrors the shape of
1251    /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1252    /// purely additive accessor, no handler behaviour change.
1253    #[must_use]
1254    pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1255        self.versioning.as_ref()
1256    }
1257
1258    /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1259    /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1260    /// CLI flag in `main.rs`. Operators running heavy cross-region
1261    /// fan-out may need to raise this; operators on memory-constrained
1262    /// hosts may need to lower it. The new value replaces the existing
1263    /// `Semaphore` (so calling this after dispatchers are already in
1264    /// flight is fine — the in-flight tasks hold permits from the old
1265    /// semaphore which is dropped when its last permit is released).
1266    /// A `max` of 0 would deadlock all replicas; the value is silently
1267    /// clamped to 1 instead.
1268    #[must_use]
1269    pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1270        let max = max.max(1);
1271        self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1272        self
1273    }
1274
1275    /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1276    /// concurrency permit pool. Tests inspect `available_permits()`
1277    /// after invoking `spawn_replication_if_matched` to verify the
1278    /// dispatcher actually `acquire_owned`s before kicking off the
1279    /// destination PUT.
1280    #[must_use]
1281    pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1282        &self.replication_semaphore
1283    }
1284
1285    /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1286    /// Internally wraps it in a 1-slot keyring with id=1 active, so
1287    /// new objects ride the v0.5 S4E2 frame while previously-written
1288    /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1289    /// fallback path. Operators wanting true rotation should call
1290    /// [`Self::with_sse_keyring`] instead.
1291    #[must_use]
1292    pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1293        let keyring = crate::sse::SseKeyring::new(1, key);
1294        self.sse_keyring = Some(std::sync::Arc::new(keyring));
1295        self
1296    }
1297
1298    /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1299    /// the active key (S4E2 frame stamped with that key's id); GET
1300    /// dispatches on the body's magic — S4E1 falls back to trying every
1301    /// key in the ring (active first) so v0.4 objects survive a
1302    /// migration; S4E2 looks up the explicit key_id from the header.
1303    #[must_use]
1304    pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1305        self.sse_keyring = Some(keyring);
1306        self
1307    }
1308
1309    /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1310    /// (so the matching GET can stream-decrypt chunk-by-chunk
1311    /// instead of buffering the entire body before tag verify).
1312    /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1313    /// disables the path and reverts to the legacy S4E2 buffered
1314    /// frame.
1315    ///
1316    /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1317    /// the chunked envelopes for those flows are a follow-up issue
1318    /// (the customer-key wire surface needs separate version
1319    /// negotiation).
1320    ///
1321    /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1322    /// not also set — the chunked path runs only on the SSE-S4
1323    /// branch of `put_object`.
1324    #[must_use]
1325    pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1326        self.sse_chunk_size = bytes;
1327        self
1328    }
1329
1330    /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1331    /// PUT / GET / DELETE / List handler emits one entry into the
1332    /// emitter's buffer; a background flusher (started separately, see
1333    /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1334    /// rotated `.log` files into the configured directory.
1335    #[must_use]
1336    pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1337        self.access_log = Some(log);
1338        self
1339    }
1340
1341    /// Capture the per-request access-log preamble before the request is
1342    /// consumed by the backend call. Returns `None` if no access logger
1343    /// is configured (cheap early-out so the handler doesn't pay the
1344    /// header-clone cost when access logging is off).
1345    fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1346        self.access_log.as_ref()?;
1347        Some(AccessLogPreamble {
1348            // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1349            // Recording a client-controllable header in the access log
1350            // would poison forensic queries; leave it `None` until the
1351            // operator declares X-Forwarded-For is set by a trusted
1352            // proxy.
1353            remote_ip: if self.trust_x_forwarded_for {
1354                req.headers
1355                    .get("x-forwarded-for")
1356                    .and_then(|v| v.to_str().ok())
1357                    .and_then(|raw| raw.split(',').next())
1358                    .map(|s| s.trim().to_owned())
1359            } else {
1360                None
1361            },
1362            requester: Self::principal_of(req).map(str::to_owned),
1363            request_uri: format!("{} {}", req.method, req.uri.path()),
1364            user_agent: req
1365                .headers
1366                .get("user-agent")
1367                .and_then(|v| v.to_str().ok())
1368                .map(str::to_owned),
1369        })
1370    }
1371
1372    /// Internal — called by handlers at end-of-request with a captured
1373    /// preamble. Best-effort: swallows the await fast (clones Arc +
1374    /// pushes), no error propagation back to the request path.
1375    #[allow(clippy::too_many_arguments)]
1376    async fn record_access(
1377        &self,
1378        preamble: Option<AccessLogPreamble>,
1379        operation: &'static str,
1380        bucket: &str,
1381        key: Option<&str>,
1382        http_status: u16,
1383        bytes_sent: u64,
1384        object_size: u64,
1385        total_time_ms: u64,
1386        error_code: Option<&str>,
1387    ) {
1388        let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1389            return;
1390        };
1391        log.record(crate::access_log::AccessLogEntry {
1392            time: std::time::SystemTime::now(),
1393            bucket: bucket.to_owned(),
1394            remote_ip: p.remote_ip,
1395            requester: p.requester,
1396            operation,
1397            key: key.map(str::to_owned),
1398            request_uri: p.request_uri,
1399            http_status,
1400            error_code: error_code.map(str::to_owned),
1401            bytes_sent,
1402            object_size,
1403            total_time_ms,
1404            user_agent: p.user_agent,
1405        })
1406        .await;
1407    }
1408
1409    /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1410    /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1411    /// throttle-checked before the policy gate; throttled requests return
1412    /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1413    /// `s4_rate_limit_throttled_total{principal,bucket}`.
1414    #[must_use]
1415    pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1416        self.rate_limits = Some(rl);
1417        self
1418    }
1419
1420    /// Helper used by request handlers to apply the rate limit. Returns
1421    /// `Ok(())` when allowed (or no rate limiter is configured), or a
1422    /// `SlowDown` S3Error otherwise.
1423    fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1424        let Some(rl) = self.rate_limits.as_ref() else {
1425            return Ok(());
1426        };
1427        let principal_id = Self::principal_of(req);
1428        if !rl.check(principal_id, bucket) {
1429            crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1430            return Err(S3Error::with_message(
1431                S3ErrorCode::SlowDown,
1432                format!("rate-limited: bucket={bucket}"),
1433            ));
1434        }
1435        Ok(())
1436    }
1437
1438    /// Tell the policy evaluator that the listener is reached over TLS
1439    /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1440    /// resolves to `true`. Defaults to `false`.
1441    #[must_use]
1442    pub fn with_secure_transport(mut self, on: bool) -> Self {
1443        self.secure_transport = on;
1444        self
1445    }
1446
1447    #[must_use]
1448    pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1449        self.max_body_bytes = n;
1450        self
1451    }
1452
1453    /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1454    /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1455    /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1456    /// When `None` (the default), no policy enforcement happens.
1457    #[must_use]
1458    pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1459        self.policy = Some(policy);
1460        self
1461    }
1462
1463    /// Pull the SigV4 access key id off the request's credentials, if any.
1464    /// Used as the `principal_id` for policy evaluation.
1465    fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1466        req.credentials.as_ref().map(|c| c.access_key.as_str())
1467    }
1468
1469    /// v0.3 #13: build the per-request policy context from the incoming
1470    /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1471    /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1472    /// production deployments are behind an LB / reverse proxy that sets
1473    /// this), `aws:CurrentTime` from the system clock, and
1474    /// `aws:SecureTransport` from the per-listener TLS flag.
1475    fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1476        let user_agent = req
1477            .headers
1478            .get("user-agent")
1479            .and_then(|v| v.to_str().ok())
1480            .map(str::to_owned);
1481        // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1482        // header. Trusting it unconditionally lets any public-internet
1483        // request claim it came from a trusted CIDR (e.g.
1484        // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1485        // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1486        // We now only consume the header when the operator has
1487        // declared "this gateway sits behind a trusted reverse proxy
1488        // that scrubs client-supplied values" via
1489        // `with_trust_x_forwarded_for(true)` /
1490        // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1491        // `None`, which fails closed for IP-allowlist Allow rules
1492        // and fails open for IP-blocklist Deny rules — operators
1493        // who need either case behind a public listener must opt in
1494        // or move the gate to the reverse proxy. The leftmost
1495        // comma-separated token is the originator per the
1496        // `X-Forwarded-For: client, proxy1, proxy2` convention.
1497        let source_ip = if self.trust_x_forwarded_for {
1498            req.headers
1499                .get("x-forwarded-for")
1500                .and_then(|v| v.to_str().ok())
1501                .and_then(|raw| raw.split(',').next())
1502                .and_then(|s| s.trim().parse().ok())
1503        } else {
1504            None
1505        };
1506        crate::policy::RequestContext {
1507            source_ip,
1508            user_agent,
1509            request_time: Some(std::time::SystemTime::now()),
1510            secure_transport: self.secure_transport,
1511            existing_object_tags: None,
1512            request_object_tags: None,
1513            extra: Default::default(),
1514        }
1515    }
1516
1517    /// Helper used by request handlers to enforce the optional policy.
1518    /// Returns `Ok(())` when allowed (or no policy is configured), or an
1519    /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1520    /// counter on deny.
1521    fn enforce_policy<I>(
1522        &self,
1523        req: &S3Request<I>,
1524        action: &'static str,
1525        bucket: &str,
1526        key: Option<&str>,
1527    ) -> S3Result<()> {
1528        self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1529    }
1530
1531    /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1532    /// caller plumb tag context (existing-on-object + on-request) into
1533    /// the policy evaluator. Both arguments default to `None`, in
1534    /// which case the resulting `RequestContext` is identical to
1535    /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1536    /// with tags this is a transparent no-op.
1537    fn enforce_policy_with_extra<I>(
1538        &self,
1539        req: &S3Request<I>,
1540        action: &'static str,
1541        bucket: &str,
1542        key: Option<&str>,
1543        request_tags: Option<&crate::tagging::TagSet>,
1544        existing_tags: Option<&crate::tagging::TagSet>,
1545    ) -> S3Result<()> {
1546        let Some(policy) = self.policy.as_ref() else {
1547            return Ok(());
1548        };
1549        let principal_id = Self::principal_of(req);
1550        let mut ctx = self.request_context(req);
1551        if let Some(t) = request_tags {
1552            ctx.request_object_tags = Some(t.clone());
1553        }
1554        if let Some(t) = existing_tags {
1555            ctx.existing_object_tags = Some(t.clone());
1556        }
1557        let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1558        if decision.allow {
1559            Ok(())
1560        } else {
1561            crate::metrics::record_policy_denial(action, bucket);
1562            tracing::info!(
1563                action,
1564                bucket,
1565                key = ?key,
1566                principal = ?principal_id,
1567                source_ip = ?ctx.source_ip,
1568                user_agent = ?ctx.user_agent,
1569                secure_transport = ctx.secure_transport,
1570                matched_sid = ?decision.matched_sid,
1571                effect = ?decision.matched_effect,
1572                "S4 policy denied request"
1573            );
1574            Err(S3Error::with_message(
1575                S3ErrorCode::AccessDenied,
1576                format!("denied by S4 policy: {action} on bucket={bucket}"),
1577            ))
1578        }
1579    }
1580
1581    /// テスト用: backend を取り戻す (test helper、production では使わない).
1582    /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1583    /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1584    /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1585    /// (test 用途専用 helper の caller 契約を維持)。
1586    pub fn into_backend(self) -> B {
1587        Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1588            panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1589        })
1590    }
1591
1592    /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1593    /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1594    async fn partial_range_get(
1595        &self,
1596        req: &S3Request<GetObjectInput>,
1597        plan: s4_codec::index::RangePlan,
1598        client_start: u64,
1599        client_end_exclusive: u64,
1600        total_original: u64,
1601        get_start: Instant,
1602    ) -> S3Result<S3Response<GetObjectOutput>> {
1603        // 必要 byte 範囲だけを backend に partial GET
1604        let backend_range = s3s::dto::Range::Int {
1605            first: plan.byte_start,
1606            last: Some(plan.byte_end_exclusive - 1),
1607        };
1608        let backend_input = GetObjectInput {
1609            bucket: req.input.bucket.clone(),
1610            key: req.input.key.clone(),
1611            range: Some(backend_range),
1612            ..Default::default()
1613        };
1614        let backend_req = S3Request {
1615            input: backend_input,
1616            method: req.method.clone(),
1617            uri: req.uri.clone(),
1618            headers: req.headers.clone(),
1619            extensions: http::Extensions::new(),
1620            credentials: req.credentials.clone(),
1621            region: req.region.clone(),
1622            service: req.service.clone(),
1623            trailing_headers: None,
1624        };
1625        let mut backend_resp = self.backend.get_object(backend_req).await?;
1626        let blob = backend_resp.output.body.take().ok_or_else(|| {
1627            S3Error::with_message(
1628                S3ErrorCode::InternalError,
1629                "backend partial GET returned empty body",
1630            )
1631        })?;
1632        let bytes = collect_blob(blob, self.max_body_bytes)
1633            .await
1634            .map_err(internal("collect partial body"))?;
1635
1636        // frame parse + decompress
1637        let mut combined = BytesMut::new();
1638        for frame in FrameIter::new(bytes) {
1639            let (header, payload) = frame.map_err(|e| {
1640                S3Error::with_message(
1641                    S3ErrorCode::InternalError,
1642                    format!("partial-range frame parse: {e}"),
1643                )
1644            })?;
1645            let chunk_manifest = ChunkManifest {
1646                codec: header.codec,
1647                original_size: header.original_size,
1648                compressed_size: header.compressed_size,
1649                crc32c: header.crc32c,
1650            };
1651            let decompressed = self
1652                .registry
1653                .decompress(payload, &chunk_manifest)
1654                .await
1655                .map_err(internal("partial-range decompress"))?;
1656            combined.extend_from_slice(&decompressed);
1657        }
1658        let combined = combined.freeze();
1659        let sliced = combined
1660            .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1661
1662        // response 組立て
1663        let returned_size = sliced.len() as u64;
1664        backend_resp.output.content_length = Some(returned_size as i64);
1665        backend_resp.output.content_range = Some(format!(
1666            "bytes {client_start}-{}/{total_original}",
1667            client_end_exclusive - 1
1668        ));
1669        backend_resp.output.checksum_crc32 = None;
1670        backend_resp.output.checksum_crc32c = None;
1671        backend_resp.output.checksum_crc64nvme = None;
1672        backend_resp.output.checksum_sha1 = None;
1673        backend_resp.output.checksum_sha256 = None;
1674        backend_resp.output.e_tag = None;
1675        backend_resp.output.body = Some(bytes_to_blob(sliced));
1676        backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1677
1678        let elapsed = get_start.elapsed();
1679        crate::metrics::record_get(
1680            "partial",
1681            plan.byte_end_exclusive - plan.byte_start,
1682            returned_size,
1683            elapsed.as_secs_f64(),
1684            true,
1685        );
1686        info!(
1687            op = "get_object",
1688            bucket = %req.input.bucket,
1689            key = %req.input.key,
1690            bytes_in = plan.byte_end_exclusive - plan.byte_start,
1691            bytes_out = returned_size,
1692            total_object_size = total_original,
1693            range = true,
1694            path = "sidecar-partial",
1695            latency_ms = elapsed.as_millis() as u64,
1696            "S4 partial Range GET via sidecar index"
1697        );
1698        Ok(backend_resp)
1699    }
1700
1701    /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
1702    /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
1703    /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
1704    async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
1705        let bytes = encode_index(index);
1706        let len = bytes.len() as i64;
1707        let sidecar = sidecar_key(key);
1708        // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
1709        // the (already legally-arbitrary) S3 key produces something we
1710        // cannot encode at all, drop the sidecar PUT (the GET path
1711        // falls back to a full read on a missing sidecar) instead of
1712        // panicking on `parse().unwrap()`.
1713        let uri = match safe_object_uri(bucket, &sidecar) {
1714            Ok(u) => u,
1715            Err(e) => {
1716                tracing::warn!(
1717                    bucket,
1718                    key,
1719                    "S4 write_sidecar skipped (key not URI-encodable): {e}"
1720                );
1721                return;
1722            }
1723        };
1724        let put_input = PutObjectInput {
1725            bucket: bucket.into(),
1726            key: sidecar,
1727            body: Some(bytes_to_blob(bytes)),
1728            content_length: Some(len),
1729            content_type: Some("application/x-s4-index".into()),
1730            ..Default::default()
1731        };
1732        let put_req = S3Request {
1733            input: put_input,
1734            method: http::Method::PUT,
1735            uri,
1736            headers: http::HeaderMap::new(),
1737            extensions: http::Extensions::new(),
1738            credentials: None,
1739            region: None,
1740            service: None,
1741            trailing_headers: None,
1742        };
1743        if let Err(e) = self.backend.put_object(put_req).await {
1744            tracing::warn!(
1745                bucket,
1746                key,
1747                "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
1748            );
1749        }
1750    }
1751
1752    /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
1753    /// describes the current backend object before we trust its frame
1754    /// offsets for a partial Range GET. The sidecar carries the source
1755    /// `etag` and `compressed_size` that were observed at PUT time; we
1756    /// HEAD the backend object and compare.
1757    ///
1758    /// Decision matrix:
1759    /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
1760    ///   that wasn't stamped) → return `true` (best-effort, preserves
1761    ///   pre-v0.8.4 behaviour for existing on-disk sidecars).
1762    /// - HEAD fails → return `false` (we can't tell either way; full GET
1763    ///   path will surface the real backend error to the client).
1764    /// - HEAD ETag matches → `true`.
1765    /// - HEAD ETag differs OR HEAD size differs from
1766    ///   `source_compressed_size` → `false` (sidecar stale or attacker-
1767    ///   written; fall back to full GET).
1768    async fn sidecar_version_binding_ok(
1769        &self,
1770        bucket: &str,
1771        key: &str,
1772        index: &FrameIndex,
1773    ) -> bool {
1774        let Some(ref expected_etag) = index.source_etag else {
1775            // Legacy sidecar without the v0.8.4 #73 H-2 binding —
1776            // back-compat: trust it (the partial fetch is the same
1777            // best-effort path that v0.8.3 and earlier shipped).
1778            return true;
1779        };
1780        let head_input = HeadObjectInput {
1781            bucket: bucket.into(),
1782            key: key.into(),
1783            ..Default::default()
1784        };
1785        let uri = match safe_object_uri(bucket, key) {
1786            Ok(u) => u,
1787            Err(_) => return false,
1788        };
1789        let head_req = S3Request {
1790            input: head_input,
1791            method: http::Method::HEAD,
1792            uri,
1793            headers: http::HeaderMap::new(),
1794            extensions: http::Extensions::new(),
1795            credentials: None,
1796            region: None,
1797            service: None,
1798            trailing_headers: None,
1799        };
1800        let head = match self.backend.head_object(head_req).await {
1801            Ok(r) => r.output,
1802            Err(e) => {
1803                tracing::debug!(
1804                    bucket,
1805                    key,
1806                    "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
1807                );
1808                return false;
1809            }
1810        };
1811        // ETag is a strong-vs-weak enum; we compare on the unwrapped string
1812        // form (matches what the PUT path stamped — see below).
1813        let live_etag = head.e_tag.as_ref().map(|t| t.value());
1814        if live_etag != Some(expected_etag.as_str()) {
1815            tracing::debug!(
1816                bucket,
1817                key,
1818                "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
1819                expected_etag,
1820                live_etag,
1821            );
1822            return false;
1823        }
1824        if let Some(expected_size) = index.source_compressed_size
1825            && let Some(live_size) = head.content_length
1826            && live_size as u64 != expected_size
1827        {
1828            tracing::debug!(
1829                bucket,
1830                key,
1831                "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
1832                expected_size,
1833                live_size,
1834            );
1835            return false;
1836        }
1837        true
1838    }
1839
1840    /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
1841    async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
1842        let sidecar = sidecar_key(key);
1843        // v0.7 #49: same encode-or-bail treatment as write_sidecar.
1844        let uri = safe_object_uri(bucket, &sidecar).ok()?;
1845        let get_input = GetObjectInput {
1846            bucket: bucket.into(),
1847            key: sidecar,
1848            ..Default::default()
1849        };
1850        let get_req = S3Request {
1851            input: get_input,
1852            method: http::Method::GET,
1853            uri,
1854            headers: http::HeaderMap::new(),
1855            extensions: http::Extensions::new(),
1856            credentials: None,
1857            region: None,
1858            service: None,
1859            trailing_headers: None,
1860        };
1861        let resp = self.backend.get_object(get_req).await.ok()?;
1862        let blob = resp.output.body?;
1863        let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
1864        decode_index(bytes).ok()
1865    }
1866
1867    /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
1868    ///
1869    /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
1870    /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
1871    /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
1872    async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
1873        let mut out = BytesMut::new();
1874        // v0.8.15 H-h: cap the *aggregate* decoded output. Each
1875        // individual frame is already bounded by
1876        // `validate_decompress_manifest` (default 5 GiB per frame),
1877        // but a forged multi-frame body can declare many frames
1878        // each near the limit — without an object-level ceiling, a
1879        // single GET could pin tens of GiB of plaintext in
1880        // `BytesMut::extend_from_slice`. Use the gateway's
1881        // `max_body_bytes` (same cap that bounds PUT bodies) so a
1882        // GET can never produce more plaintext than a PUT can ever
1883        // legitimately have stored.
1884        let aggregate_cap = self.max_body_bytes;
1885        let mut produced: usize = 0;
1886        for frame in FrameIter::new(bytes) {
1887            let (header, payload) = frame.map_err(|e| {
1888                S3Error::with_message(
1889                    S3ErrorCode::InternalError,
1890                    format!("multipart frame parse: {e}"),
1891                )
1892            })?;
1893            let chunk_manifest = ChunkManifest {
1894                codec: header.codec,
1895                original_size: header.original_size,
1896                compressed_size: header.compressed_size,
1897                crc32c: header.crc32c,
1898            };
1899            // v0.8.15 H-h: pre-flight check on the declared
1900            // `original_size` so a forged manifest claiming a frame
1901            // that would push us past the cap is rejected before we
1902            // start decoding. Defence-in-depth alongside the
1903            // post-decode `produced` check below.
1904            if (produced as u64).saturating_add(header.original_size) > aggregate_cap as u64 {
1905                return Err(S3Error::with_message(
1906                    S3ErrorCode::InternalError,
1907                    format!(
1908                        "multipart aggregate output exceeds cap: would reach \
1909                         {produced_total} bytes after this frame, cap is {aggregate_cap}",
1910                        produced_total = (produced as u64).saturating_add(header.original_size),
1911                    ),
1912                ));
1913            }
1914            let decompressed = self
1915                .registry
1916                .decompress(payload, &chunk_manifest)
1917                .await
1918                .map_err(internal("multipart frame decompress"))?;
1919            produced = produced.saturating_add(decompressed.len());
1920            if produced > aggregate_cap {
1921                return Err(S3Error::with_message(
1922                    S3ErrorCode::InternalError,
1923                    format!(
1924                        "multipart aggregate output exceeded cap: {produced} bytes \
1925                         emitted, cap is {aggregate_cap}"
1926                    ),
1927                ));
1928            }
1929            out.extend_from_slice(&decompressed);
1930        }
1931        Ok(out.freeze())
1932    }
1933}
1934
1935/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
1936/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
1937/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
1938/// reject the other variants for parity with AWS.
1939fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
1940    let rest = s
1941        .strip_prefix("bytes=")
1942        .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
1943    let (a, b) = rest
1944        .split_once('-')
1945        .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
1946    let first: u64 = a
1947        .parse()
1948        .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
1949    let last: u64 = b
1950        .parse()
1951        .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
1952    if last < first {
1953        return Err(format!("CopySourceRange last < first: {s:?}"));
1954    }
1955    Ok(s3s::dto::Range::Int {
1956        first,
1957        last: Some(last),
1958    })
1959}
1960
/// v0.5 #34: build the backend storage key for one
/// (logical key, version-id) pair on an Enabled-versioning bucket.
///
/// The result is `<key>.__s4ver__/<version-id>`. The `__s4ver__/` infix
/// was chosen because:
/// - it is not a substring of natural `.s4index` / `.s4ver` keys, so
///   listing filters get no false-positive collisions;
/// - the directory-style separator keeps the S3 console's
///   "browse by prefix" UX intact (versions roll up under one virtual
///   folder per object);
/// - it stays human-readable in debug logs and `aws s3 ls`.
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST filter
/// keys containing `.__s4ver__/` out of their results so customers never
/// see these internal shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    [key, ".__s4ver__/", version_id].concat()
}
1977
/// Returns `true` when `key` contains the `.__s4ver__/` marker that
/// [`versioned_shadow_key`] embeds in backend keys. A plain substring scan;
/// per the original note, both the list_objects filter and the GET
/// passthrough check rely on it.
fn is_versioning_shadow_key(key: &str) -> bool {
    const SHADOW_MARKER: &str = ".__s4ver__/";
    key.contains(SHADOW_MARKER)
}
1983
/// v0.6 #42: wall-clock seconds elapsed since the UNIX epoch.
///
/// Per the original note this feeds `mfa::check_mfa`, so the TOTP verifier
/// shares the client authenticator's notion of "now". If the system clock
/// reports a time before 1970 the function yields `0` instead of panicking,
/// which makes the verifier reject rather than crash.
fn current_unix_secs() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        // Clock is set before the epoch: degrade to 0.
        Err(_) => 0,
    }
}
1995
1996/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
1997///
1998/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
1999///   (S3 spec for MFA Delete: every gating failure surfaces as
2000///   `AccessDenied`, not a separate `MFA*` code).
2001/// - `Malformed` → `400 InvalidRequest` (the request itself is
2002///   syntactically broken, not a permission issue).
2003fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
2004    match e {
2005        crate::mfa::MfaError::Missing => S3Error::with_message(
2006            S3ErrorCode::AccessDenied,
2007            "MFA token required for this operation",
2008        ),
2009        crate::mfa::MfaError::Malformed => {
2010            S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
2011        }
2012        crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
2013            S3ErrorCode::AccessDenied,
2014            "MFA serial does not match configured device",
2015        ),
2016        crate::mfa::MfaError::InvalidCode => {
2017            S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
2018        }
2019    }
2020}
2021
2022fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
2023    metadata
2024        .as_ref()
2025        .and_then(|m| m.get(META_MULTIPART))
2026        .map(|v| v == "true")
2027        .unwrap_or(false)
2028}
2029
/// Metadata key under which `write_manifest` stores the codec name
/// (`manifest.codec.as_str()`); `extract_manifest` parses it back into a
/// `CodecKind`.
const META_CODEC: &str = "s4-codec";
/// Metadata key holding the manifest's `original_size`, stored as a decimal
/// string.
const META_ORIGINAL_SIZE: &str = "s4-original-size";
/// Metadata key holding the manifest's `compressed_size`, stored as a decimal
/// string.
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
/// Metadata key holding the manifest's `crc32c` value, stored as a decimal
/// string.
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object that was written through multipart upload using the
/// per-part frame format. On GET this flag is checked to start the frame
/// parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks a single-PUT object as written in the S4F2 framed format.
/// Legacy v0.1 single-PUT objects are raw compressed bytes (without this
/// flag). On GET this flag is checked to route the object through the framed
/// path (= the same FrameIter parse as multipart).
const META_FRAMED: &str = "s4-framed";
2041
2042fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2043    metadata
2044        .as_ref()
2045        .and_then(|m| m.get(META_FRAMED))
2046        .map(|v| v == "true")
2047        .unwrap_or(false)
2048}
2049
2050/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2051fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2052    metadata
2053        .as_ref()
2054        .and_then(|m| m.get("s4-encrypted"))
2055        .map(|v| v == "aes-256-gcm")
2056        .unwrap_or(false)
2057}
2058
2059/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2060/// contract is "all three or none" — partial sets are a 400.
2061///
2062/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2063/// no encryption), `Ok(Some(material))` on validated client key, and
2064/// `Err` for malformed or partial inputs.
2065fn extract_sse_c_material(
2066    algorithm: &Option<String>,
2067    key: &Option<String>,
2068    md5: &Option<String>,
2069) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2070    match (algorithm, key, md5) {
2071        (None, None, None) => Ok(None),
2072        (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2073            .map(Some)
2074            .map_err(sse_c_error_to_s3),
2075        _ => Err(S3Error::with_message(
2076            S3ErrorCode::InvalidRequest,
2077            "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2078        )),
2079    }
2080}
2081
2082/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2083/// Returns the key-id to wrap under, falling back to the gateway default.
2084fn extract_kms_key_id(
2085    sse: &Option<ServerSideEncryption>,
2086    sse_kms_key_id: &Option<String>,
2087    gateway_default: Option<&str>,
2088) -> Option<String> {
2089    let asks_for_kms = sse
2090        .as_ref()
2091        .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2092        .unwrap_or(false);
2093    if !asks_for_kms {
2094        return None;
2095    }
2096    sse_kms_key_id
2097        .clone()
2098        .or_else(|| gateway_default.map(str::to_owned))
2099}
2100
2101/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2102/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2103/// transient KMS outage (503). Other variants are 500 InternalError.
2104fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2105    use crate::kms::KmsError as K;
2106    match e {
2107        K::KeyNotFound { key_id } => S3Error::with_message(
2108            S3ErrorCode::InvalidArgument,
2109            format!("KMS key not found: {key_id}"),
2110        ),
2111        K::BackendUnavailable { message } => S3Error::with_message(
2112            S3ErrorCode::ServiceUnavailable,
2113            format!("KMS backend unavailable: {message}"),
2114        ),
2115        other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2116    }
2117}
2118
2119/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2120/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2121/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2122fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2123    use crate::sse::SseError as E;
2124    match e {
2125        E::WrongCustomerKey => S3Error::with_message(
2126            S3ErrorCode::AccessDenied,
2127            "SSE-C key does not match the key used at PUT time",
2128        ),
2129        E::InvalidCustomerKey { reason } => {
2130            S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2131        }
2132        E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2133            S3ErrorCode::InvalidArgument,
2134            format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2135        ),
2136        E::CustomerKeyRequired => S3Error::with_message(
2137            S3ErrorCode::InvalidRequest,
2138            "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2139        ),
2140        E::CustomerKeyUnexpected => S3Error::with_message(
2141            S3ErrorCode::InvalidRequest,
2142            "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2143        ),
2144        other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2145    }
2146}
2147
2148fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2149    let m = metadata.as_ref()?;
2150    let codec = m
2151        .get(META_CODEC)
2152        .and_then(|s| s.parse::<CodecKind>().ok())?;
2153    let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2154    let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2155    let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2156    Some(ChunkManifest {
2157        codec,
2158        original_size,
2159        compressed_size,
2160        crc32c,
2161    })
2162}
2163
2164fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2165    let meta = metadata.get_or_insert_with(Default::default);
2166    meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2167    meta.insert(
2168        META_ORIGINAL_SIZE.into(),
2169        manifest.original_size.to_string(),
2170    );
2171    meta.insert(
2172        META_COMPRESSED_SIZE.into(),
2173        manifest.compressed_size.to_string(),
2174    );
2175    meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2176}
2177
2178fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2179    move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2180}
2181
2182/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2183/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2184/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2185/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2186/// message that includes the original error context.
2187fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2188    use crate::select::SelectError;
2189    match e {
2190        SelectError::Parse(msg) => S3Error::with_message(
2191            S3ErrorCode::InvalidRequest,
2192            format!("SQL parse error: {msg}"),
2193        ),
2194        SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2195            S3ErrorCode::InvalidRequest,
2196            format!("unsupported SQL feature: {msg}"),
2197        ),
2198        SelectError::RowEval(msg) => S3Error::with_message(
2199            S3ErrorCode::InvalidRequest,
2200            format!("SQL row evaluation error: {msg}"),
2201        ),
2202        SelectError::InputFormat(msg) => S3Error::with_message(
2203            S3ErrorCode::InvalidRequest,
2204            format!("{fmt} input format error: {msg}"),
2205        ),
2206    }
2207}
2208
2209/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2210/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2211/// (including missing) is treated as `false`.
2212fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2213    headers
2214        .get("x-amz-bypass-governance-retention")
2215        .and_then(|v| v.to_str().ok())
2216        .map(|s| s.eq_ignore_ascii_case("true"))
2217        .unwrap_or(false)
2218}
2219
2220/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2221/// as an RFC3339 string and re-parsing through `chrono`. The string format
2222/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2223/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2224/// or the value is outside `chrono`'s supported range.
2225fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2226    let mut buf = Vec::new();
2227    ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2228        .ok()?;
2229    let s = std::str::from_utf8(&buf).ok()?;
2230    chrono::DateTime::parse_from_rfc3339(s)
2231        .ok()
2232        .map(|dt| dt.with_timezone(&chrono::Utc))
2233}
2234
2235/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2236/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2237fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2238    // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2239    // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2240    // unexpected happens — we never produce malformed strings, so this
2241    // branch is unreachable in practice.
2242    let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2243    Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2244}
2245
2246/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2247/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2248/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2249/// the field optional but always populates it on response.
2250fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2251    set.iter()
2252        .map(|(k, v)| Tag {
2253            key: Some(k.clone()),
2254            value: Some(v.clone()),
2255        })
2256        .collect()
2257}
2258
2259/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2260/// keys / values become empty strings (mirrors AWS, which rejects
2261/// `<Key/>` with InvalidTag at the parser layer; downstream
2262/// `TagSet::validate` then enforces our size limits).
2263fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2264    let pairs = tags
2265        .iter()
2266        .map(|t| {
2267            (
2268                t.key.clone().unwrap_or_default(),
2269                t.value.clone().unwrap_or_default(),
2270            )
2271        })
2272        .collect();
2273    crate::tagging::TagSet::from_pairs(pairs)
2274}
2275
2276/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2277/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2278/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2279pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2280    if total == 0 {
2281        return Err("cannot range-get zero-length object".into());
2282    }
2283    match range {
2284        s3s::dto::Range::Int { first, last } => {
2285            let start = *first;
2286            let end_inclusive = match last {
2287                Some(l) => (*l).min(total - 1),
2288                None => total - 1,
2289            };
2290            if start > end_inclusive || start >= total {
2291                return Err(format!(
2292                    "range bytes={start}-{:?} out of object size {total}",
2293                    last
2294                ));
2295            }
2296            Ok((start, end_inclusive + 1))
2297        }
2298        s3s::dto::Range::Suffix { length } => {
2299            let len = (*length).min(total);
2300            Ok((total - len, total))
2301        }
2302    }
2303}
2304
2305#[async_trait::async_trait]
2306impl<B: S3> S3 for S4Service<B> {
2307    // === 圧縮を挟む path (PUT) ===
2308    #[tracing::instrument(
2309        name = "s4.put_object",
2310        skip(self, req),
2311        fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2312    )]
2313    async fn put_object(
2314        &self,
2315        mut req: S3Request<PutObjectInput>,
2316    ) -> S3Result<S3Response<PutObjectOutput>> {
2317        let put_start = Instant::now();
2318        let put_bucket = req.input.bucket.clone();
2319        let put_key = req.input.key.clone();
2320        // v0.8.15 M-1: reject user PUTs targeting reserved sidecar
2321        // names (`<key>.s4index`). Without this gate, a user
2322        // uploading `report.s4index` would have their object silently
2323        // hidden from `ListObjectsV2` (the list filter strips the
2324        // `.s4index` suffix) and risk being deleted by the sidecar-
2325        // cleanup path on a sibling DeleteObject. Fail fast with the
2326        // AWS-canonical `InvalidObjectName` code.
2327        if s4_codec::index::is_reserved_sidecar_key(&put_key) {
2328            let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
2329                .unwrap_or(S3ErrorCode::InvalidArgument);
2330            return Err(S3Error::with_message(
2331                code,
2332                format!(
2333                    "object key {put_key:?} is reserved (suffix `{}` is used for S4 internal \
2334                     sidecars); pick a different key",
2335                    s4_codec::index::SIDECAR_SUFFIX,
2336                ),
2337            ));
2338        }
2339        let access_preamble = self.access_log_preamble(&req);
2340        self.enforce_rate_limit(&req, &put_bucket)?;
2341        // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2342        // the IAM policy gate sees the request's tags via
2343        // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2344        // resolved from the Tagging manager (when wired) so
2345        // `s3:ExistingObjectTag/<key>` works on overwrite.
2346        let request_tags: Option<crate::tagging::TagSet> = req
2347            .input
2348            .tagging
2349            .as_deref()
2350            .map(crate::tagging::parse_tagging_header)
2351            .transpose()
2352            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2353        let existing_tags: Option<crate::tagging::TagSet> = self
2354            .tagging
2355            .as_ref()
2356            .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2357        self.enforce_policy_with_extra(
2358            &req,
2359            "s3:PutObject",
2360            &put_bucket,
2361            Some(&put_key),
2362            request_tags.as_ref(),
2363            existing_tags.as_ref(),
2364        )?;
2365        // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2366        // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2367        // bucket PUTs are exempt because they materialise a fresh
2368        // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2369        // locked version's bytes are untouched. The check mirrors the
2370        // delete path (Compliance never bypassable, Governance via the
2371        // bypass header, legal hold never).
2372        if let Some(mgr) = self.object_lock.as_ref()
2373            && let Some(state) = mgr.get(&put_bucket, &put_key)
2374        {
2375            let bucket_versioned_enabled = self
2376                .versioning
2377                .as_ref()
2378                .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2379                .unwrap_or(false);
2380            if !bucket_versioned_enabled {
2381                let bypass = parse_bypass_governance_header(&req.headers);
2382                let now = chrono::Utc::now();
2383                if !state.can_delete(now, bypass) {
2384                    crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2385                    return Err(S3Error::with_message(
2386                        S3ErrorCode::AccessDenied,
2387                        "Access Denied because object protected by object lock",
2388                    ));
2389                }
2390            }
2391        }
2392        // v0.5 #30: per-PUT explicit retention / legal hold (S3
2393        // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2394        // `x-amz-object-lock-legal-hold`). Captured before the body
2395        // moves into the backend; persisted into the manager only on
2396        // backend success below.
2397        let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2398            .input
2399            .object_lock_mode
2400            .as_ref()
2401            .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2402        let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2403            .input
2404            .object_lock_retain_until_date
2405            .as_ref()
2406            .and_then(timestamp_to_chrono_utc);
2407        let explicit_legal_hold_on: Option<bool> = req
2408            .input
2409            .object_lock_legal_hold_status
2410            .as_ref()
2411            .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2412        if let Some(blob) = req.input.body.take() {
2413            // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2414            // compress fast path、そうでなければ従来の collect-then-compress。
2415            let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2416                .await
2417                .map_err(internal("peek put sample"))?;
2418            let sample_len = sample.len().min(SAMPLE_BYTES);
2419            // v0.8 #56: pass the request's Content-Length (when present) so
2420            // the sampling dispatcher can promote large objects to a GPU
2421            // codec. Chunked transfers (no Content-Length) keep CPU.
2422            let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2423            let kind = self
2424                .dispatcher
2425                .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2426                .await;
2427
2428            // Passthrough buys nothing from S4F2 wrapping (no compression =
2429            // no per-chunk frame to skip past) and the +28-byte header
2430            // overhead breaks size-sensitive callers that expect a true
2431            // pass-through. So passthrough always uses the legacy raw-blob
2432            // path; only compressing codecs go through the framed path.
2433            //
2434            // v0.8.14 follow-up to #127 MED-B: the previous attempt
2435            // forced the buffered path whenever the client supplied
2436            // any whole-body checksum so `verify_client_body_checksums`
2437            // could run. Modern AWS SDKs auto-add an
2438            // `x-amz-checksum-crc32` trailer by default, which made
2439            // every SDK PUT lose the streaming-framed path and
2440            // therefore lose its sidecar — silent data path
2441            // regression caught by
2442            // `range_get_falls_back_to_full_when_sidecar_etag_stale`
2443            // and `upload_part_copy_propagates_source_version_id`
2444            // on the MinIO E2E job. The streaming PUT path now
2445            // passes through unchanged; client-supplied checksums on
2446            // streaming PUTs are NOT verified (same fail-open as
2447            // pre-v0.8.12). The buffered PUT branch and UploadPart
2448            // do verify, which covers the buffered upload case the
2449            // HIGH-12 audit was scoped to. True streaming verify
2450            // (tee-into-hasher on the chained input) remains the
2451            // tracked follow-up.
2452            let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2453            let (compressed, manifest, is_framed) = if use_framed {
2454                // streaming fast path: input は memory に collect しない
2455                let chained = chain_sample_with_rest(sample, rest_stream);
2456                debug!(
2457                    bucket = ?req.input.bucket,
2458                    key = ?req.input.key,
2459                    codec = kind.as_str(),
2460                    path = "streaming-framed",
2461                    "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2462                );
2463                // v0.4 #16: pick the chunk size based on the request's
2464                // Content-Length when known, falling back to the 4 MiB
2465                // default for chunked transfers.
2466                let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2467                // v0.8.4 #73 M2: pass the request's Content-Length so
2468                // streaming_compress_to_frames can fail-fast on a mid-PUT
2469                // truncation (client disconnect after sending half the
2470                // body). `None` is the chunked-Transfer-Encoding case
2471                // where the upstream genuinely doesn't know the size and
2472                // the backend's framing layer is the only truncation
2473                // signal we have.
2474                let expected_input_size =
2475                    req.input.content_length.and_then(|n| u64::try_from(n).ok());
2476                let (body, manifest) = streaming_compress_to_frames(
2477                    chained,
2478                    Arc::clone(&self.registry),
2479                    kind,
2480                    chunk_size,
2481                    expected_input_size,
2482                )
2483                .await
2484                .map_err(|e| match e {
2485                    s4_codec::CodecError::TruncatedStream { expected, got } => {
2486                        // 400 IncompleteBody: client advertised N bytes
2487                        // but disconnected after `got`. Mirrors AWS S3's
2488                        // canonical error code for the same shape so SDK
2489                        // retries kick in instead of treating the PUT as
2490                        // a successful upload of a half-body.
2491                        S3Error::with_message(
2492                            S3ErrorCode::IncompleteBody,
2493                            format!("PUT body truncated: expected {expected} bytes, got {got}"),
2494                        )
2495                    }
2496                    // v0.8.15 M-4: 400
2497                    // `RequestBodyLengthMismatch` for over-length
2498                    // bodies. AWS S3 returns this when the declared
2499                    // `Content-Length` is smaller than the wire body;
2500                    // S4 used to silently accept the surplus bytes.
2501                    // `IncompleteBody` is the closest typed variant
2502                    // in the s3s enum — we widen the message so the
2503                    // SDK / curl side sees the shape unambiguously.
2504                    s4_codec::CodecError::OverlengthStream { expected, got } => {
2505                        let code = S3ErrorCode::from_bytes(b"RequestBodyLengthMismatch")
2506                            .unwrap_or(S3ErrorCode::IncompleteBody);
2507                        S3Error::with_message(
2508                            code,
2509                            format!(
2510                                "PUT body length mismatch: Content-Length declared {expected} \
2511                                 bytes, body carried at least {got}"
2512                            ),
2513                        )
2514                    }
2515                    other => internal("streaming framed compress")(other),
2516                })?;
2517                (body, manifest, true)
2518            } else {
2519                // GPU codec 等で streaming-aware でないものは bytes-buffered path
2520                // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
2521                let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
2522                    .await
2523                    .map_err(internal("collect put body (buffered path)"))?;
                // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
                // checksum algorithms against the received body on
                // the buffered path. The streaming-framed branch
                // above no longer redirects here when a checksum
                // header is present (see the v0.8.14 note on
                // `use_framed`), so client-supplied checksums are
                // verified only for PUTs that take this buffered path.
2530                verify_client_body_checksums(
2531                    &bytes,
2532                    req.input.content_md5.as_deref(),
2533                    req.input.checksum_crc32.as_deref(),
2534                    req.input.checksum_crc32c.as_deref(),
2535                    req.input.checksum_sha1.as_deref(),
2536                    req.input.checksum_sha256.as_deref(),
2537                    req.input.checksum_crc64nvme.as_deref(),
2538                )?;
2539                debug!(
2540                    bucket = ?req.input.bucket,
2541                    key = ?req.input.key,
2542                    bytes = bytes.len(),
2543                    codec = kind.as_str(),
2544                    path = "buffered",
2545                    "S4 put_object: compressing (buffered, raw blob)"
2546                );
2547                // v0.8 #55: telemetry-returning compress so we can stamp
2548                // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
2549                // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
2550                // CPU codecs come back with `gpu_seconds = None` and the
2551                // stamp helper short-circuits — no extra cost on CPU path.
2552                let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
2553                stamp_gpu_compress_telemetry(&tel);
2554                let (body, m) = compress_res.map_err(internal("registry compress"))?;
2555                (body, m, false)
2556            };
2557
2558            write_manifest(&mut req.input.metadata, &manifest);
2559            if is_framed {
2560                // v0.2 #4: framed body であることを GET 側に伝える meta flag。
2561                req.input
2562                    .metadata
2563                    .get_or_insert_with(Default::default)
2564                    .insert(META_FRAMED.into(), "true".into());
2565            }
2566            // 重要: content_length を圧縮後サイズで更新する。
2567            // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
2568            // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
2569            req.input.content_length = Some(compressed.len() as i64);
2570            // body を書き換えたので、客側が送ってきた original body 用の
2571            // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
2572            // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
2573            // ChunkManifest.crc32c で担保している。
2574            req.input.checksum_algorithm = None;
2575            req.input.checksum_crc32 = None;
2576            req.input.checksum_crc32c = None;
2577            req.input.checksum_crc64nvme = None;
2578            req.input.checksum_sha1 = None;
2579            req.input.checksum_sha256 = None;
2580            req.input.content_md5 = None;
2581            let original_size = manifest.original_size;
2582            let compressed_size = manifest.compressed_size;
2583            let codec_label = manifest.codec.as_str();
2584            // (sidecar_index is built below, after the SSE-mode
2585            // extraction, so v0.8.12 HIGH-10 can short-circuit the
2586            // build when the on-disk bytes are about to be encrypted.)
2587            // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
2588            // Precedence:
2589            //   - SSE-C headers present → per-request customer key (S4E3)
2590            //   - server-managed keyring configured → active key (S4E2)
2591            //   - neither → no encryption (raw compressed body)
2592            // The `s4-encrypted: aes-256-gcm` metadata flag is set in
2593            // both encrypted modes; the on-disk frame magic distinguishes
2594            // S4E1 / S4E2 / S4E3 so GET picks the right decrypt path.
2595            // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
2596            // so the encryption headers are NOT forwarded to the
2597            // backend. S4 owns the encrypt-then-store contract; if we
2598            // leave the headers in place, real S3-compat backends
2599            // (MinIO / AWS) try to apply their own SSE on top and
2600            // either reject (MinIO requires HTTPS for SSE-C) or fail
2601            // (MinIO has no KMS configured). MemoryBackend ignored
2602            // these so mock tests passed.
2603            let sse_c_alg = req.input.sse_customer_algorithm.take();
2604            let sse_c_key = req.input.sse_customer_key.take();
2605            let sse_c_md5 = req.input.sse_customer_key_md5.take();
2606            let sse_header = req.input.server_side_encryption.take();
2607            let sse_kms_key = req.input.ssekms_key_id.take();
2608            let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
2609            // v0.5 #28: SSE-KMS request? Resolves to None unless the
2610            // request asks for `aws:kms` AND a key id is available
2611            // (explicit header or gateway default). When set, we'll
2612            // generate a per-object DEK below.
2613            let kms_key_id = extract_kms_key_id(
2614                &sse_header,
2615                &sse_kms_key,
2616                self.kms_default_key_id.as_deref(),
2617            );
2618            // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
2619            // pre-encrypt `compressed` body, but the bytes the
2620            // backend stores when any SSE mode is active are
2621            // *post-encrypt* (different length, different layout).
2622            // A Range GET on an SSE-encrypted object would slice the
2623            // ciphertext at the stale offsets, hand the wrong bytes
2624            // to the frame parser, and 500. Suppress the sidecar
2625            // entirely when SSE is going to be applied below;
2626            // encrypted-object Range GET falls back to the buffered
2627            // path (decrypt full body → frame parse → slice), trading
2628            // partial-fetch performance for correctness. An
2629            // encryption-aware sidecar format is a follow-up issue.
2630            let will_encrypt =
2631                sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
2632            let sidecar_index = if is_framed && !will_encrypt {
2633                s4_codec::index::build_index_from_body(&compressed).ok()
2634            } else {
2635                None
2636            };
2637            // v0.5 #32: in compliance-strict mode, every PUT must
2638            // declare SSE — either client-supplied (SSE-C), KMS, or by
2639            // virtue of a server-side keyring being configured (which
2640            // applies SSE-S4 to every PUT automatically). Requests that
2641            // would otherwise land as plain compressed bytes are
2642            // rejected with 400 InvalidRequest.
2643            if self.compliance_strict
2644                && sse_c_material.is_none()
2645                && kms_key_id.is_none()
2646                && self.sse_keyring.is_none()
2647                && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
2648            {
2649                return Err(S3Error::with_message(
2650                    S3ErrorCode::InvalidRequest,
2651                    "compliance-mode strict: PUT must include x-amz-server-side-encryption \
2652                     (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
2653                ));
2654            }
2655            // SSE-C and SSE-KMS are mutually exclusive on a single PUT
2656            // (AWS S3 returns 400 InvalidArgument), so reject when both are set.
2657            if sse_c_material.is_some() && kms_key_id.is_some() {
2658                return Err(S3Error::with_message(
2659                    S3ErrorCode::InvalidArgument,
2660                    "SSE-C and SSE-KMS cannot be used together on the same PUT",
2661                ));
2662            }
2663            // KMS path needs to call generate_dek().await before the
2664            // body_to_send branch; capture the result here.
2665            //
2666            // v0.8.1 #58: the plaintext DEK lives in three places
2667            // during one PUT:
2668            //
2669            //   1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
2670            //      — wiped when the binding `dek` falls out of scope at
2671            //      the end of this `if`-arm.
2672            //   2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
2673            //      — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
2674            //      the outer `kms_wrap` `Option` is dropped at the end
2675            //      of `put_object`.
2676            //   3. AES-GCM internal key state inside the `aes-gcm`
2677            //      crate during `encrypt_with_source` — out of scope
2678            //      for this fix; tracked separately in v0.8.2.
2679            let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
2680                if let Some(ref key_id) = kms_key_id {
2681                    let kms = self.kms.as_ref().ok_or_else(|| {
2682                    S3Error::with_message(
2683                        S3ErrorCode::InvalidRequest,
2684                        "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
2685                    )
2686                })?;
2687                    // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
2688                    // works unchanged via `Deref<Target=Vec<u8>>`.
2689                    let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
2690                    if dek.len() != 32 {
2691                        return Err(S3Error::with_message(
2692                            S3ErrorCode::InternalError,
2693                            format!(
2694                                "KMS backend returned a DEK of {} bytes (expected 32)",
2695                                dek.len()
2696                            ),
2697                        ));
2698                    }
2699                    let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
2700                        zeroize::Zeroizing::new([0u8; 32]);
2701                    dek_arr.copy_from_slice(&dek);
2702                    // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
2703                    // end of this scope, wiping the heap allocation.
2704                    Some((dek_arr, wrapped))
2705                } else {
2706                    None
2707                };
2708            // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
2709            // alongside `s4-encrypted` so HEAD (which doesn't fetch the
2710            // body) can echo the correct `x-amz-server-side-encryption`
2711            // value. Without this, HEAD on an SSE-KMS object would not
2712            // echo `aws:kms` because the frame magic is only available
2713            // on the body (which HEAD doesn't read).
2714            let body_to_send = if let Some(ref m) = sse_c_material {
2715                let meta = req.input.metadata.get_or_insert_with(Default::default);
2716                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2717                meta.insert("s4-sse-type".into(), "AES256".into());
2718                meta.insert(
2719                    "s4-sse-c-key-md5".into(),
2720                    base64::engine::general_purpose::STANDARD.encode(m.key_md5),
2721                );
2722                crate::sse::encrypt_with_source(
2723                    &compressed,
2724                    crate::sse::SseSource::CustomerKey {
2725                        key: &m.key,
2726                        key_md5: &m.key_md5,
2727                    },
2728                )
2729            } else if let Some((ref dek, ref wrapped)) = kms_wrap {
2730                let meta = req.input.metadata.get_or_insert_with(Default::default);
2731                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2732                meta.insert("s4-sse-type".into(), "aws:kms".into());
2733                meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
2734                // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
2735                // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
2736                // `&T` here via `Deref<Target=T>`, so the binding picks
2737                // up the inner array reference without copying. The array
2738                // stays in the `Zeroizing` wrapper that owns it and gets
2739                // wiped when `kms_wrap` drops at the end of `put_object`.
2740                let dek_ref: &[u8; 32] = dek;
2741                crate::sse::encrypt_with_source(
2742                    &compressed,
2743                    crate::sse::SseSource::Kms {
2744                        dek: dek_ref,
2745                        wrapped,
2746                    },
2747                )
2748            } else if let Some(keyring) = self.sse_keyring.as_ref() {
2749                // SSE-S4 is server-driven transparent encryption; the
2750                // client didn't ask for SSE. We stamp `s4-encrypted`
2751                // (internal flag the GET path needs) but deliberately
2752                // do NOT stamp `s4-sse-type` — that lights up the HEAD
2753                // echo of `x-amz-server-side-encryption: AES256`,
2754                // which would falsely advertise AWS-style SSE-S3
2755                // semantics the operator didn't request.
2756                let meta = req.input.metadata.get_or_insert_with(Default::default);
2757                meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2758                // v0.8 #52: when `--sse-chunk-size > 0` is configured,
2759                // emit the chunked S4E5 frame so the matching GET can
2760                // stream-decrypt instead of buffering 5 GiB before
2761                // emitting a byte. Falls back to the buffered S4E2
2762                // frame at chunk_size=0 (default) so existing
2763                // deployments are bit-for-bit unchanged.
2764                if self.sse_chunk_size > 0 {
2765                    crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
2766                        .map_err(|e| {
2767                            S3Error::with_message(
2768                                S3ErrorCode::InternalError,
2769                                format!("SSE-S4 chunked encrypt failed: {e}"),
2770                            )
2771                        })?
2772                } else {
2773                    crate::sse::encrypt_v2(&compressed, keyring)
2774                }
2775            } else {
2776                compressed.clone()
2777            };
2778            // v0.6 #40: capture the about-to-be-sent body + metadata so
2779            // the replication dispatcher (run after the source PUT
2780            // succeeds) can hand the same backend bytes to the
2781            // destination bucket. `Bytes` clone is cheap (refcounted).
2782            let replication_body = body_to_send.clone();
2783            let replication_metadata = req.input.metadata.clone();
2784            // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
2785            // makes the body longer than the post-compression bytes
2786            // (header + nonce + tag overhead). The earlier
2787            // content_length stamp at compressed.len() is now stale, so
2788            // re-stamp from the actual bytes about to be sent or the
2789            // backend (real S3 / MinIO) rejects with
2790            // `StreamLengthMismatch`. MemoryBackend never validated
2791            // this, which is why mock-only tests passed.
2792            req.input.content_length = Some(body_to_send.len() as i64);
2793            req.input.body = Some(bytes_to_blob(body_to_send));
2794            // v0.5 #34: pre-allocate a version-id when the bucket is
2795            // Enabled, then redirect the backend storage key to the
2796            // shadow path so older versions survive newer PUTs.
2797            // Suspended / Unversioned buckets keep using the plain
2798            // `<key>` (S3 spec: Suspended overwrites the same backend
2799            // object). Pre-allocation (instead of recording after PUT)
2800            // ensures the shadow key + the response's
2801            // `x-amz-version-id` use the same vid.
2802            let pending_version: Option<crate::versioning::PutOutcome> = self
2803                .versioning
2804                .as_ref()
2805                .map(|mgr| mgr.state(&put_bucket))
2806                .map(|state| match state {
2807                    crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
2808                        version_id: crate::versioning::VersioningManager::new_version_id(),
2809                        versioned_response: true,
2810                    },
2811                    crate::versioning::VersioningState::Suspended
2812                    | crate::versioning::VersioningState::Unversioned => {
2813                        crate::versioning::PutOutcome {
2814                            version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
2815                            versioned_response: false,
2816                        }
2817                    }
2818                });
2819            if let Some(ref pv) = pending_version
2820                && pv.versioned_response
2821            {
2822                req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
2823            }
2824            // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
2825            // the move into `req.input` is consumed by the backend call.
2826            // The sidecar's `source_compressed_size` is checked against
2827            // the live HEAD `Content-Length` on Range GET to detect a
2828            // backend-side mutation.
2829            let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2830            let mut backend_resp = self.backend.put_object(req).await;
2831            if let Some(mut idx) = sidecar_index
2832                && let Ok(ref resp) = backend_resp
2833                && idx.entries.len() > 1
2834            {
2835                // 1 chunk しかない (small object) なら sidecar は意味がない (=
2836                // partial fetch しても full body と同じ範囲) ので省略。
2837                // Sidecar は user-visible key で書く (latest version の
2838                // partial fetch path 用)。Old versions の Range GET は今 task
2839                // の scope 外 (full read fallback でも意味的には正しい)。
2840                //
2841                // v0.8.4 #73 H-2: stamp the version-binding fields the
2842                // GET path needs to detect a stale / attacker-written
2843                // sidecar. ETag comes from the backend's PUT response —
2844                // when missing (some backends don't return an ETag) the
2845                // sidecar's `source_etag` stays `None` and only the size
2846                // binding remains; the GET HEAD is then expected to
2847                // see the same backend ETag (None vs None) and treat the
2848                // pair as consistent.
2849                let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
2850                idx.source_etag = source_etag;
2851                idx.source_compressed_size = backend_object_size;
2852                self.write_sidecar(&put_bucket, &put_key, &idx).await;
2853            }
2854            // v0.5 #34: commit the new version into the manager only on
2855            // backend success. Use the pre-allocated vid so the response
2856            // header and the chain entry agree.
2857            if let (Some(mgr), Some(pv), Ok(resp)) = (
2858                self.versioning.as_ref(),
2859                pending_version.as_ref(),
2860                backend_resp.as_mut(),
2861            ) {
2862                let etag = resp
2863                    .output
2864                    .e_tag
2865                    .clone()
2866                    .map(ETag::into_value)
2867                    .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
2868                let now = chrono::Utc::now();
2869                mgr.commit_put_with_version(
2870                    &put_bucket,
2871                    &put_key,
2872                    crate::versioning::VersionEntry {
2873                        version_id: pv.version_id.clone(),
2874                        etag,
2875                        size: original_size,
2876                        is_delete_marker: false,
2877                        created_at: now,
2878                    },
2879                );
2880                if pv.versioned_response {
2881                    resp.output.version_id = Some(pv.version_id.clone());
2882                }
2883            }
2884            // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
2885            // so the client knows the server actually applied the
2886            // requested algorithm and which key fingerprint matched.
2887            if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
2888                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
2889                resp.output.sse_customer_key_md5 =
2890                    Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
2891            }
2892            // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
2893            // the backend returned (AWS KMS returns the ARN even when
2894            // the request used an alias).
2895            if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
2896                resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
2897                    ServerSideEncryption::AWS_KMS,
2898                ));
2899                resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
2900            }
2901            // v0.5 #30: persist any per-PUT explicit retention / legal
2902            // hold the client supplied, then auto-apply the bucket
2903            // default (no-op when state is already populated). The
2904            // explicit fields take precedence — the bucket-default
2905            // helper bails out as soon as it sees any retention.
2906            if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
2907                if explicit_lock_mode.is_some()
2908                    || explicit_retain_until.is_some()
2909                    || explicit_legal_hold_on.is_some()
2910                {
2911                    let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
2912                    if let Some(m) = explicit_lock_mode {
2913                        state.mode = Some(m);
2914                    }
2915                    if let Some(u) = explicit_retain_until {
2916                        state.retain_until = Some(u);
2917                    }
2918                    if let Some(lh) = explicit_legal_hold_on {
2919                        state.legal_hold_on = lh;
2920                    }
2921                    mgr.set(&put_bucket, &put_key, state);
2922                }
2923                mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
2924            }
2925            let _ = (original_size, compressed_size); // mute unused warnings
2926            let elapsed = put_start.elapsed();
2927            crate::metrics::record_put(
2928                codec_label,
2929                original_size,
2930                compressed_size,
2931                elapsed.as_secs_f64(),
2932                backend_resp.is_ok(),
2933            );
2934            // v0.4 #20: structured access-log entry (best-effort).
2935            self.record_access(
2936                access_preamble,
2937                "REST.PUT.OBJECT",
2938                &put_bucket,
2939                Some(&put_key),
2940                if backend_resp.is_ok() { 200 } else { 500 },
2941                compressed_size,
2942                original_size,
2943                elapsed.as_millis() as u64,
2944                backend_resp.as_ref().err().map(|e| e.code().as_str()),
2945            )
2946            .await;
2947            info!(
2948                op = "put_object",
2949                bucket = %put_bucket,
2950                key = %put_key,
2951                codec = codec_label,
2952                bytes_in = original_size,
2953                bytes_out = compressed_size,
2954                ratio = format!(
2955                    "{:.3}",
2956                    if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
2957                ),
2958                latency_ms = elapsed.as_millis() as u64,
2959                ok = backend_resp.is_ok(),
2960                "S4 put completed"
2961            );
2962            // v0.6 #35: fire bucket-notification destinations (best-effort,
2963            // detached). Skipped when no manager is attached or when the
2964            // bucket has no rule matching `s3:ObjectCreated:Put` for this
2965            // key.
2966            if backend_resp.is_ok()
2967                && let Some(mgr) = self.notifications.as_ref()
2968            {
2969                let dests = mgr.match_destinations(
2970                    &put_bucket,
2971                    &crate::notifications::EventType::ObjectCreatedPut,
2972                    &put_key,
2973                );
2974                if !dests.is_empty() {
2975                    let etag = backend_resp
2976                        .as_ref()
2977                        .ok()
2978                        .and_then(|r| r.output.e_tag.clone())
2979                        .map(ETag::into_value);
2980                    let version_id = pending_version
2981                        .as_ref()
2982                        .filter(|pv| pv.versioned_response)
2983                        .map(|pv| pv.version_id.clone());
2984                    tokio::spawn(crate::notifications::dispatch_event(
2985                        Arc::clone(mgr),
2986                        put_bucket.clone(),
2987                        put_key.clone(),
2988                        crate::notifications::EventType::ObjectCreatedPut,
2989                        Some(original_size),
2990                        etag,
2991                        version_id,
2992                        format!("S4-{}", uuid::Uuid::new_v4()),
2993                    ));
2994                }
2995            }
2996            // v0.6 #39: persist parsed `x-amz-tagging` tags into the
2997            // tagging manager on a successful PUT. AWS PutObject's
2998            // tagging is a full-replace operation (not a merge), so
2999            // any pre-existing entry for `(bucket, key)` is overwritten.
3000            if backend_resp.is_ok()
3001                && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3002            {
3003                mgr.put_object_tags(&put_bucket, &put_key, tags);
3004            }
3005            // v0.6 #40: cross-bucket replication fire-point. On
3006            // successful source PUT, consult the replication manager;
3007            // when an enabled rule matches, mark the source key
3008            // `Pending` and spawn a detached task that PUTs the same
3009            // backend bytes + metadata to the rule's destination
3010            // bucket. The dispatcher itself records `Completed` /
3011            // `Failed` and bumps the drop counter on retry-budget
3012            // exhaustion.
3013            self.spawn_replication_if_matched(
3014                &put_bucket,
3015                &put_key,
3016                &request_tags,
3017                &replication_body,
3018                &replication_metadata,
3019                backend_resp.is_ok(),
3020                pending_version.as_ref(),
3021            );
3022            return backend_resp;
3023        }
3024        // Body-less PUT (rare: zero-length object). Mirror the body-full
3025        // versioning hooks so list_object_versions / GET-by-version still see
3026        // empty-body objects in the chain.
3027        let pending_version: Option<crate::versioning::PutOutcome> = self
3028            .versioning
3029            .as_ref()
3030            .map(|mgr| mgr.state(&put_bucket))
3031            .map(|state| match state {
3032                crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3033                    version_id: crate::versioning::VersioningManager::new_version_id(),
3034                    versioned_response: true,
3035                },
3036                _ => crate::versioning::PutOutcome {
3037                    version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3038                    versioned_response: false,
3039                },
3040            });
3041        if let Some(ref pv) = pending_version
3042            && pv.versioned_response
3043        {
3044            req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3045        }
3046        let mut backend_resp = self.backend.put_object(req).await;
3047        if let (Some(mgr), Some(pv), Ok(resp)) = (
3048            self.versioning.as_ref(),
3049            pending_version.as_ref(),
3050            backend_resp.as_mut(),
3051        ) {
3052            let etag = resp
3053                .output
3054                .e_tag
3055                .clone()
3056                .map(ETag::into_value)
3057                .unwrap_or_default();
3058            let now = chrono::Utc::now();
3059            mgr.commit_put_with_version(
3060                &put_bucket,
3061                &put_key,
3062                crate::versioning::VersionEntry {
3063                    version_id: pv.version_id.clone(),
3064                    etag,
3065                    size: 0,
3066                    is_delete_marker: false,
3067                    created_at: now,
3068                },
3069            );
3070            if pv.versioned_response {
3071                resp.output.version_id = Some(pv.version_id.clone());
3072            }
3073        }
3074        // v0.5 #30: same explicit-then-default lock-state commit as the
3075        // body-bearing branch above, so a zero-length PUT also picks up
3076        // bucket-default retention.
3077        if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3078            if explicit_lock_mode.is_some()
3079                || explicit_retain_until.is_some()
3080                || explicit_legal_hold_on.is_some()
3081            {
3082                let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3083                if let Some(m) = explicit_lock_mode {
3084                    state.mode = Some(m);
3085                }
3086                if let Some(u) = explicit_retain_until {
3087                    state.retain_until = Some(u);
3088                }
3089                if let Some(lh) = explicit_legal_hold_on {
3090                    state.legal_hold_on = lh;
3091                }
3092                mgr.set(&put_bucket, &put_key, state);
3093            }
3094            mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3095        }
3096        // v0.6 #35: same notification fire-point as the body-bearing PUT
3097        // branch above (zero-length objects still match `ObjectCreated:Put`
3098        // rules per the AWS event taxonomy).
3099        if backend_resp.is_ok()
3100            && let Some(mgr) = self.notifications.as_ref()
3101        {
3102            let dests = mgr.match_destinations(
3103                &put_bucket,
3104                &crate::notifications::EventType::ObjectCreatedPut,
3105                &put_key,
3106            );
3107            if !dests.is_empty() {
3108                let etag = backend_resp
3109                    .as_ref()
3110                    .ok()
3111                    .and_then(|r| r.output.e_tag.clone())
3112                    .map(ETag::into_value);
3113                let version_id = pending_version
3114                    .as_ref()
3115                    .filter(|pv| pv.versioned_response)
3116                    .map(|pv| pv.version_id.clone());
3117                tokio::spawn(crate::notifications::dispatch_event(
3118                    Arc::clone(mgr),
3119                    put_bucket.clone(),
3120                    put_key.clone(),
3121                    crate::notifications::EventType::ObjectCreatedPut,
3122                    Some(0),
3123                    etag,
3124                    version_id,
3125                    format!("S4-{}", uuid::Uuid::new_v4()),
3126                ));
3127            }
3128        }
3129        // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3130        // (zero-length) PUT branch too — same shape as the body-bearing
3131        // branch above.
3132        if backend_resp.is_ok()
3133            && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3134        {
3135            mgr.put_object_tags(&put_bucket, &put_key, tags);
3136        }
3137        // v0.6 #40: cross-bucket replication for the zero-length PUT
3138        // branch — same shape as the body-bearing branch above.
3139        // v0.8.2 #61: pass `pending_version` so a versioned source's
3140        // destination receives the same shadow-key path.
3141        self.spawn_replication_if_matched(
3142            &put_bucket,
3143            &put_key,
3144            &request_tags,
3145            &bytes::Bytes::new(),
3146            &None,
3147            backend_resp.is_ok(),
3148            pending_version.as_ref(),
3149        );
3150        backend_resp
3151    }
3152
3153    // === 圧縮を解く path (GET) ===
3154    #[tracing::instrument(
3155        name = "s4.get_object",
3156        skip(self, req),
3157        fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3158    )]
3159    async fn get_object(
3160        &self,
3161        mut req: S3Request<GetObjectInput>,
3162    ) -> S3Result<S3Response<GetObjectOutput>> {
3163        let get_start = Instant::now();
3164        let get_bucket = req.input.bucket.clone();
3165        let get_key = req.input.key.clone();
3166        // v0.8.16 F-13: reserved-name guard now also fires on GET.
3167        // The v0.8.15 #137 fix only blocked PUT / Copy / Create —
3168        // a curious or hostile client could still
3169        // `GetObject(<key>.s4index)` and read the raw sidecar
3170        // (frame layout, source ETag, source compressed size).
3171        // The list filter already hides the entry from listings;
3172        // explicit reject closes the directed-read leak.
3173        if s4_codec::index::is_reserved_sidecar_key(&get_key) {
3174            return Err(S3Error::with_message(
3175                S3ErrorCode::NoSuchKey,
3176                format!("object key {get_key:?} is reserved for S4 internal sidecars"),
3177            ));
3178        }
3179        self.enforce_rate_limit(&req, &get_bucket)?;
3180        self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3181        // Range request の事前検出 (decompress 後 slice する path に使う)。
3182        let range_request = req.input.range.take();
3183        // v0.5 #27: pull SSE-C material from the input headers before
3184        // the request is moved into the backend. A header parse error
3185        // fails fast (no body fetch). The material is consumed below
3186        // when decrypting an S4E3-framed body; the SSE-C headers on
3187        // `req.input` are cleared so the backend doesn't see them.
3188        let sse_c_alg = req.input.sse_customer_algorithm.take();
3189        let sse_c_key = req.input.sse_customer_key.take();
3190        let sse_c_md5 = req.input.sse_customer_key_md5.take();
3191        let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3192
3193        // v0.5 #34: route the GET through the VersioningManager when
3194        // attached AND the bucket is in a versioning-aware state.
3195        // Resolves which version to fetch (explicit `?versionId=` query
3196        // param vs. chain latest), translates a delete-marker into 404
3197        // NoSuchKey, and rewrites the backend storage key to the shadow
3198        // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3199        // versions. `resolved_version_id` is stamped onto the response
3200        // so clients see a coherent `x-amz-version-id` header.
3201        //
3202        // When the bucket is Unversioned (or no manager attached), the
3203        // chain-resolution step is skipped and the request flows
3204        // through the existing single-key path unchanged.
3205        let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3206            Some(mgr)
3207                if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3208            {
3209                let req_vid = req.input.version_id.take();
3210                let entry = match req_vid.as_deref() {
3211                    Some(vid) => {
3212                        mgr.lookup_version(&get_bucket, &get_key, vid)
3213                            .ok_or_else(|| {
3214                                S3Error::with_message(
3215                                    S3ErrorCode::NoSuchVersion,
3216                                    format!("no such version: {vid}"),
3217                                )
3218                            })?
3219                    }
3220                    None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3221                        S3Error::with_message(
3222                            S3ErrorCode::NoSuchKey,
3223                            format!("no such key: {get_key}"),
3224                        )
3225                    })?,
3226                };
3227                if entry.is_delete_marker {
3228                    // S3 spec: GET without versionId on a
3229                    // delete-marker latest → 404 NoSuchKey + the
3230                    // response carries `x-amz-delete-marker: true`.
3231                    // GET with explicit versionId pointing at a delete
3232                    // marker → 405 MethodNotAllowed; we surface
3233                    // NoSuchKey here for both since s3s collapses them
3234                    // into the same not-found error path.
3235                    return Err(S3Error::with_message(
3236                        S3ErrorCode::NoSuchKey,
3237                        format!("delete marker is the current version of {get_key}"),
3238                    ));
3239                }
3240                if entry.version_id != crate::versioning::NULL_VERSION_ID {
3241                    req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3242                }
3243                Some(entry.version_id)
3244            }
3245            _ => None,
3246        };
3247
3248        // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3249        // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3250        // 必要 frame だけを backend に Range GET し帯域節約する。
3251        //
3252        // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3253        // verify the source object hasn't been overwritten / mutated since
3254        // the sidecar was stamped. The sidecar carries the backend ETag
3255        // captured at PUT time (`source_etag`); a HEAD against the current
3256        // backend object tells us the live ETag. If they disagree we treat
3257        // the sidecar as stale and fall through to the full-GET path —
3258        // returning the wrong frames for a Range request would surface as
3259        // a CRC mismatch deeper in the stack but would also potentially
3260        // disclose unrelated frames if a hostile operator wrote the
3261        // sidecar themselves. Fail-open to "full read" is the safe default.
3262        //
3263        // Legacy v1 sidecars (no `source_etag` populated) keep the old
3264        // best-effort behaviour so existing on-disk indexes don't suddenly
3265        // start missing the partial-fetch path.
3266        if let Some(ref r) = range_request
3267            && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3268            && self
3269                .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3270                .await
3271        {
3272            let total = index.total_original_size();
3273            let (start, end_exclusive) = match resolve_range(r, total) {
3274                Ok(v) => v,
3275                Err(e) => {
3276                    return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3277                }
3278            };
3279            if let Some(plan) = index.lookup_range(start, end_exclusive) {
3280                return self
3281                    .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3282                    .await;
3283            }
3284        }
3285        let mut resp = self.backend.get_object(req).await?;
3286        // v0.5 #34: stamp the resolved version-id so the client sees a
3287        // coherent `x-amz-version-id` header (only for chains owned by
3288        // the manager — Unversioned buckets / no-manager paths never
3289        // set this).
3290        if let Some(ref vid) = resolved_version_id {
3291            resp.output.version_id = Some(vid.clone());
3292        }
3293        let is_multipart = is_multipart_object(&resp.output.metadata);
3294        let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3295        // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3296        // multipart と同じ path に流す。
3297        let needs_frame_parse = is_multipart || is_framed_v2;
3298        let manifest_opt = extract_manifest(&resp.output.metadata);
3299
3300        if !needs_frame_parse && manifest_opt.is_none() {
3301            // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3302            debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3303            return Ok(resp);
3304        }
3305
3306        if let Some(blob) = resp.output.body.take() {
3307            // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3308            // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3309            // before any frame parse / streaming decompress. Encrypted
3310            // bodies are opaque to the codec; this also forces the
3311            // buffered path because AES-GCM needs the full body for tag
3312            // verify. SSE-C uses the per-request customer key, SSE-S4
3313            // falls back to the configured keyring.
3314            let blob = if is_sse_encrypted(&resp.output.metadata) {
3315                let body = collect_blob(blob, self.max_body_bytes)
3316                    .await
3317                    .map_err(internal("collect SSE-encrypted body"))?;
3318                // v0.5 #28: peek the frame magic to route the right
3319                // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3320                // through the KMS backend (async). S4E1/E2/E3 take
3321                // the sync path (keyring or customer key).
3322                //
3323                // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3324                // SSE-S4 frames take the *streaming* path — we hand
3325                // the response body a per-chunk verify-and-emit
3326                // Stream so the client sees chunk 0 plaintext after
3327                // one chunk-worth of AES-GCM verify (vs. waiting
3328                // for the whole body's tag), and the gateway no
3329                // longer needs to materialize the full plaintext
3330                // in memory before responding. SSE-C is out of
3331                // scope for the chunked path (chunked S4E3 is a
3332                // follow-up), so this branch requires the SSE-S4
3333                // keyring to be wired and `get_sse_c_material` to
3334                // be absent — otherwise we surface a clear
3335                // misconfiguration error instead of silently
3336                // falling through to the buffered chunked path.
3337                // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3338                // only correct when the decrypted body IS the user's
3339                // plaintext as-stored. If the object went through the
3340                // codec (compressed) or carries S4F2 frames, returning
3341                // the decrypt stream directly hands the client
3342                // compressed / framed bytes. Restrict the early-return
3343                // to codec=Passthrough + non-framed objects; everything
3344                // else falls through to the buffered path, which
3345                // decrypt-buffers S4E5/S4E6 via
3346                // `decrypt_chunked_buffered_default` and then runs the
3347                // existing decompress pipeline.
3348                let chunked_streaming_safe = !needs_frame_parse
3349                    && manifest_opt
3350                        .as_ref()
3351                        .map(|m| m.codec == CodecKind::Passthrough)
3352                        .unwrap_or(false);
3353                if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3354                    && get_sse_c_material.is_none()
3355                    && chunked_streaming_safe
3356                {
3357                    let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3358                        S3Error::with_message(
3359                            S3ErrorCode::InvalidRequest,
3360                            "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
3361                        )
3362                    })?;
3363                    let body_len = body.len() as u64;
3364                    let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
3365                    // Stream is `'static` (the keyring borrow is
3366                    // consumed up front; the cipher lives inside
3367                    // the stream state — see decrypt_chunked_stream
3368                    // doc), so we can move it straight into a
3369                    // StreamingBlob without lifetime gymnastics.
3370                    use futures::StreamExt;
3371                    let mapped = stream.map(|r| {
3372                        r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
3373                    });
3374                    use s3s::dto::StreamingBlob;
3375                    resp.output.body = Some(StreamingBlob::wrap(mapped));
3376                    // Plaintext content_length is unknown until all
3377                    // chunks have been verified; null it out so the
3378                    // ByteStream wrapper reports `unknown` to the
3379                    // HTTP layer (which then emits chunked transfer-
3380                    // encoding) rather than lying about the size.
3381                    resp.output.content_length = None;
3382                    // The backend's checksums + ETag describe the
3383                    // encrypted body (S4E5/S4E6 wire format), not
3384                    // the plaintext we're about to stream — clear them
3385                    // so the AWS SDK doesn't fail the GET with a
3386                    // ChecksumMismatch on a successful round-trip.
3387                    // Mirrors the streaming-zstd path at L1180-1185.
3388                    resp.output.checksum_crc32 = None;
3389                    resp.output.checksum_crc32c = None;
3390                    resp.output.checksum_crc64nvme = None;
3391                    resp.output.checksum_sha1 = None;
3392                    resp.output.checksum_sha256 = None;
3393                    resp.output.e_tag = None;
3394                    let elapsed = get_start.elapsed();
3395                    crate::metrics::record_get(
3396                        "sse-s4-chunked",
3397                        body_len,
3398                        body_len,
3399                        elapsed.as_secs_f64(),
3400                        true,
3401                    );
3402                    return Ok(resp);
3403                }
3404                let plain = match crate::sse::peek_magic(&body) {
3405                    Some("S4E4") => {
3406                        let kms = self.kms.as_ref().ok_or_else(|| {
3407                            S3Error::with_message(
3408                                S3ErrorCode::InvalidRequest,
3409                                "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3410                            )
3411                        })?;
3412                        let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
3413                        crate::sse::decrypt_with_kms(&body, kms_ref)
3414                            .await
3415                            .map_err(|e| match e {
3416                                crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
3417                                other => S3Error::with_message(
3418                                    S3ErrorCode::InternalError,
3419                                    format!("SSE-KMS decrypt failed: {other}"),
3420                                ),
3421                            })?
3422                    }
3423                    _ => {
3424                        if let Some(ref m) = get_sse_c_material {
3425                            crate::sse::decrypt(
3426                                &body,
3427                                crate::sse::SseSource::CustomerKey {
3428                                    key: &m.key,
3429                                    key_md5: &m.key_md5,
3430                                },
3431                            )
3432                            .map_err(sse_c_error_to_s3)?
3433                        } else {
3434                            let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
3435                                S3Error::with_message(
3436                                    S3ErrorCode::InvalidRequest,
3437                                    "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
3438                                )
3439                            })?;
3440                            crate::sse::decrypt(&body, keyring).map_err(|e| {
3441                                S3Error::with_message(
3442                                    S3ErrorCode::InternalError,
3443                                    format!("SSE-S4 decrypt failed: {e}"),
3444                                )
3445                            })?
3446                        }
3447                    }
3448                };
3449                // v0.5 #28: parse out the on-disk wrapped DEK's key id
3450                // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
3451                if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
3452                    && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
3453                {
3454                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3455                        ServerSideEncryption::AWS_KMS,
3456                    ));
3457                    resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
3458                }
3459                bytes_to_blob(plain)
3460            } else if let Some(ref m) = get_sse_c_material {
3461                // Client sent SSE-C headers for an unencrypted object —
3462                // mirror AWS S3's 400 InvalidRequest.
3463                let _ = m;
3464                return Err(sse_c_error_to_s3(
3465                    crate::sse::SseError::CustomerKeyUnexpected,
3466                ));
3467            } else {
3468                blob
3469            };
3470            // v0.5 #27: SSE-C echo on success — algorithm + key MD5
3471            // tell the client that the supplied key was the one used.
3472            if let Some(ref m) = get_sse_c_material {
3473                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3474                resp.output.sse_customer_key_md5 =
3475                    Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3476            }
3477            // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
3478            // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
3479            // codec が streaming-aware なら body を chunk-by-chunk で decompress して
3480            // 即座に client に流す。
3481            //
3482            // ただし Range request 時は streaming できない (slice するため total bytes
3483            // が必要) → buffered path に fall through。
3484            if range_request.is_none()
3485                && !needs_frame_parse
3486                && let Some(ref m) = manifest_opt
3487                && supports_streaming_decompress(m.codec)
3488                && m.codec == CodecKind::CpuZstd
3489            {
3490                // v0.8.4 #73 H-1: wrap the decompressor output in a
3491                // rolling-CRC32C verifier so a tampered ciphertext (or a
3492                // backend-side corruption that the zstd decoder happens
3493                // to "successfully" decode into wrong bytes) surfaces as
3494                // a streaming error tail at EOF instead of silently
3495                // delivering corrupt plaintext to the client. The wrap
3496                // is a pure pass-through during the body — no extra
3497                // buffering, TTFB unaffected — and the integrity
3498                // decision lands at the last chunk.
3499                let decompressed_blob = cpu_zstd_decompress_stream(blob);
3500                let verified_reader = Crc32cVerifyingReader::new(
3501                    blob_to_async_read(decompressed_blob),
3502                    m.crc32c,
3503                    m.original_size,
3504                );
3505                let verified_blob = async_read_to_blob(verified_reader);
3506                resp.output.content_length = Some(m.original_size as i64);
3507                resp.output.checksum_crc32 = None;
3508                resp.output.checksum_crc32c = None;
3509                resp.output.checksum_crc64nvme = None;
3510                resp.output.checksum_sha1 = None;
3511                resp.output.checksum_sha256 = None;
3512                resp.output.e_tag = None;
3513                resp.output.body = Some(verified_blob);
3514                let elapsed = get_start.elapsed();
3515                crate::metrics::record_get(
3516                    m.codec.as_str(),
3517                    m.compressed_size,
3518                    m.original_size,
3519                    elapsed.as_secs_f64(),
3520                    true,
3521                );
3522                info!(
3523                    op = "get_object",
3524                    bucket = %get_bucket,
3525                    key = %get_key,
3526                    codec = m.codec.as_str(),
3527                    bytes_in = m.compressed_size,
3528                    bytes_out = m.original_size,
3529                    path = "streaming",
3530                    setup_latency_ms = elapsed.as_millis() as u64,
3531                    "S4 get started (streaming)"
3532                );
3533                return Ok(resp);
3534            }
3535            // Passthrough: そのまま流す (Range なしの場合のみ streaming)
3536            if range_request.is_none()
3537                && !needs_frame_parse
3538                && let Some(ref m) = manifest_opt
3539                && m.codec == CodecKind::Passthrough
3540            {
3541                resp.output.content_length = Some(m.original_size as i64);
3542                resp.output.checksum_crc32 = None;
3543                resp.output.checksum_crc32c = None;
3544                resp.output.checksum_crc64nvme = None;
3545                resp.output.checksum_sha1 = None;
3546                resp.output.checksum_sha256 = None;
3547                resp.output.e_tag = None;
3548                resp.output.body = Some(blob);
3549                debug!("S4 get_object: passthrough streaming");
3550                return Ok(resp);
3551            }
3552
3553            // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
3554            let bytes = collect_blob(blob, self.max_body_bytes)
3555                .await
3556                .map_err(internal("collect get body"))?;
3557
3558            let decompressed = if needs_frame_parse {
3559                // multipart objects と framed-v2 single-PUT objects は同じ
3560                // S4F2 frame 列なので decompress_multipart で統一処理
3561                self.decompress_multipart(bytes).await?
3562            } else {
3563                let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
3564                self.registry
3565                    .decompress(bytes, manifest)
3566                    .await
3567                    .map_err(internal("registry decompress"))?
3568            };
3569
3570            // Range request があれば slice。なければ full body を返す。
3571            let total_size = decompressed.len() as u64;
3572            let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
3573                let (start, end) = resolve_range(r, total_size)
3574                    .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
3575                let sliced = decompressed.slice(start as usize..end as usize);
3576                resp.output.content_range = Some(format!(
3577                    "bytes {start}-{}/{total_size}",
3578                    end.saturating_sub(1)
3579                ));
3580                (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
3581            } else {
3582                (decompressed, None)
3583            };
3584            // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
3585            // 圧縮 size のままだと downstream が body を途中で切ってしまう)
3586            resp.output.content_length = Some(final_bytes.len() as i64);
3587            // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
3588            // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
3589            // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
3590            // (manifest 内 / frame 内) で integrity を保証する設計にする。
3591            resp.output.checksum_crc32 = None;
3592            resp.output.checksum_crc32c = None;
3593            resp.output.checksum_crc64nvme = None;
3594            resp.output.checksum_sha1 = None;
3595            resp.output.checksum_sha256 = None;
3596            resp.output.e_tag = None;
3597            let returned_size = final_bytes.len() as u64;
3598            let codec_label = manifest_opt
3599                .as_ref()
3600                .map(|m| m.codec.as_str())
3601                .unwrap_or("multipart");
3602            resp.output.body = Some(bytes_to_blob(final_bytes));
3603            if let Some(status) = status_override {
3604                resp.status = Some(status);
3605            }
3606            let elapsed = get_start.elapsed();
3607            crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
3608            info!(
3609                op = "get_object",
3610                bucket = %get_bucket,
3611                key = %get_key,
3612                codec = codec_label,
3613                bytes_out = returned_size,
3614                total_object_size = total_size,
3615                range = range_request.is_some(),
3616                path = "buffered",
3617                latency_ms = elapsed.as_millis() as u64,
3618                "S4 get completed (buffered)"
3619            );
3620        }
3621        // v0.6 #40: echo the recorded `x-amz-replication-status` so
3622        // consumers can poll progress (PENDING / COMPLETED / FAILED).
3623        if let Some(mgr) = self.replication.as_ref()
3624            && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
3625        {
3626            resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3627                status.as_aws_str().to_owned(),
3628            ));
3629        }
3630        Ok(resp)
3631    }
3632
3633    // === passthrough delegations ===
3634    async fn head_bucket(
3635        &self,
3636        req: S3Request<HeadBucketInput>,
3637    ) -> S3Result<S3Response<HeadBucketOutput>> {
3638        self.backend.head_bucket(req).await
3639    }
3640    async fn list_buckets(
3641        &self,
3642        req: S3Request<ListBucketsInput>,
3643    ) -> S3Result<S3Response<ListBucketsOutput>> {
3644        self.backend.list_buckets(req).await
3645    }
3646    async fn create_bucket(
3647        &self,
3648        req: S3Request<CreateBucketInput>,
3649    ) -> S3Result<S3Response<CreateBucketOutput>> {
3650        self.backend.create_bucket(req).await
3651    }
3652    async fn delete_bucket(
3653        &self,
3654        req: S3Request<DeleteBucketInput>,
3655    ) -> S3Result<S3Response<DeleteBucketOutput>> {
3656        self.backend.delete_bucket(req).await
3657    }
3658    async fn head_object(
3659        &self,
3660        req: S3Request<HeadObjectInput>,
3661    ) -> S3Result<S3Response<HeadObjectOutput>> {
3662        // v0.6 #40: capture bucket/key before req is consumed so the
3663        // replication-status echo can look the entry up.
3664        let head_bucket = req.input.bucket.clone();
3665        let head_key = req.input.key.clone();
3666        // v0.8.16 F-13: same reserved-name guard as `get_object`.
3667        if s4_codec::index::is_reserved_sidecar_key(&head_key) {
3668            return Err(S3Error::with_message(
3669                S3ErrorCode::NoSuchKey,
3670                format!("object key {head_key:?} is reserved for S4 internal sidecars"),
3671            ));
3672        }
3673        let mut resp = self.backend.head_object(req).await?;
3674        if let Some(manifest) = extract_manifest(&resp.output.metadata) {
3675            // 客側には decompress 後の意味のある content_length / checksum を返す。
3676            // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
3677            // (S4 は manifest 内の crc32c で integrity を担保する)。
3678            resp.output.content_length = Some(manifest.original_size as i64);
3679            resp.output.checksum_crc32 = None;
3680            resp.output.checksum_crc32c = None;
3681            resp.output.checksum_crc64nvme = None;
3682            resp.output.checksum_sha1 = None;
3683            resp.output.checksum_sha256 = None;
3684            resp.output.e_tag = None;
3685        }
3686        // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
3687        // / FAILED) so consumers can poll progress without a GET.
3688        if let Some(mgr) = self.replication.as_ref()
3689            && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
3690        {
3691            resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3692                status.as_aws_str().to_owned(),
3693            ));
3694        }
3695        // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
3696        // and pipelines see the same posture they got on PUT. The PUT
3697        // path stamps `s4-sse-type` metadata for exactly this — HEAD
3698        // doesn't fetch the body, so it can't peek frame magic.
3699        if let Some(meta) = resp.output.metadata.as_ref()
3700            && let Some(sse_type) = meta.get("s4-sse-type")
3701        {
3702            {
3703                match sse_type.as_str() {
3704                    "aws:kms" => {
3705                        resp.output.server_side_encryption = Some(
3706                            ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
3707                        );
3708                        if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
3709                            resp.output.ssekms_key_id = Some(key_id.clone());
3710                        }
3711                    }
3712                    _ => {
3713                        resp.output.server_side_encryption = Some(
3714                            ServerSideEncryption::from_static(ServerSideEncryption::AES256),
3715                        );
3716                        if let Some(md5) = meta.get("s4-sse-c-key-md5") {
3717                            resp.output.sse_customer_algorithm =
3718                                Some(crate::sse::SSE_C_ALGORITHM.into());
3719                            resp.output.sse_customer_key_md5 = Some(md5.clone());
3720                        }
3721                    }
3722                }
3723            }
3724        }
3725        Ok(resp)
3726    }
3727    async fn delete_object(
3728        &self,
3729        mut req: S3Request<DeleteObjectInput>,
3730    ) -> S3Result<S3Response<DeleteObjectOutput>> {
3731        let bucket = req.input.bucket.clone();
3732        let key = req.input.key.clone();
3733        // v0.8.16 F-13: reserved-name guard on DELETE. Without it a
3734        // hostile client could `DeleteObject(<key>.s4index)` to
3735        // orphan the sidecar, silently disabling Range-GET
3736        // partial-fetch for the corresponding `<key>`. The S4
3737        // internal cleanup path (`write_sidecar` and friends)
3738        // talks to `self.backend.delete_object(...)` directly, NOT
3739        // through this trait method, so the guard doesn't break
3740        // legitimate sidecar cleanup.
3741        if s4_codec::index::is_reserved_sidecar_key(&key) {
3742            let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
3743                .unwrap_or(S3ErrorCode::InvalidArgument);
3744            return Err(S3Error::with_message(
3745                code,
3746                format!(
3747                    "object key {key:?} is reserved (suffix `{}` is used for S4 internal sidecars)",
3748                    s4_codec::index::SIDECAR_SUFFIX,
3749                ),
3750            ));
3751        }
3752        self.enforce_rate_limit(&req, &bucket)?;
3753        self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
3754        // v0.6 #42: MFA Delete enforcement. When the bucket has
3755        // MFA-Delete = Enabled, every DELETE / DELETE-version /
3756        // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
3757        // 6-digit TOTP). Runs *before* the WORM / versioning routers so
3758        // a missing token is denied for free regardless of which delete
3759        // path the request would otherwise take.
3760        if let Some(mgr) = self.mfa_delete.as_ref()
3761            && mgr.is_enabled(&bucket)
3762        {
3763            let header = req.input.mfa.as_deref();
3764            if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
3765                crate::metrics::record_mfa_delete_denial(&bucket);
3766                return Err(mfa_error_to_s3(e));
3767            }
3768        }
3769        // v0.5 #30: refuse the delete while a WORM lock is in effect.
3770        // Compliance can never be bypassed; Governance can be overridden
3771        // via `x-amz-bypass-governance-retention: true`; legal hold
3772        // never. The check happens before the versioning router so a
3773        // locked object can't be soft-deleted (delete-marker push) on an
3774        // Enabled bucket either — S3 spec says lock applies to all
3775        // delete forms.
3776        if let Some(mgr) = self.object_lock.as_ref()
3777            && let Some(state) = mgr.get(&bucket, &key)
3778        {
3779            let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
3780            // v0.8.12 HIGH-7 fix: the bypass header alone used to be
3781            // enough to override Governance retention. AWS spec
3782            // requires the caller hold `s3:BypassGovernanceRetention`
3783            // for the target ARN; without that, the header is
3784            // silently ignored (not an error — it lines up with how
3785            // AWS' canonical behaviour treats unprivileged callers).
3786            let bypass_allowed = if bypass_header {
3787                self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
3788                    .is_ok()
3789            } else {
3790                false
3791            };
3792            let now = chrono::Utc::now();
3793            if !state.can_delete(now, bypass_allowed) {
3794                crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
3795                return Err(S3Error::with_message(
3796                    S3ErrorCode::AccessDenied,
3797                    "Access Denied because object protected by object lock",
3798                ));
3799            }
3800        }
3801        // v0.5 #34: route DELETE through the VersioningManager when the
3802        // bucket is in a versioning-aware state.
3803        //
3804        // - Enabled bucket, no version_id → push a delete marker into
3805        //   the chain. NO backend object is touched (older versions
3806        //   stay reachable via specific-version GET).
3807        // - Enabled / Suspended bucket, with version_id → physical
3808        //   delete. Backend bytes at the shadow key (or `<key>` for
3809        //   `null`) are removed; chain entry is dropped. If the deleted
3810        //   entry was a delete marker, no backend bytes exist for it
3811        //   (record-only).
3812        // - Suspended bucket, no version_id → push a "null" delete
3813        //   marker (S3 spec); backend bytes at `<key>` are physically
3814        //   removed (same as legacy).
3815        // - Unversioned bucket → fall through to legacy passthrough.
3816        if let Some(mgr) = self.versioning.as_ref() {
3817            let state = mgr.state(&bucket);
3818            if state != crate::versioning::VersioningState::Unversioned {
3819                let req_vid = req.input.version_id.take();
3820                if let Some(vid) = req_vid {
3821                    // Specific-version DELETE: touch backend bytes only
3822                    // when the entry was a real version (not a delete
3823                    // marker, which has no backend bytes).
3824                    let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
3825                    let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
3826                        key.clone()
3827                    } else {
3828                        versioned_shadow_key(&key, &vid)
3829                    };
3830                    let was_real_version = outcome
3831                        .as_ref()
3832                        .map(|o| !o.is_delete_marker)
3833                        .unwrap_or(false);
3834                    if was_real_version {
3835                        // Best-effort backend cleanup; missing bytes
3836                        // are not an error (e.g. shadow key already
3837                        // GC'd).
3838                        let backend_input = DeleteObjectInput {
3839                            bucket: bucket.clone(),
3840                            key: backend_target,
3841                            ..Default::default()
3842                        };
3843                        let backend_req = S3Request {
3844                            input: backend_input,
3845                            method: http::Method::DELETE,
3846                            uri: req.uri.clone(),
3847                            headers: req.headers.clone(),
3848                            extensions: http::Extensions::new(),
3849                            credentials: req.credentials.clone(),
3850                            region: req.region.clone(),
3851                            service: req.service.clone(),
3852                            trailing_headers: None,
3853                        };
3854                        let _ = self.backend.delete_object(backend_req).await;
3855                    }
3856                    let mut output = DeleteObjectOutput {
3857                        version_id: Some(vid.clone()),
3858                        ..Default::default()
3859                    };
3860                    if let Some(o) = outcome.as_ref()
3861                        && o.is_delete_marker
3862                    {
3863                        output.delete_marker = Some(true);
3864                    }
3865                    // v0.6 #35: specific-version DELETE always counts as
3866                    // a hard `ObjectRemoved:Delete` event (the chain
3867                    // entry, marker or not, is gone after this call).
3868                    self.fire_delete_notification(
3869                        &bucket,
3870                        &key,
3871                        crate::notifications::EventType::ObjectRemovedDelete,
3872                        Some(vid.clone()),
3873                    );
3874                    return Ok(S3Response::new(output));
3875                }
3876                // No version_id: record a delete marker (state-aware).
3877                let outcome = mgr.record_delete(&bucket, &key);
3878                if state == crate::versioning::VersioningState::Suspended {
3879                    // Suspended buckets also evict the prior `<key>`
3880                    // bytes (the previous null version is gone too).
3881                    let backend_input = DeleteObjectInput {
3882                        bucket: bucket.clone(),
3883                        key: key.clone(),
3884                        ..Default::default()
3885                    };
3886                    let backend_req = S3Request {
3887                        input: backend_input,
3888                        method: http::Method::DELETE,
3889                        uri: req.uri.clone(),
3890                        headers: req.headers.clone(),
3891                        extensions: http::Extensions::new(),
3892                        credentials: req.credentials.clone(),
3893                        region: req.region.clone(),
3894                        service: req.service.clone(),
3895                        trailing_headers: None,
3896                    };
3897                    let _ = self.backend.delete_object(backend_req).await;
3898                }
3899                let output = DeleteObjectOutput {
3900                    delete_marker: Some(true),
3901                    version_id: outcome.version_id.clone(),
3902                    ..Default::default()
3903                };
3904                // v0.6 #35: versioned bucket DELETE without a version-id
3905                // creates a delete marker — the dedicated AWS event
3906                // taxonomy entry. Suspended-state buckets also push a
3907                // (null) marker, so the same event fires there.
3908                self.fire_delete_notification(
3909                    &bucket,
3910                    &key,
3911                    crate::notifications::EventType::ObjectRemovedDeleteMarker,
3912                    outcome.version_id,
3913                );
3914                return Ok(S3Response::new(output));
3915            }
3916        }
3917        // Legacy / Unversioned path: physical delete on the backend +
3918        // best-effort sidecar cleanup (mirrors v0.4 behaviour).
3919        let resp = self.backend.delete_object(req).await?;
3920        // v0.5 #30: drop any per-object lock state once the delete has
3921        // succeeded so the freed key can be re-armed by a future PUT
3922        // under the bucket default. Reaching here implies the lock had
3923        // already passed `can_delete` above, so this is purely cleanup.
3924        if let Some(mgr) = self.object_lock.as_ref() {
3925            mgr.clear(&bucket, &key);
3926        }
3927        // v0.6 #39: drop any object-level tag set on physical delete —
3928        // the freed key starts a fresh tag history if a future PUT
3929        // re-creates it. (Versioned-delete branches above return early
3930        // and do NOT touch tags, mirroring AWS where tag state is
3931        // attached to the logical key, not the version chain.)
3932        if let Some(mgr) = self.tagging.as_ref() {
3933            mgr.delete_object_tags(&bucket, &key);
3934        }
3935        let sidecar = sidecar_key(&key);
3936        // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
3937        // can't be encoded into a request URI — the primary delete
3938        // already succeeded and a stale sidecar is harmless (Range GET
3939        // re-validates the underlying object on next read).
3940        if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
3941            let sidecar_input = DeleteObjectInput {
3942                bucket: bucket.clone(),
3943                key: sidecar,
3944                ..Default::default()
3945            };
3946            let sidecar_req = S3Request {
3947                input: sidecar_input,
3948                method: http::Method::DELETE,
3949                uri,
3950                headers: http::HeaderMap::new(),
3951                extensions: http::Extensions::new(),
3952                credentials: None,
3953                region: None,
3954                service: None,
3955                trailing_headers: None,
3956            };
3957            let _ = self.backend.delete_object(sidecar_req).await;
3958        }
3959        // v0.6 #35: legacy unversioned-bucket hard delete fires the
3960        // canonical `ObjectRemoved:Delete` event.
3961        self.fire_delete_notification(
3962            &bucket,
3963            &key,
3964            crate::notifications::EventType::ObjectRemovedDelete,
3965            None,
3966        );
3967        Ok(resp)
3968    }
3969    async fn delete_objects(
3970        &self,
3971        req: S3Request<DeleteObjectsInput>,
3972    ) -> S3Result<S3Response<DeleteObjectsOutput>> {
3973        // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
3974        // when MFA-Delete is on the bucket, a missing / invalid token
3975        // fails the entire DeleteObjects request, not per-object).
3976        if let Some(mgr) = self.mfa_delete.as_ref()
3977            && mgr.is_enabled(&req.input.bucket)
3978        {
3979            let header = req.input.mfa.as_deref();
3980            if let Err(e) =
3981                crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
3982            {
3983                crate::metrics::record_mfa_delete_denial(&req.input.bucket);
3984                return Err(mfa_error_to_s3(e));
3985            }
3986        }
3987        // v0.8.11 CRIT-3 fix: route every entry through the gated
3988        // per-object `delete_object` path so Object Lock, IAM policy,
3989        // versioning, tagging, sidecar cleanup and notification fan-
3990        // out all fire for batch DELETE. The previous
3991        // `self.backend.delete_objects(req).await` straight-through
3992        // bypassed every gate, so a `legal_hold=on` key listed inside
3993        // a DeleteObjects XML was happily removed.
3994        //
3995        // S3 spec note: DeleteObjects is "best-effort per object" —
3996        // a failure on one key surfaces as an `Errors` entry without
3997        // aborting the rest of the batch. Quiet-mode suppresses the
3998        // `Deleted` list (errors are still reported). We honour both.
3999        let bucket = req.input.bucket.clone();
4000        let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
4001        let mfa_header = req.input.mfa.clone();
4002        let quiet = req.input.delete.quiet.unwrap_or(false);
4003        let mut deleted: Vec<DeletedObject> = Vec::new();
4004        let mut errors: Vec<s3s::dto::Error> = Vec::new();
4005        for ident in req.input.delete.objects.iter() {
4006            let key = ident.key.clone();
4007            let version_id = ident.version_id.clone();
4008            let per_input = DeleteObjectInput {
4009                bucket: bucket.clone(),
4010                key: key.clone(),
4011                version_id: version_id.clone(),
4012                bypass_governance_retention: Some(bypass_governance),
4013                mfa: mfa_header.clone(),
4014                ..Default::default()
4015            };
4016            let per_uri = match safe_object_uri(&bucket, &key) {
4017                Ok(u) => u,
4018                Err(_) => {
4019                    errors.push(s3s::dto::Error {
4020                        code: Some("InvalidArgument".to_owned()),
4021                        key: Some(key),
4022                        message: Some("object key is not URI-encodable".to_owned()),
4023                        version_id,
4024                    });
4025                    continue;
4026                }
4027            };
4028            let per_req = S3Request {
4029                input: per_input,
4030                method: http::Method::DELETE,
4031                uri: per_uri,
4032                headers: req.headers.clone(),
4033                extensions: http::Extensions::new(),
4034                credentials: req.credentials.clone(),
4035                region: req.region.clone(),
4036                service: req.service.clone(),
4037                trailing_headers: None,
4038            };
4039            match self.delete_object(per_req).await {
4040                Ok(resp) => {
4041                    let out = resp.output;
4042                    // DeleteObjectOutput doesn't surface a separate
4043                    // `delete_marker_version_id`; the marker's version
4044                    // id is whatever `version_id` carries (when the
4045                    // versioning manager pushed a delete-marker, that
4046                    // field already holds the marker's vid).
4047                    let vid = out.version_id.clone().or(version_id);
4048                    deleted.push(DeletedObject {
4049                        key: Some(key),
4050                        version_id: vid.clone(),
4051                        delete_marker: out.delete_marker,
4052                        delete_marker_version_id: vid,
4053                    });
4054                }
4055                Err(e) => {
4056                    let code_str = e.code().as_str().to_owned();
4057                    let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
4058                    errors.push(s3s::dto::Error {
4059                        code: Some(code_str),
4060                        key: Some(key),
4061                        message: Some(msg),
4062                        version_id,
4063                    });
4064                }
4065            }
4066        }
4067        let output = DeleteObjectsOutput {
4068            deleted: if quiet || deleted.is_empty() {
4069                None
4070            } else {
4071                Some(deleted)
4072            },
4073            errors: if errors.is_empty() {
4074                None
4075            } else {
4076                Some(errors)
4077            },
4078            ..Default::default()
4079        };
4080        Ok(S3Response::new(output))
4081    }
4082    async fn copy_object(
4083        &self,
4084        mut req: S3Request<CopyObjectInput>,
4085    ) -> S3Result<S3Response<CopyObjectOutput>> {
4086        // copy is conceptually "GetObject src + PutObject dst" — enforce both.
4087        let dst_bucket = req.input.bucket.clone();
4088        let dst_key = req.input.key.clone();
4089        // v0.8.15 M-1: same reserved-name guard as `put_object`. A
4090        // copy whose destination would land at `<x>.s4index` carries
4091        // the same listing / cleanup hazards.
4092        if s4_codec::index::is_reserved_sidecar_key(&dst_key) {
4093            let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
4094                .unwrap_or(S3ErrorCode::InvalidArgument);
4095            return Err(S3Error::with_message(
4096                code,
4097                format!(
4098                    "destination key {dst_key:?} is reserved (suffix `{}` is used for S4 \
4099                     internal sidecars)",
4100                    s4_codec::index::SIDECAR_SUFFIX,
4101                ),
4102            ));
4103        }
4104        self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
4105        if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4106            self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
4107        }
4108        // S4-aware copy: source object に s4-* metadata がある場合、それを
4109        // destination に確実に preserve する。
4110        //
4111        // - MetadataDirective::COPY (default): backend が source metadata を
4112        //   そのまま copy するので S4 metadata も自動で渡る。介入不要
4113        // - MetadataDirective::REPLACE: 客が指定した metadata で source を
4114        //   上書き → s4-* metadata が消えると destination は decompress 不能に
4115        //   なる (silent corruption)。S4 が source metadata を HEAD で取得し、
4116        //   s4-* fields を input.metadata に強制 merge する
4117        let needs_merge = req
4118            .input
4119            .metadata_directive
4120            .as_ref()
4121            .map(|d| d.as_str() == MetadataDirective::REPLACE)
4122            .unwrap_or(false);
4123        if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4124            // v0.8.16 F-8: strip the client-supplied `s4-*` keys
4125            // *unconditionally* — the v0.8.15 M-2 fix only ran the
4126            // strip inside the `if let Ok(head) = ...` block, so a
4127            // backend HEAD failure (transient 5xx, NoSuchKey on a
4128            // racing delete) left attacker-injected `s4-*` /
4129            // `S4-*` metadata intact on the destination. Now we
4130            // strip first, then re-populate from the source HEAD
4131            // when available — HEAD failure simply means the
4132            // destination loses the codec markers (correct: a
4133            // CopyObject without the source's codec metadata
4134            // produces an unreadable object, but doesn't allow
4135            // injection).
4136            let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4137            dest_meta.retain(|k, _| !k.to_ascii_lowercase().starts_with("s4-"));
4138            let head_input = HeadObjectInput {
4139                bucket: bucket.to_string(),
4140                key: key.to_string(),
4141                ..Default::default()
4142            };
4143            let head_req = S3Request {
4144                input: head_input,
4145                method: req.method.clone(),
4146                uri: req.uri.clone(),
4147                headers: req.headers.clone(),
4148                extensions: http::Extensions::new(),
4149                credentials: req.credentials.clone(),
4150                region: req.region.clone(),
4151                service: req.service.clone(),
4152                trailing_headers: None,
4153            };
4154            if let Ok(head) = self.backend.head_object(head_req).await
4155                && let Some(src_meta) = head.output.metadata.as_ref()
4156            {
4157                let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4158                for key in [
4159                    META_CODEC,
4160                    META_ORIGINAL_SIZE,
4161                    META_COMPRESSED_SIZE,
4162                    META_CRC32C,
4163                    META_MULTIPART,
4164                    META_FRAMED,
4165                ] {
4166                    if let Some(v) = src_meta.get(key) {
4167                        dest_meta.insert(key.to_string(), v.clone());
4168                    }
4169                }
4170                // SSE markers are equally reserved — propagate any
4171                // source flags so a copy of an encrypted object stays
4172                // marked as encrypted at the destination.
4173                for sse_key in [
4174                    "s4-encrypted",
4175                    "s4-sse-type",
4176                    "s4-sse-c-key-md5",
4177                    "s4-sse-kms-key-id",
4178                ] {
4179                    if let Some(v) = src_meta.get(sse_key) {
4180                        dest_meta.insert(sse_key.to_string(), v.clone());
4181                    }
4182                }
4183                debug!(
4184                    src_bucket = %bucket,
4185                    src_key = %key,
4186                    "S4 copy_object: replaced client s4-* metadata with source values across REPLACE directive (v0.8.15 M-2)"
4187                );
4188            }
4189        }
4190        self.backend.copy_object(req).await
4191    }
4192    async fn list_objects(
4193        &self,
4194        req: S3Request<ListObjectsInput>,
4195    ) -> S3Result<S3Response<ListObjectsOutput>> {
4196        self.enforce_rate_limit(&req, &req.input.bucket)?;
4197        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4198        let mut resp = self.backend.list_objects(req).await?;
4199        // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4200        // — v0.5 #34) を顧客から隠す。
4201        if let Some(contents) = resp.output.contents.as_mut() {
4202            contents.retain(|o| {
4203                o.key
4204                    .as_ref()
4205                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4206                    .unwrap_or(true)
4207            });
4208        }
4209        Ok(resp)
4210    }
4211    async fn list_objects_v2(
4212        &self,
4213        req: S3Request<ListObjectsV2Input>,
4214    ) -> S3Result<S3Response<ListObjectsV2Output>> {
4215        self.enforce_rate_limit(&req, &req.input.bucket)?;
4216        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4217        let mut resp = self.backend.list_objects_v2(req).await?;
4218        if let Some(contents) = resp.output.contents.as_mut() {
4219            let before = contents.len();
4220            contents.retain(|o| {
4221                o.key
4222                    .as_ref()
4223                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4224                    .unwrap_or(true)
4225            });
4226            // key_count も補正 (S3 spec compliance)
4227            if let Some(kc) = resp.output.key_count.as_mut() {
4228                *kc -= (before - contents.len()) as i32;
4229            }
4230        }
4231        Ok(resp)
4232    }
4233    /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4234    /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4235    /// attached AND the bucket is in a versioning-aware state, build
4236    /// the `Versions` / `DeleteMarkers` arrays directly from the
4237    /// in-memory chain (paginated + ordered the S3 way: key asc,
4238    /// version newest-first inside each key). Otherwise fall back to
4239    /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4240    async fn list_object_versions(
4241        &self,
4242        req: S3Request<ListObjectVersionsInput>,
4243    ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4244        self.enforce_rate_limit(&req, &req.input.bucket)?;
4245        self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4246        // v0.5 #34: VersioningManager-owned path.
4247        if let Some(mgr) = self.versioning.as_ref()
4248            && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4249        {
4250            let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4251            let page = mgr.list_versions(
4252                &req.input.bucket,
4253                req.input.prefix.as_deref(),
4254                req.input.key_marker.as_deref(),
4255                req.input.version_id_marker.as_deref(),
4256                max_keys,
4257            );
4258            let versions: Vec<ObjectVersion> = page
4259                .versions
4260                .into_iter()
4261                .map(|e| ObjectVersion {
4262                    key: Some(e.key),
4263                    version_id: Some(e.version_id),
4264                    is_latest: Some(e.is_latest),
4265                    e_tag: Some(ETag::Strong(e.etag)),
4266                    size: Some(e.size as i64),
4267                    last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4268                    ..Default::default()
4269                })
4270                .collect();
4271            let delete_markers: Vec<DeleteMarkerEntry> = page
4272                .delete_markers
4273                .into_iter()
4274                .map(|e| DeleteMarkerEntry {
4275                    key: Some(e.key),
4276                    version_id: Some(e.version_id),
4277                    is_latest: Some(e.is_latest),
4278                    last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4279                    ..Default::default()
4280                })
4281                .collect();
4282            let output = ListObjectVersionsOutput {
4283                name: Some(req.input.bucket.clone()),
4284                prefix: req.input.prefix.clone(),
4285                key_marker: req.input.key_marker.clone(),
4286                version_id_marker: req.input.version_id_marker.clone(),
4287                max_keys: req.input.max_keys,
4288                versions: if versions.is_empty() {
4289                    None
4290                } else {
4291                    Some(versions)
4292                },
4293                delete_markers: if delete_markers.is_empty() {
4294                    None
4295                } else {
4296                    Some(delete_markers)
4297                },
4298                is_truncated: Some(page.is_truncated),
4299                next_key_marker: page.next_key_marker,
4300                next_version_id_marker: page.next_version_id_marker,
4301                ..Default::default()
4302            };
4303            return Ok(S3Response::new(output));
4304        }
4305        // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4306        let mut resp = self.backend.list_object_versions(req).await?;
4307        if let Some(versions) = resp.output.versions.as_mut() {
4308            versions.retain(|v| {
4309                v.key
4310                    .as_ref()
4311                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4312                    .unwrap_or(true)
4313            });
4314        }
4315        if let Some(markers) = resp.output.delete_markers.as_mut() {
4316            markers.retain(|m| {
4317                m.key
4318                    .as_ref()
4319                    .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4320                    .unwrap_or(true)
4321            });
4322        }
4323        Ok(resp)
4324    }
4325
4326    async fn create_multipart_upload(
4327        &self,
4328        mut req: S3Request<CreateMultipartUploadInput>,
4329    ) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
4330        // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
4331        // the destination is conceptually about to host a new object,
4332        // matching what `put_object` enforces L2078. Without this, a
4333        // bucket policy denying `s3:PutObject` was bypassable simply
4334        // by switching the client to the multipart wire path.
4335        let mp_bucket = req.input.bucket.clone();
4336        let mp_key = req.input.key.clone();
4337        // v0.8.15 M-1: reserved-name guard on the multipart entry too.
4338        if s4_codec::index::is_reserved_sidecar_key(&mp_key) {
4339            let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
4340                .unwrap_or(S3ErrorCode::InvalidArgument);
4341            return Err(S3Error::with_message(
4342                code,
4343                format!(
4344                    "object key {mp_key:?} is reserved (suffix `{}` is used for S4 internal \
4345                     sidecars)",
4346                    s4_codec::index::SIDECAR_SUFFIX,
4347                ),
4348            ));
4349        }
4350        self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
4351        self.enforce_rate_limit(&req, &mp_bucket)?;
4352        // Multipart object は per-part 圧縮 + frame 形式で書く。GET 時に
4353        // frame parse を起動するため、object metadata に flag を立てる。
4354        // codec は dispatcher の default kind を採用 (per-part 別 codec は Phase 2)。
4355        let codec_kind = self.registry.default_kind();
4356        let meta = req.input.metadata.get_or_insert_with(Default::default);
4357        meta.insert(META_MULTIPART.into(), "true".into());
4358        meta.insert(META_CODEC.into(), codec_kind.as_str().into());
4359        // v0.8 #54 BUG-10 fix: take() the SSE request fields off
4360        // `req.input` so they are NOT forwarded to the backend on
4361        // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
4362        // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
4363        // SSE-KMS with "KMS not configured" when the headers reach it.
4364        // S4 owns the encrypt-then-store contract; we capture the
4365        // recipe in `multipart_state` here and apply it on Complete.
4366        let sse_c_alg = req.input.sse_customer_algorithm.take();
4367        let sse_c_key = req.input.sse_customer_key.take();
4368        let sse_c_md5 = req.input.sse_customer_key_md5.take();
4369        let sse_header = req.input.server_side_encryption.take();
4370        let sse_kms_key = req.input.ssekms_key_id.take();
4371        // Strip the encryption-context too — leaving it would make
4372        // MinIO try to validate it against a non-existent KMS key.
4373        let _ = req.input.ssekms_encryption_context.take();
4374        let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
4375        let kms_key_id = extract_kms_key_id(
4376            &sse_header,
4377            &sse_kms_key,
4378            self.kms_default_key_id.as_deref(),
4379        );
4380        // SSE-C / SSE-KMS exclusivity (mirrors put_object L1870).
4381        if sse_c_material.is_some() && kms_key_id.is_some() {
4382            return Err(S3Error::with_message(
4383                S3ErrorCode::InvalidArgument,
4384                "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
4385            ));
4386        }
4387        let sse_mode = if let Some(ref m) = sse_c_material {
4388            // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
4389            // 32-byte key in `Zeroizing` so abandoned uploads (or
4390            // normal Complete/Abort) wipe the key bytes on drop. The
4391            // `key_md5` is the public fingerprint and stays as a
4392            // bare `[u8; 16]`.
4393            crate::multipart_state::MultipartSseMode::SseC {
4394                key: zeroize::Zeroizing::new(m.key),
4395                key_md5: m.key_md5,
4396            }
4397        } else if let Some(ref kid) = kms_key_id {
4398            // KMS pre-flight: fail at Create rather than at Complete if
4399            // the gateway has no KMS backend wired (mirrors the
4400            // put_object L1879 check).
4401            if self.kms.is_none() {
4402                return Err(S3Error::with_message(
4403                    S3ErrorCode::InvalidRequest,
4404                    "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4405                ));
4406            }
4407            crate::multipart_state::MultipartSseMode::SseKms {
4408                key_id: kid.clone(),
4409            }
4410        } else if self.sse_keyring.is_some() {
4411            // SSE-S4: server-driven transparent encryption. Activates
4412            // whenever the gateway has a keyring configured AND the
4413            // client didn't pick a different SSE mode.
4414            crate::multipart_state::MultipartSseMode::SseS4
4415        } else {
4416            crate::multipart_state::MultipartSseMode::None
4417        };
4418        // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
4419        // single-PUT path does this on PutObject; the multipart path
4420        // captures it now and commits via TagManager on Complete.
4421        let request_tags: Option<crate::tagging::TagSet> = req
4422            .input
4423            .tagging
4424            .as_deref()
4425            .map(crate::tagging::parse_tagging_header)
4426            .transpose()
4427            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
4428        // Strip the `Tagging` field off the input so the backend
4429        // doesn't try to apply it (no-op on MinIO but keeps the wire
4430        // clean).
4431        let _ = req.input.tagging.take();
4432        // Object Lock recipe (BUG-7 — captured here, applied on Complete).
4433        let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
4434            .input
4435            .object_lock_mode
4436            .as_ref()
4437            .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
4438        let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
4439            .input
4440            .object_lock_retain_until_date
4441            .as_ref()
4442            .and_then(timestamp_to_chrono_utc);
4443        let explicit_legal_hold_on: bool = req
4444            .input
4445            .object_lock_legal_hold_status
4446            .as_ref()
4447            .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
4448            .unwrap_or(false);
4449        let bucket = req.input.bucket.clone();
4450        let key = req.input.key.clone();
4451        debug!(
4452            bucket = %bucket,
4453            key = %key,
4454            codec = codec_kind.as_str(),
4455            sse = ?sse_mode,
4456            "S4 create_multipart_upload: marking object for per-part compression"
4457        );
4458        let mut resp = self.backend.create_multipart_upload(req).await?;
4459        // Stash the per-upload context only after the backend handed
4460        // us an upload_id (failed Creates leave nothing in the store).
4461        if let Some(upload_id) = resp.output.upload_id.as_ref() {
4462            self.multipart_state.put(
4463                upload_id,
4464                crate::multipart_state::MultipartUploadContext {
4465                    bucket,
4466                    key,
4467                    sse: sse_mode.clone(),
4468                    tags: request_tags,
4469                    object_lock_mode: explicit_lock_mode,
4470                    object_lock_retain_until: explicit_retain_until,
4471                    object_lock_legal_hold: explicit_legal_hold_on,
4472                },
4473            );
4474        }
4475        // SSE-C / SSE-KMS response echo (mirrors put_object L2036-L2050).
4476        match &sse_mode {
4477            crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
4478                resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
4479                resp.output.sse_customer_key_md5 =
4480                    Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
4481            }
4482            crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
4483                resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
4484                    ServerSideEncryption::AWS_KMS,
4485                ));
4486                resp.output.ssekms_key_id = Some(key_id.clone());
4487            }
4488            _ => {}
4489        }
4490        Ok(resp)
4491    }
4492
4493    async fn upload_part(
4494        &self,
4495        mut req: S3Request<UploadPartInput>,
4496    ) -> S3Result<S3Response<UploadPartOutput>> {
        // UploadPart handler. In order, it:
        //   1. runs `enforce_policy` (`s3:PutObject`) and `enforce_rate_limit`
        //      for the part's bucket / key;
        //   2. looks up the per-upload context stored under `upload_id` and
        //      validates / strips the SSE-C request headers against it;
        //   3. when the request carries a body: collects it, verifies the
        //      client-supplied checksums, compresses it with the codec the
        //      dispatcher picks, wraps the result in a frame, and rewrites
        //      `content_length` / checksum fields / `body` on the request;
        //   4. forwards the request to `self.backend.upload_part`.
        // A request without a body skips step 3 and is forwarded after the
        // header checks.
        //
        // Errors: policy, rate-limit and checksum failures propagate via `?`;
        // SSE-C header problems return `InvalidArgument` (key mismatch) or
        // `InvalidRequest` (missing / partial / unexpected headers); body
        // collection and compression failures are mapped through `internal(..)`.
4497        // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
4498        // `put_object` / `create_multipart_upload`. Even though
4499        // Create already passed the gate, a bucket policy that
4500        // *revokes* `s3:PutObject` mid-flight should stop further
4501        // parts (e.g. legal hold drops, retention shortened).
4502        let part_bucket = req.input.bucket.clone();
4503        let part_key = req.input.key.clone();
4504        self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
4505        self.enforce_rate_limit(&req, &part_bucket)?;
4506        // Compress each part and forward it with a frame header. On GET,
4507        // `decompress_multipart` decompresses the frames in order via the frame iterator.
4508        // **per-part codec dispatch**: the dispatcher picks the codec from a sample
4509        // of the body's head, so a parquet-style mixed-content multipart can use
4510        // the best codec per part (integer-column part → Bitcomp, text-column part → zstd, etc.).
4511        //
4512        // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
4513        // context captured by `create_multipart_upload` and (a) strip
4514        // any SSE-C request headers off `req.input` so the backend
4515        // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
4516        // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
4517        // observe that an upload context exists for `upload_id`. The
4518        // actual encrypt happens once at `complete_multipart_upload`
4519        // time on the assembled body (the per-part-encrypt approach
4520        // would require a matching multi-segment decrypt path on GET;
4521        // encrypting the whole assembled body keeps the GET path's
4522        // `is_sse_encrypted` branch in get_object L2429 working
4523        // unchanged).
4524        let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
4525        // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
4526        // The AWS S3 spec requires the same SSE-C key headers on
4527        // every UploadPart and rejects mismatches with 400. Prior to
4528        // #62 we silently stripped the headers (BUG-10 fix) without
4529        // validating them, allowing a client to send part 1 under
4530        // key-A and part 2 under key-B; both got stored, then
4531        // re-encrypted with key-A on Complete — the client thinks
4532        // part 2 is under key-B but a GET with key-B would in fact
4533        // hit the part-1 ciphertext that was actually encrypted with
4534        // key-A. That would either decrypt successfully (silent
4535        // corruption: client lost track of which key encrypts what)
4536        // or fail in a confusing way. Validate the per-part headers
4537        // now and reject with 400: InvalidArgument on key mismatch,
4538        // InvalidRequest on omission / partial supply (see the arms below).
4539        if let Some(ref ctx) = sse_ctx {
4540            if let crate::multipart_state::MultipartSseMode::SseC {
4541                key_md5: ctx_md5, ..
4542            } = &ctx.sse
4543            {
                // `take()` removes the three headers from `req.input` up
                // front, so none of the arms below can forward them.
4544                let alg = req.input.sse_customer_algorithm.take();
4545                let key_b64 = req.input.sse_customer_key.take();
4546                let md5_b64 = req.input.sse_customer_key_md5.take();
4547                match (alg, key_b64, md5_b64) {
4548                    (Some(a), Some(k), Some(m)) => {
4549                        // Parse + validate; if the per-part headers
4550                        // are themselves malformed (algorithm not
4551                        // AES256, MD5 mismatch, key not 32 bytes)
4552                        // surface the same 400 the single-PUT path
4553                        // would. Then compare the parsed MD5 to the
4554                        // upload-context's MD5; mismatch is a
4555                        // different-key UploadPart and must reject.
4556                        let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
4557                            .map_err(sse_c_error_to_s3)?;
4558                        if part_material.key_md5 != *ctx_md5 {
4559                            return Err(S3Error::with_message(
4560                                S3ErrorCode::InvalidArgument,
4561                                "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
4562                            ));
4563                        }
4564                        // OK — same key as Create. Headers are
4565                        // already taken off `req.input` so the
4566                        // backend never sees them.
4567                    }
4568                    (None, None, None) => {
4569                        // AWS S3 spec: SSE-C headers MUST be replayed
4570                        // on every UploadPart of an SSE-C multipart.
4571                        // Real-S3 returns 400 InvalidRequest in this
4572                        // case; mirror that.
4573                        return Err(S3Error::with_message(
4574                            S3ErrorCode::InvalidRequest,
4575                            "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
4576                        ));
4577                    }
4578                    _ => {
4579                        // Partial header set (e.g. algorithm + key
4580                        // but no MD5) — same handling as the
4581                        // single-PUT `extract_sse_c_material` helper.
4582                        return Err(S3Error::with_message(
4583                            S3ErrorCode::InvalidRequest,
4584                            "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
4585                        ));
4586                    }
4587                }
4588            } else {
4589                // CreateMultipartUpload was non-SSE-C (None / SseS4 /
4590                // SseKms). A part that arrives carrying SSE-C headers
4591                // is either a confused client or an attempt to
4592                // smuggle SSE-C around the gateway-internal SSE
4593                // recipe. Reject with 400 InvalidRequest rather than
4594                // silently strip — the strip would let the client
4595                // believe the part was encrypted under their key
4596                // when in fact the upload's encryption recipe is
4597                // whatever the Create captured.
4598                if req.input.sse_customer_algorithm.is_some()
4599                    || req.input.sse_customer_key.is_some()
4600                    || req.input.sse_customer_key_md5.is_some()
4601                {
4602                    return Err(S3Error::with_message(
4603                        S3ErrorCode::InvalidRequest,
4604                        "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
4605                    ));
4606                }
4607            }
4608        } else {
4609            // No upload context registered (gateway crashed between
4610            // Create and Part, or pre-#62 abandoned-upload restore).
4611            // We can't check key consistency in this case — strip
4612            // the headers and let the request through unchanged so
4613            // the backend's `NoSuchUpload` reply (or whatever it
4614            // chooses to do) flows back to the client.
4615            let _ = req.input.sse_customer_algorithm.take();
4616            let _ = req.input.sse_customer_key.take();
4617            let _ = req.input.sse_customer_key_md5.take();
4618        }
        // The context is only consulted for the header checks above; the
        // rebound `_sse_ctx` is not read again in this function.
        // NOTE(review): presumably kept to mark the value as intentionally
        // unused past this point — confirm before removing.
4619        let _sse_ctx = sse_ctx;
4620        if let Some(blob) = req.input.body.take() {
            // The whole part body is buffered in memory, bounded by
            // `self.max_body_bytes`.
4621            let bytes = collect_blob(blob, self.max_body_bytes)
4622                .await
4623                .map_err(internal("collect upload_part body"))?;
4624            // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
4625            // checksum algorithms against the received part body.
4626            verify_client_body_checksums(
4627                &bytes,
4628                req.input.content_md5.as_deref(),
4629                req.input.checksum_crc32.as_deref(),
4630                req.input.checksum_crc32c.as_deref(),
4631                req.input.checksum_sha1.as_deref(),
4632                req.input.checksum_sha256.as_deref(),
4633                req.input.checksum_crc64nvme.as_deref(),
4634            )?;
            // Only the first `SAMPLE_BYTES` (or fewer, for a short body) are
            // handed to the dispatcher as the codec-selection sample.
4635            let sample_len = bytes.len().min(SAMPLE_BYTES);
4636            // v0.8 #56: full part body is already in memory here; use its
4637            // length as the size hint so the dispatcher can promote to GPU
4638            // if it's big enough.
4639            let codec_kind = self
4640                .dispatcher
4641                .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
4642                .await;
            // Captured before `bytes` is moved into the compress call below.
4643            let original_size = bytes.len() as u64;
4644            // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
4645            let (compress_res, tel) = self
4646                .registry
4647                .compress_with_telemetry(bytes, codec_kind)
4648                .await;
            // Telemetry is stamped before the result is unwrapped, so it is
            // recorded even when compression failed.
4649            stamp_gpu_compress_telemetry(&tel);
4650            let (compressed, manifest) =
4651                compress_res.map_err(internal("registry compress part"))?;
            // Frame header records the codec, both sizes and the manifest's
            // crc32c for this part.
4652            let header = FrameHeader {
4653                codec: codec_kind,
4654                original_size,
4655                compressed_size: compressed.len() as u64,
4656                crc32c: manifest.crc32c,
4657            };
4658            let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
4659            write_frame(&mut framed, header, &compressed);
4660            // v0.2 #5: heuristic-based padding skip for likely-final parts.
4661            //
4662            // AWS SDK / aws-cli / boto3 always send the final (and only the
4663            // final) part below the configured part_size. So if the raw user
4664            // part is already smaller than S3's 5 MiB multipart minimum, this
4665            // is overwhelmingly likely to be the final part — and the final
4666            // part is exempt from S3's size constraint. Skipping padding here
4667            // saves up to ~5 MiB per object on highly compressible workloads.
4668            //
4669            // If a misbehaving client sends a tiny **non-final** part, S3
4670            // itself rejects with EntityTooSmall at CompleteMultipartUpload —
4671            // identical outcome to a vanilla S3 PUT, just earlier than
4672            // padding-then-complete would catch it.
4673            let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
4674            if !likely_final {
4675                pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
4676            }
4677            let framed_bytes = framed.freeze();
4678            let new_len = framed_bytes.len() as i64;
4679            // The same wire-compatibility problem exists for multipart too (content-length / checksum):
            // the forwarded body is now the framed payload rather than the
            // bytes the client sent, so the length is rewritten and the
            // client-supplied checksum fields (already verified above against
            // the received bytes) are cleared before forwarding.
4680            req.input.content_length = Some(new_len);
4681            req.input.checksum_algorithm = None;
4682            req.input.checksum_crc32 = None;
4683            req.input.checksum_crc32c = None;
4684            req.input.checksum_crc64nvme = None;
4685            req.input.checksum_sha1 = None;
4686            req.input.checksum_sha256 = None;
4687            req.input.content_md5 = None;
4688            req.input.body = Some(bytes_to_blob(framed_bytes));
4689            debug!(
4690                part_number = ?req.input.part_number,
4691                upload_id = ?req.input.upload_id,
4692                original_size,
4693                framed_size = new_len,
4694                "S4 upload_part: framed compressed payload"
4695            );
4696        }
        // Forward the (possibly rewritten) request; the backend's result is
        // returned to the caller as-is.
4697        self.backend.upload_part(req).await
4698    }
4699    async fn complete_multipart_upload(
4700        &self,
4701        mut req: S3Request<CompleteMultipartUploadInput>,
4702    ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
4703        let bucket = req.input.bucket.clone();
4704        let key = req.input.key.clone();
4705        let upload_id = req.input.upload_id.clone();
4706        // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
4707        // commit point for the multipart-assembled object).
4708        self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
4709        self.enforce_rate_limit(&req, &bucket)?;
4710        // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
4711        // at Complete time. Without this an attacker with PutObject
4712        // permission could `CreateMultipartUpload` against a key
4713        // that's currently under retention / legal hold and silently
4714        // overwrite it on Complete (the single-PUT path runs the
4715        // same check at L2007). Compliance retention is never
4716        // bypassable; Governance only with explicit IAM permission
4717        // (HIGH-7 gate below).
4718        if let Some(mgr) = self.object_lock.as_ref()
4719            && let Some(state) = mgr.get(&bucket, &key)
4720        {
4721            // CompleteMultipartUpload doesn't carry the bypass header
4722            // (the s3s DTO matches AWS' wire schema). A locked key
4723            // therefore cannot be overwritten by Complete regardless
4724            // of caller permission — operators who need to break a
4725            // Governance lock do it via PutObjectRetention before
4726            // calling Complete.
4727            let now = chrono::Utc::now();
4728            if !state.can_delete(now, false) {
4729                crate::metrics::record_policy_denial("s3:PutObject", &bucket);
4730                return Err(S3Error::with_message(
4731                    S3ErrorCode::AccessDenied,
4732                    "Access Denied because target key is protected by object lock",
4733                ));
4734            }
4735        }
4736        // v0.8.1 #59: serialise concurrent Complete invocations on the
4737        // same `(bucket, key)`. The race window the lock closes is the
4738        // GET-assembled-body → encrypt → PUT-encrypted-body triple
4739        // below (BUG-5 fix); without serialisation, two Completes for
4740        // different `upload_id` but the same logical key could each
4741        // read the other's plaintext assembled body and overwrite the
4742        // peer's encrypted result. The guard is held to function exit
4743        // (drop on `Ok` / `Err`), covering version-id mint, object-
4744        // lock apply, tagging persist, and replication enqueue too.
4745        let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
4746        let _completion_guard = completion_lock.lock().await;
4747        // v0.8 #54 — fetch the per-upload context captured on Create.
4748        // `None` means an abandoned / unknown upload_id (gateway
4749        // crashed between Create and Complete, or pre-v0.8 state
4750        // restore); we still let the backend do its thing for
4751        // transparency, but we can't apply any SSE / version / lock /
4752        // tag / replication post-processing because we never captured
4753        // the recipe.
4754        let ctx = self.multipart_state.get(upload_id.as_str());
4755        // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
4756        // — some clients (boto3 / aws-sdk-cpp older versions) replay
4757        // the SSE-C triple on Complete too, and MinIO will choke if
4758        // they reach the backend.
4759        let _ = req.input.sse_customer_algorithm.take();
4760        let _ = req.input.sse_customer_key.take();
4761        let _ = req.input.sse_customer_key_md5.take();
4762        let mut resp = self.backend.complete_multipart_upload(req).await?;
4763        // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
4764        // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
4765        // partial fetch path が利用可能になる (Range request の帯域節約)。
4766        // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
4767        // できれば爆速になるので 1 回の cost は payback される
4768        //
4769        // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
4770        // the SSE encrypt re-PUT + versioning shadow-key rewrite +
4771        // replication source-bytes capture, so we GET once and reuse
4772        // the bytes for every post-processing step.
4773        let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
4774            let get_input = GetObjectInput {
4775                bucket: bucket.clone(),
4776                key: key.clone(),
4777                ..Default::default()
4778            };
4779            let get_req = S3Request {
4780                input: get_input,
4781                method: http::Method::GET,
4782                uri,
4783                headers: http::HeaderMap::new(),
4784                extensions: http::Extensions::new(),
4785                credentials: None,
4786                region: None,
4787                service: None,
4788                trailing_headers: None,
4789            };
4790            match self.backend.get_object(get_req).await {
4791                Ok(get_resp) => match get_resp.output.body {
4792                    Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
4793                    None => None,
4794                },
4795                Err(e) => {
4796                    // v0.8.4 #71 (C-1 audit fix): a silent
4797                    // `Err(_) => None` here is a SSE plaintext
4798                    // leak. The post-processing block below only
4799                    // runs the SSE re-encrypt branch when
4800                    // `assembled_body.is_some()`, so swallowing a
4801                    // backend error skipped the encrypt step and
4802                    // left the multipart object on disk as
4803                    // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
4804                    // configured buckets. Same root-cause family
4805                    // as v0.8 BUG-5; this branch closes the
4806                    // remaining read-side window.
4807                    //
4808                    // We distinguish two cases:
4809                    //  - `NoSuchKey`: the object is genuinely
4810                    //    missing post-Complete. This is rare and
4811                    //    typically races with a concurrent
4812                    //    DeleteObject; there is nothing to re-
4813                    //    encrypt and no SSE markers to honour, so
4814                    //    falling through to the legacy
4815                    //    `assembled_body = None` path is safe.
4816                    //  - everything else (5xx, network, auth,
4817                    //    etc.): we must FAIL the Complete so the
4818                    //    client can retry. Returning Ok with
4819                    //    `assembled_body = None` would silently
4820                    //    skip the SSE re-encrypt and leave the
4821                    //    backend bytes plaintext.
4822                    if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
4823                        tracing::warn!(
4824                            bucket = %bucket,
4825                            key = %key,
4826                            "multipart Complete: backend GET returned NoSuchKey; \
4827                             skipping post-processing (object likely raced with DeleteObject)"
4828                        );
4829                        None
4830                    } else {
4831                        tracing::error!(
4832                            bucket = %bucket,
4833                            key = %key,
4834                            error = %e,
4835                            "multipart Complete: backend GET failed; failing the Complete \
4836                             so the client retries (silent fall-through would skip SSE \
4837                             re-encrypt and store plaintext)"
4838                        );
4839                        return Err(internal("multipart Complete: backend body fetch failed")(e));
4840                    }
4841                }
4842            }
4843        } else {
4844            None
4845        };
4846        // Sidecar build (existing behaviour, gated on assembled body).
4847        //
4848        // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
4849        // going to SSE-encrypt the assembled body before re-PUT (the
4850        // single-PUT path applies the same suppression at L2271).
4851        // Stale offsets into the pre-encrypt body would break Range
4852        // GET on the encrypted on-disk bytes. `ctx.sse != None`
4853        // covers all three SSE modes captured at Create time.
4854        let mp_will_encrypt = ctx
4855            .as_ref()
4856            .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
4857            .unwrap_or(false);
4858        // v0.8.16 F-7: versioned multipart writes the assembled body
4859        // under `versioned_shadow_key(&key, vid)` *after* this
4860        // sidecar block, then deletes the original `<key>`. Stamping
4861        // the sidecar against the to-be-deleted `<key>` (which is
4862        // what H-g did) leaves an orphan `<key>.s4index` whose
4863        // source-ETag binding can never match the live shadow body
4864        // — the Range GET fast-path's stale-sidecar check then
4865        // falls through to a full read on every request, silently
4866        // disabling partial fetch. Skip the sidecar build entirely
4867        // for versioned buckets; a follow-up issue tracks writing
4868        // the sidecar under the shadow key with the shadow's ETag.
4869        let mp_skip_sidecar_for_versioning = self
4870            .versioning
4871            .as_ref()
4872            .map(|mgr| mgr.state(&bucket))
4873            .map(|state| state == crate::versioning::VersioningState::Enabled)
4874            .unwrap_or(false);
4875        if let Some(ref body) = assembled_body
4876            && !mp_will_encrypt
4877            && !mp_skip_sidecar_for_versioning
4878            && let Ok(mut index) = build_index_from_body(body)
4879        {
4880            // v0.8.15 H-g: stamp the source-ETag / source-compressed-size
4881            // binding on the multipart sidecar. The single-PUT path
4882            // does this at L2519-L2521 via the backend's PUT response,
4883            // but Complete returns its own ETag (an opaque manifest
4884            // hash) so we have to HEAD the freshly-completed object
4885            // to pick up what backend actually wrote, then bind the
4886            // sidecar to those values. Without the binding, a
4887            // subsequent backend-side mutation (lifecycle rewrite,
4888            // out-of-band CopyObject) wouldn't trip the staleness
4889            // check on the next Range GET — the GET would happily
4890            // slice the new bytes at the old sidecar offsets, with
4891            // silent data corruption.
4892            if let Ok(uri) = safe_object_uri(&bucket, &key) {
4893                let head_req = S3Request {
4894                    input: HeadObjectInput {
4895                        bucket: bucket.clone(),
4896                        key: key.clone(),
4897                        ..Default::default()
4898                    },
4899                    method: http::Method::HEAD,
4900                    uri,
4901                    headers: http::HeaderMap::new(),
4902                    extensions: http::Extensions::new(),
4903                    credentials: None,
4904                    region: None,
4905                    service: None,
4906                    trailing_headers: None,
4907                };
4908                if let Ok(head) = self.backend.head_object(head_req).await {
4909                    index.source_etag = head.output.e_tag.as_ref().map(|t| t.value().to_string());
4910                    index.source_compressed_size = head
4911                        .output
4912                        .content_length
4913                        .and_then(|n| u64::try_from(n).ok());
4914                }
4915                // HEAD failure is non-fatal — the sidecar still works
4916                // as a v1-style best-effort fast path; the Range GET
4917                // simply falls back to a full read on any consistency
4918                // signal.
4919            }
4920            self.write_sidecar(&bucket, &key, &index).await;
4921        }
4922        // From here on, post-processing depends on the context —
4923        // short-circuit when the upload had no captured recipe
4924        // (legacy / crashed-Create / pre-v0.8 state restore).
4925        if let Some(ctx) = ctx {
4926            // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
4927            // is versioning-Enabled. The single-PUT path does this in
4928            // `put_object` ~L1968; multipart was the missing branch.
4929            // We mint here (post-Complete, before any re-PUT) so the
4930            // same vid threads into both the shadow-key rewrite and
4931            // the VersionEntry the manager records.
4932            let pending_version: Option<crate::versioning::PutOutcome> = self
4933                .versioning
4934                .as_ref()
4935                .map(|mgr| mgr.state(&bucket))
4936                .map(|state| match state {
4937                    crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
4938                        version_id: crate::versioning::VersioningManager::new_version_id(),
4939                        versioned_response: true,
4940                    },
4941                    crate::versioning::VersioningState::Suspended
4942                    | crate::versioning::VersioningState::Unversioned => {
4943                        crate::versioning::PutOutcome {
4944                            version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
4945                            versioned_response: false,
4946                        }
4947                    }
4948                });
4949            // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
4950            // and re-PUT it to the backend so the on-disk bytes are
4951            // SSE-encrypted. The single-PUT path does this body-by-
4952            // body inside `put_object` (L1907-L1942); for multipart,
4953            // encrypt-per-part would require a multi-segment decrypt
4954            // path on GET — we instead do a single encrypt over the
4955            // assembled framed body so the existing GET decrypt
4956            // branch (`is_sse_encrypted` → `decrypt(body, source)` →
4957            // FrameIter) handles it unchanged.
4958            //
4959            // The cost is one extra round-trip per Complete for SSE-
4960            // enabled multipart (already-paid for the sidecar build).
4961            // For single-instance gateways pointing at a co-located
4962            // backend this is negligible; cross-region operators
4963            // would benefit from per-part encrypt + multi-segment
4964            // decrypt as a follow-up.
4965            let needs_re_put = matches!(
4966                ctx.sse,
4967                crate::multipart_state::MultipartSseMode::SseS4
4968                    | crate::multipart_state::MultipartSseMode::SseC { .. }
4969                    | crate::multipart_state::MultipartSseMode::SseKms { .. }
4970            ) || pending_version
4971                .as_ref()
4972                .map(|pv| pv.versioned_response)
4973                .unwrap_or(false);
4974            // v0.8.11 CRIT-2 fix: seed the replication body with the
4975            // pre-encrypt assembled bytes, but overwrite it with the
4976            // post-encrypt `new_body` once the re-PUT branch lands.
4977            // The previous "snapshot in advance" pattern shipped the
4978            // *plaintext* framed body to the destination bucket even
4979            // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
4980            // the destination would then fail to decrypt (or, worse,
4981            // succeed in handing out plaintext that the source had
4982            // promised was encrypted at rest). When `needs_re_put`
4983            // is false (no SSE, no versioning), the backend still
4984            // holds the original plaintext-framed bytes, and the
4985            // seed value is what the destination should receive.
4986            let mut replication_body = assembled_body.clone();
4987            let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
4988            if needs_re_put && let Some(body) = assembled_body {
4989                // v0.8.1 #58: same Zeroizing pattern as put_object's
4990                // single-PUT KMS branch — DEK plaintext lives in
4991                // `Zeroizing<[u8; 32]>` for the lifetime of this
4992                // Complete handler, then is wiped on drop.
4993                let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
4994                    if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
4995                    {
4996                        let kms = self.kms.as_ref().ok_or_else(|| {
4997                        S3Error::with_message(
4998                            S3ErrorCode::InvalidRequest,
4999                            "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
5000                        )
5001                    })?;
5002                        let (dek, wrapped) =
5003                            kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
5004                        if dek.len() != 32 {
5005                            return Err(S3Error::with_message(
5006                                S3ErrorCode::InternalError,
5007                                format!(
5008                                    "KMS backend returned a DEK of {} bytes (expected 32)",
5009                                    dek.len()
5010                                ),
5011                            ));
5012                        }
5013                        let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
5014                            zeroize::Zeroizing::new([0u8; 32]);
5015                        dek_arr.copy_from_slice(&dek);
5016                        // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
5017                        Some((dek_arr, wrapped))
5018                    } else {
5019                        None
5020                    };
5021                // Build the new metadata map: re-fetch via HEAD so
5022                // the multipart / codec markers the backend stamped
5023                // on Create flow through unchanged, then layer the
5024                // SSE markers on top.
5025                let head_req = S3Request {
5026                    input: HeadObjectInput {
5027                        bucket: bucket.clone(),
5028                        key: key.clone(),
5029                        ..Default::default()
5030                    },
5031                    method: http::Method::HEAD,
5032                    uri: safe_object_uri(&bucket, &key)?,
5033                    headers: http::HeaderMap::new(),
5034                    extensions: http::Extensions::new(),
5035                    credentials: None,
5036                    region: None,
5037                    service: None,
5038                    trailing_headers: None,
5039                };
5040                let mut new_metadata: std::collections::HashMap<String, String> =
5041                    match self.backend.head_object(head_req).await {
5042                        Ok(h) => h.output.metadata.unwrap_or_default(),
5043                        Err(_) => std::collections::HashMap::new(),
5044                    };
5045                let new_body = match &ctx.sse {
5046                    crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
5047                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5048                        new_metadata.insert("s4-sse-type".into(), "AES256".into());
5049                        new_metadata.insert(
5050                            "s4-sse-c-key-md5".into(),
5051                            base64::engine::general_purpose::STANDARD.encode(key_md5),
5052                        );
5053                        // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
5054                        // auto-deref through one explicit binding so
5055                        // `SseSource::CustomerKey` gets the `&[u8; 32]`
5056                        // it expects (mirrors the SSE-KMS DEK shape
5057                        // a few lines down).
5058                        let key_ref: &[u8; 32] = key;
5059                        crate::sse::encrypt_with_source(
5060                            &body,
5061                            crate::sse::SseSource::CustomerKey {
5062                                key: key_ref,
5063                                key_md5,
5064                            },
5065                        )
5066                    }
5067                    crate::multipart_state::MultipartSseMode::SseKms { .. } => {
5068                        let (dek, wrapped) = kms_wrap
5069                            .as_ref()
5070                            .expect("SseKms branch implies kms_wrap is Some");
5071                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5072                        new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
5073                        new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
5074                        // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
5075                        // to `&[u8; 32]` (same shape as the put_object
5076                        // single-PUT branch).
5077                        let dek_ref: &[u8; 32] = dek;
5078                        crate::sse::encrypt_with_source(
5079                            &body,
5080                            crate::sse::SseSource::Kms {
5081                                dek: dek_ref,
5082                                wrapped,
5083                            },
5084                        )
5085                    }
5086                    crate::multipart_state::MultipartSseMode::SseS4 => {
5087                        let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
5088                            S3Error::with_message(
5089                                S3ErrorCode::InternalError,
5090                                "SSE-S4 captured at Create but keyring missing at Complete",
5091                            )
5092                        })?;
5093                        new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5094                        // SSE-S4 deliberately omits `s4-sse-type` so
5095                        // HEAD doesn't falsely advertise AWS-style
5096                        // SSE-S3 (matches the put_object L1929-L1939
5097                        // comment).
5098                        // v0.8 #52: same chunk_size dispatch as the
5099                        // single-PUT branch — multipart Complete
5100                        // re-encrypts the assembled body, so honoring
5101                        // the chunked path here is required to keep
5102                        // GET streaming on multipart-uploaded objects.
5103                        if self.sse_chunk_size > 0 {
5104                            crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
5105                                .map_err(|e| {
5106                                    S3Error::with_message(
5107                                        S3ErrorCode::InternalError,
5108                                        format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
5109                                    )
5110                                })?
5111                        } else {
5112                            crate::sse::encrypt_v2(&body, keyring)
5113                        }
5114                    }
5115                    crate::multipart_state::MultipartSseMode::None => body.clone(),
5116                };
5117                // v0.8 #54 BUG-6 fix: write the re-PUT under the
5118                // shadow key so the version chain doesn't overwrite
5119                // the previous version on a versioned bucket. The
5120                // original (unshadowed) key was assembled by the
5121                // backend on Complete; we delete it after the shadow
5122                // PUT lands.
5123                let put_target_key = if let Some(pv) = pending_version.as_ref() {
5124                    if pv.versioned_response {
5125                        versioned_shadow_key(&key, &pv.version_id)
5126                    } else {
5127                        key.clone()
5128                    }
5129                } else {
5130                    key.clone()
5131                };
5132                let new_body_len = new_body.len() as i64;
5133                let put_req = S3Request {
5134                    input: PutObjectInput {
5135                        bucket: bucket.clone(),
5136                        key: put_target_key.clone(),
5137                        body: Some(bytes_to_blob(new_body.clone())),
5138                        metadata: Some(new_metadata.clone()),
5139                        content_length: Some(new_body_len),
5140                        ..Default::default()
5141                    },
5142                    method: http::Method::PUT,
5143                    uri: safe_object_uri(&bucket, &put_target_key)?,
5144                    headers: http::HeaderMap::new(),
5145                    extensions: http::Extensions::new(),
5146                    credentials: None,
5147                    region: None,
5148                    service: None,
5149                    trailing_headers: None,
5150                };
5151                self.backend.put_object(put_req).await?;
5152                // v0.8.11 CRIT-2 fix: refresh the replication snapshot
5153                // with the bytes that were actually persisted to the
5154                // backend (post-SSE-encrypt for SSE modes; identical to
5155                // `body` for `MultipartSseMode::None` + versioning-only
5156                // re-PUT). The destination then sees the same on-disk
5157                // shape the source does, and a destination GET decrypts
5158                // correctly when SSE is on.
5159                replication_body = Some(new_body.clone());
5160                // If we rewrote the storage key (versioning shadow),
5161                // we must drop the original (unshadowed) Complete-
5162                // assembled bytes so subsequent listings don't see a
5163                // duplicate.
5164                if put_target_key != key {
5165                    let del_req = S3Request {
5166                        input: DeleteObjectInput {
5167                            bucket: bucket.clone(),
5168                            key: key.clone(),
5169                            ..Default::default()
5170                        },
5171                        method: http::Method::DELETE,
5172                        uri: safe_object_uri(&bucket, &key)?,
5173                        headers: http::HeaderMap::new(),
5174                        extensions: http::Extensions::new(),
5175                        credentials: None,
5176                        region: None,
5177                        service: None,
5178                        trailing_headers: None,
5179                    };
5180                    let _ = self.backend.delete_object(del_req).await;
5181                }
5182                applied_metadata = Some(new_metadata);
5183            }
5184            // v0.8 #54 BUG-6 commit: register the new version with
5185            // the VersioningManager so list_object_versions /
5186            // GET ?versionId= see it.
5187            if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
5188                let etag = resp
5189                    .output
5190                    .e_tag
5191                    .clone()
5192                    .map(ETag::into_value)
5193                    .unwrap_or_default();
5194                let now = chrono::Utc::now();
5195                mgr.commit_put_with_version(
5196                    &bucket,
5197                    &key,
5198                    crate::versioning::VersionEntry {
5199                        version_id: pv.version_id.clone(),
5200                        etag,
5201                        size: replication_body
5202                            .as_ref()
5203                            .map(|b| b.len() as u64)
5204                            .unwrap_or(0),
5205                        is_delete_marker: false,
5206                        created_at: now,
5207                    },
5208                );
5209                if pv.versioned_response {
5210                    resp.output.version_id = Some(pv.version_id.clone());
5211                }
5212            }
5213            // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
5214            // recipe + auto-apply the bucket default. Mirrors the
5215            // put_object L2057-L2074 block.
5216            if let Some(mgr) = self.object_lock.as_ref() {
5217                if ctx.object_lock_mode.is_some()
5218                    || ctx.object_lock_retain_until.is_some()
5219                    || ctx.object_lock_legal_hold
5220                {
5221                    let mut state = mgr.get(&bucket, &key).unwrap_or_default();
5222                    if let Some(m) = ctx.object_lock_mode {
5223                        state.mode = Some(m);
5224                    }
5225                    if let Some(u) = ctx.object_lock_retain_until {
5226                        state.retain_until = Some(u);
5227                    }
5228                    if ctx.object_lock_legal_hold {
5229                        state.legal_hold_on = true;
5230                    }
5231                    mgr.set(&bucket, &key, state);
5232                }
5233                mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5234            }
5235            // v0.8 #54 BUG-9 fix: persist the captured tags via the
5236            // TagManager so GetObjectTagging returns them.
5237            if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5238                mgr.put_object_tags(&bucket, &key, tags.clone());
5239            }
5240            // SSE-C / SSE-KMS response echo. The
5241            // CompleteMultipartUploadOutput only exposes
5242            // `server_side_encryption` + `ssekms_key_id` (no
5243            // sse_customer_* — those round-tripped on Create / parts).
5244            match &ctx.sse {
5245                crate::multipart_state::MultipartSseMode::SseC { .. } => {
5246                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5247                        ServerSideEncryption::AES256,
5248                    ));
5249                }
5250                crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5251                    resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5252                        ServerSideEncryption::AWS_KMS,
5253                    ));
5254                    resp.output.ssekms_key_id = Some(key_id.clone());
5255                }
5256                _ => {}
5257            }
5258            // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5259            // like put_object L2165 does. We hand the dispatcher the
5260            // assembled body bytes (post-encrypt where applicable, so
5261            // the destination ends up byte-identical to the source's
5262            // on-disk shape) plus the metadata that was actually
5263            // committed.
5264            let replication_body_bytes = replication_body.unwrap_or_default();
5265            // v0.8.2 #61: thread the multipart-Complete `pending_version`
5266            // through so a versioning-Enabled source's destination
5267            // receives the same shadow-key path (mirror of the
5268            // single-PUT branch above).
5269            self.spawn_replication_if_matched(
5270                &bucket,
5271                &key,
5272                &ctx.tags,
5273                &replication_body_bytes,
5274                &applied_metadata,
5275                true,
5276                pending_version.as_ref(),
5277            );
5278            self.multipart_state.remove(upload_id.as_str());
5279        }
5280        // v0.8.1 #59 janitor: best-effort sweep of stale completion
5281        // locks while we are still on the critical path of a single
5282        // Complete (so steady-state workloads of unique keys don't
5283        // accumulate `DashMap` entries). The sweep only retires
5284        // entries whose `Arc::strong_count == 1`, so any other in-
5285        // flight Complete on a different key keeps its lock alive.
5286        // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5287        // alive across this call; it's reaped on the next Complete or
5288        // the next caller-driven prune.
5289        self.multipart_state.prune_completion_locks();
5290        Ok(resp)
5291    }
5292    async fn abort_multipart_upload(
5293        &self,
5294        req: S3Request<AbortMultipartUploadInput>,
5295    ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5296        // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5297        // — the AWS-spec action verb for this operation. Without the
5298        // gate, anyone who could guess an upload_id could throw away
5299        // someone else's in-flight multipart upload.
5300        let abort_bucket = req.input.bucket.clone();
5301        let abort_key = req.input.key.clone();
5302        self.enforce_policy(
5303            &req,
5304            "s3:AbortMultipartUpload",
5305            &abort_bucket,
5306            Some(&abort_key),
5307        )?;
5308        // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5309        // set) promptly so an aborted upload doesn't leak the
5310        // customer's key into a long-running gateway's RSS.
5311        //
5312        // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5313        // FIRST, then drop in-process state ONLY on success. The
5314        // previous order ("remove → call backend") meant a transient
5315        // backend abort failure (5xx, network) wiped the SSE-C key
5316        // bytes locally while leaving the parts on the backend, so a
5317        // client retry would have to re-validate the SSE-C key against
5318        // a context the gateway no longer has — and the retried abort
5319        // would still hit the unaborted backend parts. Calling the
5320        // backend first lets the failure propagate to the client with
5321        // state intact for a clean retry; only on success do we wipe
5322        // the local state.
5323        let upload_id = req.input.upload_id.as_str().to_owned();
5324        let resp = self.backend.abort_multipart_upload(req).await?;
5325        self.multipart_state.remove(&upload_id);
5326        Ok(resp)
5327    }
5328    async fn list_multipart_uploads(
5329        &self,
5330        req: S3Request<ListMultipartUploadsInput>,
5331    ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5332        self.backend.list_multipart_uploads(req).await
5333    }
5334    async fn list_parts(
5335        &self,
5336        req: S3Request<ListPartsInput>,
5337    ) -> S3Result<S3Response<ListPartsOutput>> {
5338        self.backend.list_parts(req).await
5339    }
5340
5341    // =========================================================================
5342    // Phase 2 — pure passthrough delegations。S4 はこれらに対して圧縮 hook を
5343    // 持たないので、backend (= AWS S3) の動作と完全に同一。
5344    //
5345    // 既知の制限事項:
5346    // - copy_object / upload_part_copy: source object が S4-compressed の場合、
5347    //   backend が bytes を copy するだけなので metadata (s4-codec etc) も一緒に
5348    //   copied される (AWS S3 default = MetadataDirective COPY)。GET は manifest
5349    //   経由で正しく decompress できる。MetadataDirective REPLACE で上書き
5350    //   されると圧縮 metadata が消えて壊れる — 顧客側の運用で注意
5351    // - list_object_versions: versioning enabled bucket では各 version も S4
5352    //   metadata を維持する。古い version も S4 経由で正しく GET できる。
5353    // =========================================================================
5354
5355    // ---- Object ACL / tagging / attributes ----
5356    async fn get_object_acl(
5357        &self,
5358        req: S3Request<GetObjectAclInput>,
5359    ) -> S3Result<S3Response<GetObjectAclOutput>> {
5360        self.backend.get_object_acl(req).await
5361    }
5362    async fn put_object_acl(
5363        &self,
5364        req: S3Request<PutObjectAclInput>,
5365    ) -> S3Result<S3Response<PutObjectAclOutput>> {
5366        self.backend.put_object_acl(req).await
5367    }
5368    // v0.6 #39: object tagging — when a `TagManager` is attached the
5369    // configuration / per-(bucket, key) state lives in the manager and
5370    // these handlers serve directly from it; when no manager is
5371    // attached they fall back to the backend (legacy passthrough so
5372    // v0.5 deployments are unaffected).
5373    async fn get_object_tagging(
5374        &self,
5375        req: S3Request<GetObjectTaggingInput>,
5376    ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5377        let Some(mgr) = self.tagging.as_ref() else {
5378            return self.backend.get_object_tagging(req).await;
5379        };
5380        let tags = mgr
5381            .get_object_tags(&req.input.bucket, &req.input.key)
5382            .unwrap_or_default();
5383        Ok(S3Response::new(GetObjectTaggingOutput {
5384            tag_set: tagset_to_aws(&tags),
5385            ..Default::default()
5386        }))
5387    }
5388    async fn put_object_tagging(
5389        &self,
5390        req: S3Request<PutObjectTaggingInput>,
5391    ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
5392        let Some(mgr) = self.tagging.as_ref() else {
5393            return self.backend.put_object_tagging(req).await;
5394        };
5395        let bucket = req.input.bucket.clone();
5396        let key = req.input.key.clone();
5397        let parsed = aws_to_tagset(&req.input.tagging.tag_set)
5398            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5399        // v0.6 #39: gate via IAM policy with both the request tags
5400        // (`s3:RequestObjectTag/<key>`) and any existing tags on the
5401        // target object (`s3:ExistingObjectTag/<key>`).
5402        let existing = mgr.get_object_tags(&bucket, &key);
5403        self.enforce_policy_with_extra(
5404            &req,
5405            "s3:PutObjectTagging",
5406            &bucket,
5407            Some(&key),
5408            Some(&parsed),
5409            existing.as_ref(),
5410        )?;
5411        mgr.put_object_tags(&bucket, &key, parsed);
5412        Ok(S3Response::new(PutObjectTaggingOutput::default()))
5413    }
5414    async fn delete_object_tagging(
5415        &self,
5416        req: S3Request<DeleteObjectTaggingInput>,
5417    ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
5418        let Some(mgr) = self.tagging.as_ref() else {
5419            return self.backend.delete_object_tagging(req).await;
5420        };
5421        let bucket = req.input.bucket.clone();
5422        let key = req.input.key.clone();
5423        let existing = mgr.get_object_tags(&bucket, &key);
5424        self.enforce_policy_with_extra(
5425            &req,
5426            "s3:DeleteObjectTagging",
5427            &bucket,
5428            Some(&key),
5429            None,
5430            existing.as_ref(),
5431        )?;
5432        mgr.delete_object_tags(&bucket, &key);
5433        Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
5434    }
5435    async fn get_object_attributes(
5436        &self,
5437        req: S3Request<GetObjectAttributesInput>,
5438    ) -> S3Result<S3Response<GetObjectAttributesOutput>> {
5439        self.backend.get_object_attributes(req).await
5440    }
5441    async fn restore_object(
5442        &self,
5443        req: S3Request<RestoreObjectInput>,
5444    ) -> S3Result<S3Response<RestoreObjectOutput>> {
5445        self.backend.restore_object(req).await
5446    }
5447    async fn upload_part_copy(
5448        &self,
5449        req: S3Request<UploadPartCopyInput>,
5450    ) -> S3Result<S3Response<UploadPartCopyOutput>> {
5451        // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
5452        // destination PUT + source GET.
5453        let dst_bucket = req.input.bucket.clone();
5454        let dst_key = req.input.key.clone();
5455        self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
5456        if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
5457            self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
5458        }
5459        self.enforce_rate_limit(&req, &dst_bucket)?;
5460        // v0.2 #6: byte-range aware copy when the source is S4-framed.
5461        //
5462        // For a framed source (multipart upload OR single-PUT framed-v2),
5463        // a naive byte-range passthrough would copy compressed bytes that
5464        // don't align with S4 frame boundaries — silently corrupting the
5465        // result. Instead we GET the source through S4 (which handles
5466        // decompression + Range), re-compress + re-frame as a new part,
5467        // and forward as upload_part. For non-framed sources (S4-untouched
5468        // raw objects), passthrough is correct and we keep the original
5469        // (cheaper) code path.
5470        // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
5471        // copy-source header. Without this, a versioned source bucket
5472        // copy that pins a specific old version would silently fall
5473        // back to "latest", assembling wrong bytes into the destination
5474        // multipart object (silent data corruption).
5475        let CopySource::Bucket {
5476            bucket: src_bucket,
5477            key: src_key,
5478            version_id: src_version_id,
5479        } = &req.input.copy_source
5480        else {
5481            return self.backend.upload_part_copy(req).await;
5482        };
5483        let src_bucket = src_bucket.to_string();
5484        let src_key = src_key.to_string();
5485        let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);
5486
5487        // Probe metadata to decide whether the source needs S4-aware copy.
5488        let head_input = HeadObjectInput {
5489            bucket: src_bucket.clone(),
5490            key: src_key.clone(),
5491            version_id: src_version_id.clone(),
5492            ..Default::default()
5493        };
5494        let head_req = S3Request {
5495            input: head_input,
5496            method: http::Method::HEAD,
5497            uri: req.uri.clone(),
5498            headers: req.headers.clone(),
5499            extensions: http::Extensions::new(),
5500            credentials: req.credentials.clone(),
5501            region: req.region.clone(),
5502            service: req.service.clone(),
5503            trailing_headers: None,
5504        };
5505        let needs_s4_copy = match self.backend.head_object(head_req).await {
5506            Ok(h) => {
5507                is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
5508            }
5509            Err(_) => false,
5510        };
5511        if !needs_s4_copy {
5512            return self.backend.upload_part_copy(req).await;
5513        }
5514
5515        // Resolve the optional source byte range to pass to GET.
5516        let source_range = req
5517            .input
5518            .copy_source_range
5519            .as_ref()
5520            .map(|r| parse_copy_source_range(r))
5521            .transpose()
5522            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
5523
5524        // GET source via S4 (handles decompression + sidecar partial fetch
5525        // when range is present). The result is the requested user-visible
5526        // byte range, fully decompressed. version_id is propagated so
5527        // pinned-version copies fetch the exact version requested.
5528        let mut get_input = GetObjectInput {
5529            bucket: src_bucket.clone(),
5530            key: src_key.clone(),
5531            version_id: src_version_id.clone(),
5532            ..Default::default()
5533        };
5534        get_input.range = source_range;
5535        let get_req = S3Request {
5536            input: get_input,
5537            method: http::Method::GET,
5538            uri: req.uri.clone(),
5539            headers: req.headers.clone(),
5540            extensions: http::Extensions::new(),
5541            credentials: req.credentials.clone(),
5542            region: req.region.clone(),
5543            service: req.service.clone(),
5544            trailing_headers: None,
5545        };
5546        let get_resp = self.get_object(get_req).await?;
5547        let blob = get_resp.output.body.ok_or_else(|| {
5548            S3Error::with_message(
5549                S3ErrorCode::InternalError,
5550                "upload_part_copy: empty body from source GET",
5551            )
5552        })?;
5553        let bytes = collect_blob(blob, self.max_body_bytes)
5554            .await
5555            .map_err(internal("collect upload_part_copy source body"))?;
5556
5557        // Compress + frame as a fresh part (mirrors upload_part path).
5558        let sample_len = bytes.len().min(SAMPLE_BYTES);
5559        // v0.8 #56: same size-hint promotion as the upload_part path.
5560        let codec_kind = self
5561            .dispatcher
5562            .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
5563            .await;
5564        let original_size = bytes.len() as u64;
5565        // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
5566        let (compress_res, tel) = self
5567            .registry
5568            .compress_with_telemetry(bytes, codec_kind)
5569            .await;
5570        stamp_gpu_compress_telemetry(&tel);
5571        let (compressed, manifest) =
5572            compress_res.map_err(internal("registry compress upload_part_copy"))?;
5573        let header = FrameHeader {
5574            codec: codec_kind,
5575            original_size,
5576            compressed_size: compressed.len() as u64,
5577            crc32c: manifest.crc32c,
5578        };
5579        let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
5580        write_frame(&mut framed, header, &compressed);
5581        let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
5582        if !likely_final {
5583            pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
5584        }
5585        let framed_bytes = framed.freeze();
5586        let framed_len = framed_bytes.len() as i64;
5587
5588        // Forward as upload_part to the destination multipart upload.
5589        let part_input = UploadPartInput {
5590            bucket: req.input.bucket.clone(),
5591            key: req.input.key.clone(),
5592            part_number: req.input.part_number,
5593            upload_id: req.input.upload_id.clone(),
5594            body: Some(bytes_to_blob(framed_bytes)),
5595            content_length: Some(framed_len),
5596            ..Default::default()
5597        };
5598        let part_req = S3Request {
5599            input: part_input,
5600            method: http::Method::PUT,
5601            uri: req.uri.clone(),
5602            headers: req.headers.clone(),
5603            extensions: http::Extensions::new(),
5604            credentials: req.credentials.clone(),
5605            region: req.region.clone(),
5606            service: req.service.clone(),
5607            trailing_headers: None,
5608        };
5609        let upload_resp = self.backend.upload_part(part_req).await?;
5610
5611        let copy_output = UploadPartCopyOutput {
5612            copy_part_result: Some(CopyPartResult {
5613                e_tag: upload_resp.output.e_tag.clone(),
5614                ..Default::default()
5615            }),
5616            ..Default::default()
5617        };
5618        Ok(S3Response::new(copy_output))
5619    }
5620
5621    // ---- Object lock / retention / legal hold (v0.5 #30) ----
5622    //
5623    // When an `ObjectLockManager` is attached the configuration / per-object
5624    // state lives in the manager and these handlers serve directly from it;
5625    // when no manager is attached they fall back to the backend (legacy
5626    // passthrough so v0.4 deployments are unaffected).
5627    async fn get_object_lock_configuration(
5628        &self,
5629        req: S3Request<GetObjectLockConfigurationInput>,
5630    ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
5631        self.enforce_policy(
5632            &req,
5633            "s3:GetBucketObjectLockConfiguration",
5634            &req.input.bucket,
5635            None,
5636        )?;
5637        if let Some(mgr) = self.object_lock.as_ref() {
5638            let cfg = mgr
5639                .bucket_default(&req.input.bucket)
5640                .map(|d| ObjectLockConfiguration {
5641                    object_lock_enabled: Some(ObjectLockEnabled::from_static(
5642                        ObjectLockEnabled::ENABLED,
5643                    )),
5644                    rule: Some(ObjectLockRule {
5645                        default_retention: Some(DefaultRetention {
5646                            days: Some(d.retention_days as i32),
5647                            mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
5648                                crate::object_lock::LockMode::Governance => {
5649                                    ObjectLockRetentionMode::GOVERNANCE
5650                                }
5651                                crate::object_lock::LockMode::Compliance => {
5652                                    ObjectLockRetentionMode::COMPLIANCE
5653                                }
5654                            })),
5655                            years: None,
5656                        }),
5657                    }),
5658                });
5659            let output = GetObjectLockConfigurationOutput {
5660                object_lock_configuration: cfg,
5661            };
5662            return Ok(S3Response::new(output));
5663        }
5664        self.backend.get_object_lock_configuration(req).await
5665    }
5666    async fn put_object_lock_configuration(
5667        &self,
5668        req: S3Request<PutObjectLockConfigurationInput>,
5669    ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
5670        self.enforce_policy(
5671            &req,
5672            "s3:PutBucketObjectLockConfiguration",
5673            &req.input.bucket,
5674            None,
5675        )?;
5676        if let Some(mgr) = self.object_lock.as_ref() {
5677            let bucket = req.input.bucket.clone();
5678            if let Some(cfg) = req.input.object_lock_configuration.as_ref()
5679                && let Some(rule) = cfg.rule.as_ref()
5680                && let Some(d) = rule.default_retention.as_ref()
5681            {
5682                let mode = d
5683                    .mode
5684                    .as_ref()
5685                    .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
5686                    .ok_or_else(|| {
5687                        S3Error::with_message(
5688                            S3ErrorCode::InvalidRequest,
5689                            "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
5690                        )
5691                    })?;
5692                // S3 spec: exactly one of Days / Years (we accept Days
5693                // outright and convert Years → Days for storage; Years
5694                // is just a UX shorthand on the wire).
5695                let days: u32 = match (d.days, d.years) {
5696                    (Some(d), None) if d > 0 => d as u32,
5697                    (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
5698                    _ => {
5699                        return Err(S3Error::with_message(
5700                            S3ErrorCode::InvalidRequest,
5701                            "Object Lock default retention requires exactly one of Days or Years (positive integer)",
5702                        ));
5703                    }
5704                };
5705                mgr.set_bucket_default(
5706                    &bucket,
5707                    crate::object_lock::BucketObjectLockDefault {
5708                        mode,
5709                        retention_days: days,
5710                    },
5711                );
5712            }
5713            return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
5714        }
5715        self.backend.put_object_lock_configuration(req).await
5716    }
5717    async fn get_object_legal_hold(
5718        &self,
5719        req: S3Request<GetObjectLegalHoldInput>,
5720    ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
5721        let key = req.input.key.clone();
5722        self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
5723        if let Some(mgr) = self.object_lock.as_ref() {
5724            let on = mgr
5725                .get(&req.input.bucket, &req.input.key)
5726                .map(|s| s.legal_hold_on)
5727                .unwrap_or(false);
5728            let status = ObjectLockLegalHoldStatus::from_static(if on {
5729                ObjectLockLegalHoldStatus::ON
5730            } else {
5731                ObjectLockLegalHoldStatus::OFF
5732            });
5733            let output = GetObjectLegalHoldOutput {
5734                legal_hold: Some(ObjectLockLegalHold {
5735                    status: Some(status),
5736                }),
5737            };
5738            return Ok(S3Response::new(output));
5739        }
5740        self.backend.get_object_legal_hold(req).await
5741    }
5742    async fn put_object_legal_hold(
5743        &self,
5744        req: S3Request<PutObjectLegalHoldInput>,
5745    ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
5746        let key = req.input.key.clone();
5747        self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
5748        if let Some(mgr) = self.object_lock.as_ref() {
5749            let on = req
5750                .input
5751                .legal_hold
5752                .as_ref()
5753                .and_then(|h| h.status.as_ref())
5754                .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
5755                .unwrap_or(false);
5756            mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
5757            return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
5758        }
5759        self.backend.put_object_legal_hold(req).await
5760    }
5761    async fn get_object_retention(
5762        &self,
5763        req: S3Request<GetObjectRetentionInput>,
5764    ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
5765        let key = req.input.key.clone();
5766        self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
5767        if let Some(mgr) = self.object_lock.as_ref() {
5768            let retention = mgr
5769                .get(&req.input.bucket, &req.input.key)
5770                .filter(|s| s.mode.is_some() || s.retain_until.is_some())
5771                .map(|s| {
5772                    let mode = s.mode.map(|m| {
5773                        ObjectLockRetentionMode::from_static(match m {
5774                            crate::object_lock::LockMode::Governance => {
5775                                ObjectLockRetentionMode::GOVERNANCE
5776                            }
5777                            crate::object_lock::LockMode::Compliance => {
5778                                ObjectLockRetentionMode::COMPLIANCE
5779                            }
5780                        })
5781                    });
5782                    let until = s.retain_until.map(chrono_utc_to_timestamp);
5783                    ObjectLockRetention {
5784                        mode,
5785                        retain_until_date: until,
5786                    }
5787                });
5788            let output = GetObjectRetentionOutput { retention };
5789            return Ok(S3Response::new(output));
5790        }
5791        self.backend.get_object_retention(req).await
5792    }
5793    async fn put_object_retention(
5794        &self,
5795        req: S3Request<PutObjectRetentionInput>,
5796    ) -> S3Result<S3Response<PutObjectRetentionOutput>> {
5797        let key = req.input.key.clone();
5798        self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
5799        if let Some(mgr) = self.object_lock.as_ref() {
5800            let bucket = req.input.bucket.clone();
5801            let key = req.input.key.clone();
5802            // v0.8.12 HIGH-7 fix: the bypass header gates Governance
5803            // shortening only when the caller has the matching IAM
5804            // action explicitly allowed; otherwise it's silently
5805            // dropped to `false` and the "shortening Governance
5806            // requires bypass" branch below rejects.
5807            let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
5808            let bypass = if bypass_header {
5809                self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
5810                    .is_ok()
5811            } else {
5812                false
5813            };
5814            let retention = req.input.retention.as_ref().ok_or_else(|| {
5815                S3Error::with_message(
5816                    S3ErrorCode::InvalidRequest,
5817                    "PutObjectRetention requires a Retention element",
5818                )
5819            })?;
5820            let new_mode = retention
5821                .mode
5822                .as_ref()
5823                .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
5824            let new_until = retention
5825                .retain_until_date
5826                .as_ref()
5827                .map(timestamp_to_chrono_utc)
5828                .unwrap_or(None);
5829            let now = chrono::Utc::now();
5830            let existing = mgr.get(&bucket, &key).unwrap_or_default();
5831            // S3 immutability rules:
5832            //   - Compliance is one-way: once set, mode cannot move to
5833            //     Governance, and retain-until cannot be shortened.
5834            //   - Governance can be lengthened freely; shortened only
5835            //     with bypass=true.
5836            if let Some(existing_mode) = existing.mode
5837                && existing_mode == crate::object_lock::LockMode::Compliance
5838                && existing.is_locked(now)
5839            {
5840                if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
5841                    return Err(S3Error::with_message(
5842                        S3ErrorCode::AccessDenied,
5843                        "Cannot downgrade Compliance retention to Governance while lock is active",
5844                    ));
5845                }
5846                if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5847                    && next < prev
5848                {
5849                    return Err(S3Error::with_message(
5850                        S3ErrorCode::AccessDenied,
5851                        "Cannot shorten Compliance retention while lock is active",
5852                    ));
5853                }
5854            }
5855            if let Some(existing_mode) = existing.mode
5856                && existing_mode == crate::object_lock::LockMode::Governance
5857                && existing.is_locked(now)
5858                && !bypass
5859                && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5860                && next < prev
5861            {
5862                return Err(S3Error::with_message(
5863                    S3ErrorCode::AccessDenied,
5864                    "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
5865                ));
5866            }
5867            let mut state = existing;
5868            if new_mode.is_some() {
5869                state.mode = new_mode;
5870            }
5871            if new_until.is_some() {
5872                state.retain_until = new_until;
5873            }
5874            mgr.set(&bucket, &key, state);
5875            return Ok(S3Response::new(PutObjectRetentionOutput::default()));
5876        }
5877        self.backend.put_object_retention(req).await
5878    }
5879
5880    // ---- Versioning ----
5881    // list_object_versions is implemented above in the compression-hook
5882    // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
5883    // VersioningManager is attached (v0.5 #34), serves chains directly
5884    // from the in-memory index.
5885    async fn get_bucket_versioning(
5886        &self,
5887        req: S3Request<GetBucketVersioningInput>,
5888    ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
5889        // v0.5 #34: when a VersioningManager is attached, the bucket's
5890        // versioning state lives in the manager (= S4-server's
5891        // authoritative source). Pass-through hits the backend only
5892        // when no manager is configured (legacy v0.4 behaviour).
5893        if let Some(mgr) = self.versioning.as_ref() {
5894            let output = match mgr.state(&req.input.bucket).as_aws_status() {
5895                Some(s) => GetBucketVersioningOutput {
5896                    status: Some(BucketVersioningStatus::from(s.to_owned())),
5897                    ..Default::default()
5898                },
5899                None => GetBucketVersioningOutput::default(),
5900            };
5901            return Ok(S3Response::new(output));
5902        }
5903        self.backend.get_bucket_versioning(req).await
5904    }
5905    async fn put_bucket_versioning(
5906        &self,
5907        req: S3Request<PutBucketVersioningInput>,
5908    ) -> S3Result<S3Response<PutBucketVersioningOutput>> {
5909        // v0.6 #42: MFA gating on the `PutBucketVersioning` request
5910        // itself. S3 spec: when the request body carries an
5911        // `MfaDelete` element (either `Enabled` or `Disabled`), the
5912        // request must include a valid `x-amz-mfa` token — both for
5913        // the *first* enable (so the operator can't quietly side-step
5914        // the gate by never enabling it) and for any subsequent
5915        // change (so a leaked credential alone can't disable MFA
5916        // Delete to bypass it on subsequent DELETEs). Requests that
5917        // omit the `MfaDelete` element entirely (i.e. they flip only
5918        // `Status`) skip this gate, matching AWS.
5919        if let Some(mgr) = self.mfa_delete.as_ref()
5920            && let Some(target_enabled) = req
5921                .input
5922                .versioning_configuration
5923                .mfa_delete
5924                .as_ref()
5925                .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
5926        {
5927            let bucket = req.input.bucket.clone();
5928            let header = req.input.mfa.as_deref();
5929            let secret = mgr.lookup_secret(&bucket);
5930            let verified = match (header, secret.as_ref()) {
5931                (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
5932                    Ok((serial, code)) => {
5933                        serial == s.serial
5934                            && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
5935                    }
5936                    Err(_) => false,
5937                },
5938                _ => false,
5939            };
5940            if !verified {
5941                crate::metrics::record_mfa_delete_denial(&bucket);
5942                let err = if header.is_none() {
5943                    crate::mfa::MfaError::Missing
5944                } else {
5945                    crate::mfa::MfaError::InvalidCode
5946                };
5947                return Err(mfa_error_to_s3(err));
5948            }
5949            mgr.set_bucket_state(&bucket, target_enabled);
5950        }
5951        // v0.5 #34: stash the new state in the manager, then forward to
5952        // the backend so any downstream that *also* tracks state
5953        // (e.g. a real S3 backend) stays in sync. Manager-attached but
5954        // backend rejection is treated as a soft-fail (state is still
5955        // owned by the manager).
5956        if let Some(mgr) = self.versioning.as_ref() {
5957            let new_state = match req
5958                .input
5959                .versioning_configuration
5960                .status
5961                .as_ref()
5962                .map(|s| s.as_str())
5963            {
5964                Some(s) if s.eq_ignore_ascii_case("Enabled") => {
5965                    crate::versioning::VersioningState::Enabled
5966                }
5967                Some(s) if s.eq_ignore_ascii_case("Suspended") => {
5968                    crate::versioning::VersioningState::Suspended
5969                }
5970                _ => crate::versioning::VersioningState::Unversioned,
5971            };
5972            mgr.set_state(&req.input.bucket, new_state);
5973            return Ok(S3Response::new(PutBucketVersioningOutput::default()));
5974        }
5975        self.backend.put_bucket_versioning(req).await
5976    }
5977
5978    // ---- Bucket location ----
5979    async fn get_bucket_location(
5980        &self,
5981        req: S3Request<GetBucketLocationInput>,
5982    ) -> S3Result<S3Response<GetBucketLocationOutput>> {
5983        self.backend.get_bucket_location(req).await
5984    }
5985
5986    // ---- Bucket policy ----
5987    async fn get_bucket_policy(
5988        &self,
5989        req: S3Request<GetBucketPolicyInput>,
5990    ) -> S3Result<S3Response<GetBucketPolicyOutput>> {
5991        self.backend.get_bucket_policy(req).await
5992    }
5993    async fn put_bucket_policy(
5994        &self,
5995        req: S3Request<PutBucketPolicyInput>,
5996    ) -> S3Result<S3Response<PutBucketPolicyOutput>> {
5997        self.backend.put_bucket_policy(req).await
5998    }
5999    async fn delete_bucket_policy(
6000        &self,
6001        req: S3Request<DeleteBucketPolicyInput>,
6002    ) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
6003        self.backend.delete_bucket_policy(req).await
6004    }
6005    async fn get_bucket_policy_status(
6006        &self,
6007        req: S3Request<GetBucketPolicyStatusInput>,
6008    ) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
6009        self.backend.get_bucket_policy_status(req).await
6010    }
6011
6012    // ---- Bucket ACL ----
6013    async fn get_bucket_acl(
6014        &self,
6015        req: S3Request<GetBucketAclInput>,
6016    ) -> S3Result<S3Response<GetBucketAclOutput>> {
6017        self.backend.get_bucket_acl(req).await
6018    }
6019    async fn put_bucket_acl(
6020        &self,
6021        req: S3Request<PutBucketAclInput>,
6022    ) -> S3Result<S3Response<PutBucketAclOutput>> {
6023        self.backend.put_bucket_acl(req).await
6024    }
6025
6026    // ---- Bucket CORS (v0.6 #38) ----
6027    async fn get_bucket_cors(
6028        &self,
6029        req: S3Request<GetBucketCorsInput>,
6030    ) -> S3Result<S3Response<GetBucketCorsOutput>> {
6031        if let Some(mgr) = self.cors.as_ref() {
6032            let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6033                S3Error::with_message(
6034                    S3ErrorCode::NoSuchCORSConfiguration,
6035                    "The CORS configuration does not exist".to_string(),
6036                )
6037            })?;
6038            let rules: Vec<CORSRule> = cfg
6039                .rules
6040                .into_iter()
6041                .map(|r| CORSRule {
6042                    allowed_headers: if r.allowed_headers.is_empty() {
6043                        None
6044                    } else {
6045                        Some(r.allowed_headers)
6046                    },
6047                    allowed_methods: r.allowed_methods,
6048                    allowed_origins: r.allowed_origins,
6049                    expose_headers: if r.expose_headers.is_empty() {
6050                        None
6051                    } else {
6052                        Some(r.expose_headers)
6053                    },
6054                    id: r.id,
6055                    max_age_seconds: r.max_age_seconds.map(|s| s as i32),
6056                })
6057                .collect();
6058            return Ok(S3Response::new(GetBucketCorsOutput {
6059                cors_rules: Some(rules),
6060            }));
6061        }
6062        self.backend.get_bucket_cors(req).await
6063    }
6064    async fn put_bucket_cors(
6065        &self,
6066        req: S3Request<PutBucketCorsInput>,
6067    ) -> S3Result<S3Response<PutBucketCorsOutput>> {
6068        if let Some(mgr) = self.cors.as_ref() {
6069            let cfg = crate::cors::CorsConfig {
6070                rules: req
6071                    .input
6072                    .cors_configuration
6073                    .cors_rules
6074                    .into_iter()
6075                    .map(|r| crate::cors::CorsRule {
6076                        allowed_origins: r.allowed_origins,
6077                        allowed_methods: r.allowed_methods,
6078                        allowed_headers: r.allowed_headers.unwrap_or_default(),
6079                        expose_headers: r.expose_headers.unwrap_or_default(),
6080                        max_age_seconds: r
6081                            .max_age_seconds
6082                            .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
6083                        id: r.id,
6084                    })
6085                    .collect(),
6086            };
6087            // v0.8.15 M-3: AWS S3 rejects `AllowedMethods` outside
6088            // the canonical {GET,PUT,POST,DELETE,HEAD} set (including
6089            // the `*` wildcard). Validate at PutBucketCors time so
6090            // operators see the misconfiguration in the API response
6091            // instead of having silently-broken preflights at the
6092            // browser later.
6093            if let Err(e) = crate::cors::CorsManager::validate(&cfg) {
6094                return Err(S3Error::with_message(
6095                    S3ErrorCode::InvalidArgument,
6096                    e.to_string(),
6097                ));
6098            }
6099            mgr.put(&req.input.bucket, cfg);
6100            return Ok(S3Response::new(PutBucketCorsOutput::default()));
6101        }
6102        self.backend.put_bucket_cors(req).await
6103    }
6104    async fn delete_bucket_cors(
6105        &self,
6106        req: S3Request<DeleteBucketCorsInput>,
6107    ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
6108        if let Some(mgr) = self.cors.as_ref() {
6109            mgr.delete(&req.input.bucket);
6110            return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
6111        }
6112        self.backend.delete_bucket_cors(req).await
6113    }
6114
6115    // ---- Bucket lifecycle (v0.6 #37) ----
6116    async fn get_bucket_lifecycle_configuration(
6117        &self,
6118        req: S3Request<GetBucketLifecycleConfigurationInput>,
6119    ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
6120        if let Some(mgr) = self.lifecycle.as_ref() {
6121            let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6122                S3Error::with_message(
6123                    S3ErrorCode::NoSuchLifecycleConfiguration,
6124                    "The lifecycle configuration does not exist".to_string(),
6125                )
6126            })?;
6127            let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
6128            return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
6129                rules: Some(rules),
6130                transition_default_minimum_object_size: None,
6131            }));
6132        }
6133        self.backend.get_bucket_lifecycle_configuration(req).await
6134    }
6135    async fn put_bucket_lifecycle_configuration(
6136        &self,
6137        req: S3Request<PutBucketLifecycleConfigurationInput>,
6138    ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
6139        if let Some(mgr) = self.lifecycle.as_ref() {
6140            let bucket = req.input.bucket.clone();
6141            let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
6142            let cfg = dto_lifecycle_to_internal(&dto_cfg);
6143            mgr.put(&bucket, cfg);
6144            return Ok(S3Response::new(
6145                PutBucketLifecycleConfigurationOutput::default(),
6146            ));
6147        }
6148        self.backend.put_bucket_lifecycle_configuration(req).await
6149    }
6150    async fn delete_bucket_lifecycle(
6151        &self,
6152        req: S3Request<DeleteBucketLifecycleInput>,
6153    ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
6154        if let Some(mgr) = self.lifecycle.as_ref() {
6155            mgr.delete(&req.input.bucket);
6156            return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
6157        }
6158        self.backend.delete_bucket_lifecycle(req).await
6159    }
6160
6161    // ---- Bucket tagging (v0.6 #39) ----
6162    async fn get_bucket_tagging(
6163        &self,
6164        req: S3Request<GetBucketTaggingInput>,
6165    ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
6166        let Some(mgr) = self.tagging.as_ref() else {
6167            return self.backend.get_bucket_tagging(req).await;
6168        };
6169        let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
6170        Ok(S3Response::new(GetBucketTaggingOutput {
6171            tag_set: tagset_to_aws(&tags),
6172        }))
6173    }
6174    async fn put_bucket_tagging(
6175        &self,
6176        req: S3Request<PutBucketTaggingInput>,
6177    ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
6178        let Some(mgr) = self.tagging.as_ref() else {
6179            return self.backend.put_bucket_tagging(req).await;
6180        };
6181        let bucket = req.input.bucket.clone();
6182        let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6183            .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6184        self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6185        mgr.put_bucket_tags(&bucket, parsed);
6186        Ok(S3Response::new(PutBucketTaggingOutput::default()))
6187    }
6188    async fn delete_bucket_tagging(
6189        &self,
6190        req: S3Request<DeleteBucketTaggingInput>,
6191    ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
6192        let Some(mgr) = self.tagging.as_ref() else {
6193            return self.backend.delete_bucket_tagging(req).await;
6194        };
6195        let bucket = req.input.bucket.clone();
6196        self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6197        mgr.delete_bucket_tags(&bucket);
6198        Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
6199    }
6200
6201    // ---- Bucket encryption ----
6202    async fn get_bucket_encryption(
6203        &self,
6204        req: S3Request<GetBucketEncryptionInput>,
6205    ) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
6206        self.backend.get_bucket_encryption(req).await
6207    }
6208    async fn put_bucket_encryption(
6209        &self,
6210        req: S3Request<PutBucketEncryptionInput>,
6211    ) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
6212        self.backend.put_bucket_encryption(req).await
6213    }
6214    async fn delete_bucket_encryption(
6215        &self,
6216        req: S3Request<DeleteBucketEncryptionInput>,
6217    ) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
6218        self.backend.delete_bucket_encryption(req).await
6219    }
6220
6221    // ---- Bucket logging ----
6222    async fn get_bucket_logging(
6223        &self,
6224        req: S3Request<GetBucketLoggingInput>,
6225    ) -> S3Result<S3Response<GetBucketLoggingOutput>> {
6226        self.backend.get_bucket_logging(req).await
6227    }
6228    async fn put_bucket_logging(
6229        &self,
6230        req: S3Request<PutBucketLoggingInput>,
6231    ) -> S3Result<S3Response<PutBucketLoggingOutput>> {
6232        self.backend.put_bucket_logging(req).await
6233    }
6234
6235    // ---- Bucket notification (v0.6 #35) ----
6236    //
6237    // When a `NotificationManager` is attached, S4 itself owns per-bucket
6238    // notification configurations and the PUT / GET handlers route through
6239    // the manager. The wire DTO's queue / topic configurations map onto
6240    // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6241    // EventBridge configurations are accepted on PUT but silently dropped
6242    // (out of scope for v0.6 #35). When no manager is attached the legacy
6243    // backend-passthrough behaviour applies.
6244    async fn get_bucket_notification_configuration(
6245        &self,
6246        req: S3Request<GetBucketNotificationConfigurationInput>,
6247    ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6248        if let Some(mgr) = self.notifications.as_ref() {
6249            let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6250            let dto = notif_to_dto(&cfg);
6251            return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6252                event_bridge_configuration: dto.event_bridge_configuration,
6253                lambda_function_configurations: dto.lambda_function_configurations,
6254                queue_configurations: dto.queue_configurations,
6255                topic_configurations: dto.topic_configurations,
6256            }));
6257        }
6258        self.backend
6259            .get_bucket_notification_configuration(req)
6260            .await
6261    }
6262    async fn put_bucket_notification_configuration(
6263        &self,
6264        req: S3Request<PutBucketNotificationConfigurationInput>,
6265    ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6266        if let Some(mgr) = self.notifications.as_ref() {
6267            let cfg = notif_from_dto(&req.input.notification_configuration);
6268            mgr.put(&req.input.bucket, cfg);
6269            return Ok(S3Response::new(
6270                PutBucketNotificationConfigurationOutput::default(),
6271            ));
6272        }
6273        self.backend
6274            .put_bucket_notification_configuration(req)
6275            .await
6276    }
6277
6278    // ---- Bucket request payment ----
    /// Forwards `GetBucketRequestPayment` to the backend unchanged and
    /// returns the backend's response.
    async fn get_bucket_request_payment(
        &self,
        req: S3Request<GetBucketRequestPaymentInput>,
    ) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
        self.backend.get_bucket_request_payment(req).await
    }
    /// Forwards `PutBucketRequestPayment` to the backend unchanged and
    /// returns the backend's response.
    async fn put_bucket_request_payment(
        &self,
        req: S3Request<PutBucketRequestPaymentInput>,
    ) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
        self.backend.put_bucket_request_payment(req).await
    }
6291
6292    // ---- Bucket website ----
    /// Forwards `GetBucketWebsite` to the backend unchanged and returns the
    /// backend's response.
    async fn get_bucket_website(
        &self,
        req: S3Request<GetBucketWebsiteInput>,
    ) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
        self.backend.get_bucket_website(req).await
    }
    /// Forwards `PutBucketWebsite` to the backend unchanged and returns the
    /// backend's response.
    async fn put_bucket_website(
        &self,
        req: S3Request<PutBucketWebsiteInput>,
    ) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
        self.backend.put_bucket_website(req).await
    }
    /// Forwards `DeleteBucketWebsite` to the backend unchanged and returns
    /// the backend's response.
    async fn delete_bucket_website(
        &self,
        req: S3Request<DeleteBucketWebsiteInput>,
    ) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
        self.backend.delete_bucket_website(req).await
    }
6311
6312    // ---- Bucket replication (v0.6 #40) ----
6313    async fn get_bucket_replication(
6314        &self,
6315        req: S3Request<GetBucketReplicationInput>,
6316    ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6317        if let Some(mgr) = self.replication.as_ref() {
6318            return match mgr.get(&req.input.bucket) {
6319                Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6320                    replication_configuration: Some(replication_to_dto(&cfg)),
6321                })),
6322                None => Err(S3Error::with_message(
6323                    S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6324                    format!(
6325                        "no replication configuration on bucket {}",
6326                        req.input.bucket
6327                    ),
6328                )),
6329            };
6330        }
6331        self.backend.get_bucket_replication(req).await
6332    }
6333    async fn put_bucket_replication(
6334        &self,
6335        req: S3Request<PutBucketReplicationInput>,
6336    ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6337        if let Some(mgr) = self.replication.as_ref() {
6338            let cfg = replication_from_dto(&req.input.replication_configuration);
6339            mgr.put(&req.input.bucket, cfg);
6340            return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6341        }
6342        self.backend.put_bucket_replication(req).await
6343    }
6344    async fn delete_bucket_replication(
6345        &self,
6346        req: S3Request<DeleteBucketReplicationInput>,
6347    ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6348        if let Some(mgr) = self.replication.as_ref() {
6349            mgr.delete(&req.input.bucket);
6350            return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6351        }
6352        self.backend.delete_bucket_replication(req).await
6353    }
6354
6355    // ---- Bucket accelerate ----
    /// Forwards `GetBucketAccelerateConfiguration` to the backend unchanged
    /// and returns the backend's response.
    async fn get_bucket_accelerate_configuration(
        &self,
        req: S3Request<GetBucketAccelerateConfigurationInput>,
    ) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
        self.backend.get_bucket_accelerate_configuration(req).await
    }
    /// Forwards `PutBucketAccelerateConfiguration` to the backend unchanged
    /// and returns the backend's response.
    async fn put_bucket_accelerate_configuration(
        &self,
        req: S3Request<PutBucketAccelerateConfigurationInput>,
    ) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
        self.backend.put_bucket_accelerate_configuration(req).await
    }
6368
6369    // ---- Bucket ownership controls ----
    /// Forwards `GetBucketOwnershipControls` to the backend unchanged and
    /// returns the backend's response.
    async fn get_bucket_ownership_controls(
        &self,
        req: S3Request<GetBucketOwnershipControlsInput>,
    ) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
        self.backend.get_bucket_ownership_controls(req).await
    }
    /// Forwards `PutBucketOwnershipControls` to the backend unchanged and
    /// returns the backend's response.
    async fn put_bucket_ownership_controls(
        &self,
        req: S3Request<PutBucketOwnershipControlsInput>,
    ) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
        self.backend.put_bucket_ownership_controls(req).await
    }
    /// Forwards `DeleteBucketOwnershipControls` to the backend unchanged and
    /// returns the backend's response.
    async fn delete_bucket_ownership_controls(
        &self,
        req: S3Request<DeleteBucketOwnershipControlsInput>,
    ) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
        self.backend.delete_bucket_ownership_controls(req).await
    }
6388
6389    // ---- Public access block ----
    /// Forwards `GetPublicAccessBlock` to the backend unchanged and returns
    /// the backend's response.
    async fn get_public_access_block(
        &self,
        req: S3Request<GetPublicAccessBlockInput>,
    ) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
        self.backend.get_public_access_block(req).await
    }
    /// Forwards `PutPublicAccessBlock` to the backend unchanged and returns
    /// the backend's response.
    async fn put_public_access_block(
        &self,
        req: S3Request<PutPublicAccessBlockInput>,
    ) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
        self.backend.put_public_access_block(req).await
    }
    /// Forwards `DeletePublicAccessBlock` to the backend unchanged and
    /// returns the backend's response.
    async fn delete_public_access_block(
        &self,
        req: S3Request<DeletePublicAccessBlockInput>,
    ) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
        self.backend.delete_public_access_block(req).await
    }
6408
6409    // ====================================================================
6410    // v0.6 #41: S3 Select — server-side SQL filter on object body.
6411    //
6412    // Fetch the object via the regular `get_object` path (so SSE-C /
6413    // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
6414    // run a small SQL subset (CSV + JSON Lines, equality / inequality /
6415    // LIKE / AND / OR / NOT) over the in-memory body, and stream the
6416    // matched rows back as AWS event-stream `Records` + `Stats` + `End`
6417    // frames.
6418    //
6419    // Limitations (deliberate, documented):
6420    //   - Parquet input is rejected with NotImplemented.
6421    //   - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
6422    //     parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
6423    //     domain-specific `InvalidSqlExpression` code).
6424    //   - The body is fully buffered before SQL evaluation (S3 Select
6425    //     streaming-during-evaluation is v0.7 scope).
6426    //   - GPU-accelerated WHERE evaluation is stubbed out (always None).
6427    async fn select_object_content(
6428        &self,
6429        req: S3Request<SelectObjectContentInput>,
6430    ) -> S3Result<S3Response<SelectObjectContentOutput>> {
6431        use crate::select::{
6432            EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
6433            run_select_jsonlines,
6434        };
6435
6436        let select_bucket = req.input.bucket.clone();
6437        let select_key = req.input.key.clone();
6438        self.enforce_rate_limit(&req, &select_bucket)?;
6439        self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
6440
6441        let request = req.input.request;
6442        let sql = request.expression.clone();
6443        if request.expression_type.as_str() != "SQL" {
6444            return Err(S3Error::with_message(
6445                S3ErrorCode::InvalidExpressionType,
6446                format!(
6447                    "ExpressionType must be SQL, got: {}",
6448                    request.expression_type.as_str()
6449                ),
6450            ));
6451        }
6452
6453        let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
6454            SelectInputFormat::JsonLines
6455        } else if let Some(csv) = request.input_serialization.csv.as_ref() {
6456            let has_header = csv
6457                .file_header_info
6458                .as_ref()
6459                .map(|h| {
6460                    let s = h.as_str();
6461                    s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
6462                })
6463                .unwrap_or(false);
6464            let delim = csv
6465                .field_delimiter
6466                .as_deref()
6467                .and_then(|s| s.chars().next())
6468                .unwrap_or(',');
6469            SelectInputFormat::Csv {
6470                has_header,
6471                delimiter: delim,
6472            }
6473        } else if request.input_serialization.parquet.is_some() {
6474            return Err(S3Error::with_message(
6475                S3ErrorCode::NotImplemented,
6476                "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
6477            ));
6478        } else {
6479            return Err(S3Error::with_message(
6480                S3ErrorCode::InvalidRequest,
6481                "InputSerialization requires exactly one of CSV / JSON / Parquet",
6482            ));
6483        };
6484        if let Some(ct) = request.input_serialization.compression_type.as_ref()
6485            && !ct.as_str().eq_ignore_ascii_case("NONE")
6486        {
6487            return Err(S3Error::with_message(
6488                S3ErrorCode::NotImplemented,
6489                format!(
6490                    "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
6491                    ct.as_str()
6492                ),
6493            ));
6494        }
6495
6496        let output_format = if request.output_serialization.json.is_some() {
6497            SelectOutputFormat::Json
6498        } else if request.output_serialization.csv.is_some() {
6499            SelectOutputFormat::Csv
6500        } else {
6501            return Err(S3Error::with_message(
6502                S3ErrorCode::InvalidRequest,
6503                "OutputSerialization requires exactly one of CSV / JSON",
6504            ));
6505        };
6506
6507        let get_input = GetObjectInput {
6508            bucket: select_bucket.clone(),
6509            key: select_key.clone(),
6510            sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
6511            sse_customer_key: req.input.sse_customer_key.clone(),
6512            sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
6513            ..Default::default()
6514        };
6515        let get_req = S3Request {
6516            input: get_input,
6517            method: http::Method::GET,
6518            uri: format!("/{}/{}", select_bucket, select_key)
6519                .parse()
6520                .map_err(|e| {
6521                    S3Error::with_message(
6522                        S3ErrorCode::InternalError,
6523                        format!("constructing inner GET URI: {e}"),
6524                    )
6525                })?,
6526            headers: http::HeaderMap::new(),
6527            extensions: http::Extensions::new(),
6528            credentials: req.credentials.clone(),
6529            region: req.region.clone(),
6530            service: req.service.clone(),
6531            trailing_headers: None,
6532        };
6533        let mut get_resp = self.get_object(get_req).await?;
6534        let blob = get_resp.output.body.take().ok_or_else(|| {
6535            S3Error::with_message(
6536                S3ErrorCode::InternalError,
6537                "Select: object body was empty after GET",
6538            )
6539        })?;
6540        let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
6541            .await
6542            .map_err(internal("collect Select body"))?;
6543        let scanned = body_bytes.len() as u64;
6544
6545        let matched_payload = match input_format {
6546            SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
6547                .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
6548            SelectInputFormat::Csv { .. } => {
6549                run_select_csv(&sql, &body_bytes, input_format, output_format)
6550                    .map_err(|e| select_error_to_s3(e, "CSV"))?
6551            }
6552        };
6553
6554        let returned = matched_payload.len() as u64;
6555        let processed = scanned;
6556        let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
6557        if !matched_payload.is_empty() {
6558            events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
6559                payload: Some(bytes::Bytes::from(matched_payload)),
6560            })));
6561        }
6562        events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
6563            details: Some(Stats {
6564                bytes_scanned: Some(scanned as i64),
6565                bytes_processed: Some(processed as i64),
6566                bytes_returned: Some(returned as i64),
6567            }),
6568        })));
6569        events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
6570        // Touch EventStreamWriter so the public API stays linked into the
6571        // build (the actual wire framing is delegated to s3s).
6572        let _writer = EventStreamWriter::new();
6573
6574        let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
6575        let output = SelectObjectContentOutput {
6576            payload: Some(stream),
6577        };
6578        Ok(S3Response::new(output))
6579    }
6580
6581    // ---- Bucket Inventory configuration (v0.6 #36) ----
6582    //
6583    // When an `InventoryManager` is attached, S4-server owns the
6584    // configuration store and these handlers no longer pass through to
6585    // the backend. The mapping between the s3s-typed
6586    // `InventoryConfiguration` and the inventory module's internal
6587    // `InventoryConfig` is intentionally lossy: only the fields S4
6588    // actually uses for periodic CSV emission survive the round trip
6589    // (id, source bucket, destination bucket / prefix, format, included
    // versions, schedule frequency). Optional fields, encryption, and
    // filter prefixes are accepted on PUT but not stored; GET renders
    // them in their default (absent) shape so the client still receives
    // a well-formed `InventoryConfiguration`.
6594    async fn put_bucket_inventory_configuration(
6595        &self,
6596        req: S3Request<PutBucketInventoryConfigurationInput>,
6597    ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
6598        if let Some(mgr) = self.inventory.as_ref() {
6599            let cfg = inv_from_dto(
6600                &req.input.bucket,
6601                &req.input.id,
6602                &req.input.inventory_configuration,
6603            );
6604            mgr.put(cfg);
6605            return Ok(S3Response::new(
6606                PutBucketInventoryConfigurationOutput::default(),
6607            ));
6608        }
6609        self.backend.put_bucket_inventory_configuration(req).await
6610    }
6611
6612    async fn get_bucket_inventory_configuration(
6613        &self,
6614        req: S3Request<GetBucketInventoryConfigurationInput>,
6615    ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
6616        if let Some(mgr) = self.inventory.as_ref() {
6617            let cfg = mgr.get(&req.input.bucket, &req.input.id);
6618            if let Some(cfg) = cfg {
6619                let out = GetBucketInventoryConfigurationOutput {
6620                    inventory_configuration: Some(inv_to_dto(&cfg)),
6621                };
6622                return Ok(S3Response::new(out));
6623            }
6624            // AWS returns `NoSuchConfiguration` (404) when the id has no
6625            // matching inventory configuration on the bucket. The
6626            // generated `S3ErrorCode` enum doesn't expose a typed variant
6627            // for this code, so we round-trip through `from_bytes` which
6628            // wraps unknown codes as `Custom(...)` (= the AWS-canonical
6629            // error-code string survives into the XML response envelope).
6630            let code =
6631                S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
6632            return Err(S3Error::with_message(
6633                code,
6634                format!(
6635                    "no inventory configuration with id={} on bucket={}",
6636                    req.input.id, req.input.bucket
6637                ),
6638            ));
6639        }
6640        self.backend.get_bucket_inventory_configuration(req).await
6641    }
6642
6643    async fn list_bucket_inventory_configurations(
6644        &self,
6645        req: S3Request<ListBucketInventoryConfigurationsInput>,
6646    ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
6647        if let Some(mgr) = self.inventory.as_ref() {
6648            let list = mgr.list_for_bucket(&req.input.bucket);
6649            let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
6650            let out = ListBucketInventoryConfigurationsOutput {
6651                continuation_token: req.input.continuation_token.clone(),
6652                inventory_configuration_list: if dto_list.is_empty() {
6653                    None
6654                } else {
6655                    Some(dto_list)
6656                },
6657                is_truncated: Some(false),
6658                next_continuation_token: None,
6659            };
6660            return Ok(S3Response::new(out));
6661        }
6662        self.backend.list_bucket_inventory_configurations(req).await
6663    }
6664
6665    async fn delete_bucket_inventory_configuration(
6666        &self,
6667        req: S3Request<DeleteBucketInventoryConfigurationInput>,
6668    ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
6669        if let Some(mgr) = self.inventory.as_ref() {
6670            mgr.delete(&req.input.bucket, &req.input.id);
6671            return Ok(S3Response::new(
6672                DeleteBucketInventoryConfigurationOutput::default(),
6673            ));
6674        }
6675        self.backend
6676            .delete_bucket_inventory_configuration(req)
6677            .await
6678    }
6679}
6680
6681// ---------------------------------------------------------------------------
6682// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
6683// surface) and our internal `crate::inventory::InventoryConfig`. Only the
6684// fields S4 actually uses for CSV emission survive the round trip; the
6685// missing fields (filter prefix, optional fields, encryption) are dropped on
6686// PUT and re-rendered as the AWS-default shape on GET so the client sees a
6687// well-formed `InventoryConfiguration`.
6688// ---------------------------------------------------------------------------
6689
6690fn inv_from_dto(
6691    bucket: &str,
6692    id: &str,
6693    dto: &InventoryConfiguration,
6694) -> crate::inventory::InventoryConfig {
6695    let frequency_hours = match dto.schedule.frequency.as_str() {
6696        "Weekly" => 24 * 7,
6697        // Daily is the default; anything S4 doesn't recognise (incl.
6698        // empty, which is the s3s-default) maps to Daily so the
6699        // operator's PUT doesn't silently turn into a no-op cadence.
6700        _ => 24,
6701    };
6702    // Parquet/ORC are not supported (issue #36 scope); we still accept
6703    // the PUT so callers don't fail-loud, but we record CSV and rely on
6704    // the operator catching the discrepancy on GET.
6705    let format = crate::inventory::InventoryFormat::Csv;
6706    crate::inventory::InventoryConfig {
6707        id: id.to_owned(),
6708        bucket: bucket.to_owned(),
6709        destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
6710        destination_prefix: dto
6711            .destination
6712            .s3_bucket_destination
6713            .prefix
6714            .clone()
6715            .unwrap_or_default(),
6716        frequency_hours,
6717        format,
6718        included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
6719            dto.included_object_versions.as_str(),
6720        ),
6721    }
6722}
6723
6724fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
6725    InventoryConfiguration {
6726        id: cfg.id.clone(),
6727        is_enabled: true,
6728        included_object_versions: InventoryIncludedObjectVersions::from(
6729            cfg.included_object_versions.as_aws_str().to_owned(),
6730        ),
6731        destination: InventoryDestination {
6732            s3_bucket_destination: InventoryS3BucketDestination {
6733                account_id: None,
6734                bucket: cfg.destination_bucket.clone(),
6735                encryption: None,
6736                format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
6737                prefix: if cfg.destination_prefix.is_empty() {
6738                    None
6739                } else {
6740                    Some(cfg.destination_prefix.clone())
6741                },
6742            },
6743        },
6744        schedule: InventorySchedule {
6745            // `frequency_hours == 168` -> Weekly; everything else maps to
6746            // Daily for the wire response (the manager keeps the precise
6747            // hour count internally for due-checking).
6748            frequency: InventoryFrequency::from(
6749                if cfg.frequency_hours == 24 * 7 {
6750                    "Weekly"
6751                } else {
6752                    "Daily"
6753                }
6754                .to_owned(),
6755            ),
6756        },
6757        filter: None,
6758        optional_fields: None,
6759    }
6760}
6761
6762// ---------------------------------------------------------------------------
6763// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
6764// wire surface) and our internal `crate::notifications::NotificationConfig`.
6765//
6766// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
6767// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
6768// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
6769// surfaces topic / queue rules.
6770//
6771// The webhook destination has no AWS-native wire form: operators configure
6772// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
6773// poking `NotificationManager::put` directly from a custom binary. This
6774// keeps the wire surface AWS-compatible while still letting the always-
6775// available `Webhook` destination be reachable.
6776// ---------------------------------------------------------------------------
6777
6778fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
6779    let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
6780    if let Some(topics) = dto.topic_configurations.as_ref() {
6781        for (idx, t) in topics.iter().enumerate() {
6782            let events = events_from_dto(&t.events);
6783            let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
6784            rules.push(crate::notifications::NotificationRule {
6785                id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
6786                events,
6787                destination: crate::notifications::Destination::Sns {
6788                    topic_arn: t.topic_arn.clone(),
6789                },
6790                filter_prefix: prefix,
6791                filter_suffix: suffix,
6792            });
6793        }
6794    }
6795    if let Some(queues) = dto.queue_configurations.as_ref() {
6796        for (idx, q) in queues.iter().enumerate() {
6797            let events = events_from_dto(&q.events);
6798            let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
6799            rules.push(crate::notifications::NotificationRule {
6800                id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
6801                events,
6802                destination: crate::notifications::Destination::Sqs {
6803                    queue_arn: q.queue_arn.clone(),
6804                },
6805                filter_prefix: prefix,
6806                filter_suffix: suffix,
6807            });
6808        }
6809    }
6810    crate::notifications::NotificationConfig { rules }
6811}
6812
6813fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
6814    let mut topics: Vec<TopicConfiguration> = Vec::new();
6815    let mut queues: Vec<QueueConfiguration> = Vec::new();
6816    for rule in &cfg.rules {
6817        let events: Vec<Event> = rule
6818            .events
6819            .iter()
6820            .map(|e| Event::from(e.as_aws_str().to_owned()))
6821            .collect();
6822        let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
6823        match &rule.destination {
6824            crate::notifications::Destination::Sns { topic_arn } => {
6825                topics.push(TopicConfiguration {
6826                    events,
6827                    filter,
6828                    id: Some(rule.id.clone()),
6829                    topic_arn: topic_arn.clone(),
6830                });
6831            }
6832            crate::notifications::Destination::Sqs { queue_arn } => {
6833                queues.push(QueueConfiguration {
6834                    events,
6835                    filter,
6836                    id: Some(rule.id.clone()),
6837                    queue_arn: queue_arn.clone(),
6838                });
6839            }
6840            // Webhook destinations have no AWS wire equivalent — they
6841            // round-trip through the JSON snapshot only. Skip them on the
6842            // GET surface (an SDK consumer wouldn't know what to do with
6843            // them anyway).
6844            crate::notifications::Destination::Webhook { .. } => {}
6845        }
6846    }
6847    NotificationConfiguration {
6848        event_bridge_configuration: None,
6849        lambda_function_configurations: None,
6850        queue_configurations: if queues.is_empty() {
6851            None
6852        } else {
6853            Some(queues)
6854        },
6855        topic_configurations: if topics.is_empty() {
6856            None
6857        } else {
6858            Some(topics)
6859        },
6860    }
6861}
6862
6863fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
6864    events
6865        .iter()
6866        .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
6867        .collect()
6868}
6869
6870fn filter_from_dto(
6871    f: Option<&NotificationConfigurationFilter>,
6872) -> (Option<String>, Option<String>) {
6873    let Some(f) = f else {
6874        return (None, None);
6875    };
6876    let Some(key) = f.key.as_ref() else {
6877        return (None, None);
6878    };
6879    let Some(rules) = key.filter_rules.as_ref() else {
6880        return (None, None);
6881    };
6882    let mut prefix = None;
6883    let mut suffix = None;
6884    for r in rules {
6885        let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
6886        let value = r.value.clone();
6887        match name.as_deref() {
6888            Some("prefix") => prefix = value,
6889            Some("suffix") => suffix = value,
6890            _ => {}
6891        }
6892    }
6893    (prefix, suffix)
6894}
6895
6896fn filter_to_dto(
6897    prefix: Option<&str>,
6898    suffix: Option<&str>,
6899) -> Option<NotificationConfigurationFilter> {
6900    if prefix.is_none() && suffix.is_none() {
6901        return None;
6902    }
6903    let mut rules: Vec<FilterRule> = Vec::new();
6904    if let Some(p) = prefix {
6905        rules.push(FilterRule {
6906            name: Some(FilterRuleName::from("prefix".to_owned())),
6907            value: Some(p.to_owned()),
6908        });
6909    }
6910    if let Some(s) = suffix {
6911        rules.push(FilterRule {
6912            name: Some(FilterRuleName::from("suffix".to_owned())),
6913            value: Some(s.to_owned()),
6914        });
6915    }
6916    Some(NotificationConfigurationFilter {
6917        key: Some(S3KeyFilter {
6918            filter_rules: Some(rules),
6919        }),
6920    })
6921}
6922
6923// ---------------------------------------------------------------------------
6924// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
6925// wire surface) and our internal `crate::replication::ReplicationConfig`.
6926// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
6927// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
6928// the matcher needs. Sub-blocks v0.6 #40 does not implement
6929// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
6930// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
6931// who set them on PUT see them silently dropped, mirroring "feature not
6932// supported in this release" semantics.
6933// ---------------------------------------------------------------------------
6934
6935fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
6936    let rules = dto
6937        .rules
6938        .iter()
6939        .enumerate()
6940        .map(|(idx, r)| {
6941            let id =
6942                r.id.as_ref()
6943                    .map(|s| s.as_str().to_owned())
6944                    .unwrap_or_else(|| format!("rule-{idx}"));
6945            let priority = r.priority.unwrap_or(0).max(0) as u32;
6946            let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
6947            let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
6948            let destination_bucket = r.destination.bucket.clone();
6949            let destination_storage_class = r
6950                .destination
6951                .storage_class
6952                .as_ref()
6953                .map(|s| s.as_str().to_owned());
6954            crate::replication::ReplicationRule {
6955                id,
6956                priority,
6957                status_enabled,
6958                filter,
6959                destination_bucket,
6960                destination_storage_class,
6961            }
6962        })
6963        .collect();
6964    crate::replication::ReplicationConfig {
6965        role: dto.role.clone(),
6966        rules,
6967    }
6968}
6969
6970fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
6971    let rules = cfg
6972        .rules
6973        .iter()
6974        .map(|r| {
6975            let status = if r.status_enabled {
6976                ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
6977            } else {
6978                ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
6979            };
6980            let destination = Destination {
6981                access_control_translation: None,
6982                account: None,
6983                bucket: r.destination_bucket.clone(),
6984                encryption_configuration: None,
6985                metrics: None,
6986                replication_time: None,
6987                storage_class: r
6988                    .destination_storage_class
6989                    .as_ref()
6990                    .map(|s| StorageClass::from(s.clone())),
6991            };
6992            let filter = Some(replication_filter_to_dto(&r.filter));
6993            ReplicationRule {
6994                delete_marker_replication: None,
6995                destination,
6996                existing_object_replication: None,
6997                filter,
6998                id: Some(r.id.clone()),
6999                prefix: None,
7000                priority: Some(r.priority as i32),
7001                source_selection_criteria: None,
7002                status,
7003            }
7004        })
7005        .collect();
7006    ReplicationConfiguration {
7007        role: cfg.role.clone(),
7008        rules,
7009    }
7010}
7011
7012fn replication_filter_from_dto(
7013    f: Option<&ReplicationRuleFilter>,
7014    rule_level_prefix: Option<&str>,
7015) -> crate::replication::ReplicationFilter {
7016    let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
7017    let mut tags: Vec<(String, String)> = Vec::new();
7018    if let Some(f) = f {
7019        if let Some(p) = f.prefix.as_ref()
7020            && prefix.is_none()
7021        {
7022            prefix = Some(p.clone());
7023        }
7024        if let Some(t) = f.tag.as_ref()
7025            && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7026        {
7027            tags.push((k.clone(), v.clone()));
7028        }
7029        if let Some(and) = f.and.as_ref() {
7030            if let Some(p) = and.prefix.as_ref()
7031                && prefix.is_none()
7032            {
7033                prefix = Some(p.clone());
7034            }
7035            if let Some(ts) = and.tags.as_ref() {
7036                for t in ts {
7037                    if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7038                        tags.push((k.clone(), v.clone()));
7039                    }
7040                }
7041            }
7042        }
7043    }
7044    crate::replication::ReplicationFilter { prefix, tags }
7045}
7046
7047fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
7048    if f.tags.is_empty() {
7049        ReplicationRuleFilter {
7050            and: None,
7051            prefix: f.prefix.clone(),
7052            tag: None,
7053        }
7054    } else if f.tags.len() == 1 && f.prefix.is_none() {
7055        let (k, v) = &f.tags[0];
7056        ReplicationRuleFilter {
7057            and: None,
7058            prefix: None,
7059            tag: Some(Tag {
7060                key: Some(k.clone()),
7061                value: Some(v.clone()),
7062            }),
7063        }
7064    } else {
7065        let tags: Vec<Tag> = f
7066            .tags
7067            .iter()
7068            .map(|(k, v)| Tag {
7069                key: Some(k.clone()),
7070                value: Some(v.clone()),
7071            })
7072            .collect();
7073        ReplicationRuleFilter {
7074            and: Some(ReplicationRuleAndOperator {
7075                prefix: f.prefix.clone(),
7076                tags: Some(tags),
7077            }),
7078            prefix: None,
7079            tag: None,
7080        }
7081    }
7082}
7083
7084// ---------------------------------------------------------------------------
7085// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
7086// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
7087// The internal representation flattens AWS's "Filter | And" disjunction
7088// into a single `LifecycleFilter` struct of optional fields plus a tag
7089// vector. Fields S4's evaluator does not consume
7090// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
7091// `transition_default_minimum_object_size`, the storage class on the
7092// noncurrent expiration) are dropped on PUT and re-rendered as their
7093// AWS-default shape on GET so the client always sees a well-formed
7094// configuration.
7095// ---------------------------------------------------------------------------
7096
7097fn dto_lifecycle_to_internal(
7098    dto: &BucketLifecycleConfiguration,
7099) -> crate::lifecycle::LifecycleConfig {
7100    crate::lifecycle::LifecycleConfig {
7101        rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
7102    }
7103}
7104
7105fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
7106    let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
7107    let filter = rule
7108        .filter
7109        .as_ref()
7110        .map(dto_filter_to_internal)
7111        .unwrap_or_default();
7112    let expiration_days = rule
7113        .expiration
7114        .as_ref()
7115        .and_then(|e| e.days)
7116        .and_then(|d| u32::try_from(d).ok());
7117    let expiration_date = rule
7118        .expiration
7119        .as_ref()
7120        .and_then(|e| e.date.as_ref())
7121        .and_then(timestamp_to_chrono_utc);
7122    let transitions: Vec<crate::lifecycle::TransitionRule> = rule
7123        .transitions
7124        .as_ref()
7125        .map(|ts| {
7126            ts.iter()
7127                .filter_map(|t| {
7128                    let days = u32::try_from(t.days?).ok()?;
7129                    let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
7130                    Some(crate::lifecycle::TransitionRule {
7131                        days,
7132                        storage_class,
7133                    })
7134                })
7135                .collect()
7136        })
7137        .unwrap_or_default();
7138    let noncurrent_version_expiration_days = rule
7139        .noncurrent_version_expiration
7140        .as_ref()
7141        .and_then(|n| n.noncurrent_days)
7142        .and_then(|d| u32::try_from(d).ok());
7143    let abort_incomplete_multipart_upload_days = rule
7144        .abort_incomplete_multipart_upload
7145        .as_ref()
7146        .and_then(|a| a.days_after_initiation)
7147        .and_then(|d| u32::try_from(d).ok());
7148    crate::lifecycle::LifecycleRule {
7149        id: rule.id.clone().unwrap_or_default(),
7150        status,
7151        filter,
7152        expiration_days,
7153        expiration_date,
7154        transitions,
7155        noncurrent_version_expiration_days,
7156        abort_incomplete_multipart_upload_days,
7157    }
7158}
7159
7160fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
7161    let mut prefix = filter.prefix.clone();
7162    let mut tags: Vec<(String, String)> = Vec::new();
7163    let mut size_gt: Option<u64> = filter
7164        .object_size_greater_than
7165        .and_then(|n| u64::try_from(n).ok());
7166    let mut size_lt: Option<u64> = filter
7167        .object_size_less_than
7168        .and_then(|n| u64::try_from(n).ok());
7169    if let Some(t) = &filter.tag
7170        && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7171    {
7172        tags.push((k.clone(), v.clone()));
7173    }
7174    if let Some(and) = &filter.and {
7175        if prefix.is_none() {
7176            prefix = and.prefix.clone();
7177        }
7178        if size_gt.is_none() {
7179            size_gt = and
7180                .object_size_greater_than
7181                .and_then(|n| u64::try_from(n).ok());
7182        }
7183        if size_lt.is_none() {
7184            size_lt = and
7185                .object_size_less_than
7186                .and_then(|n| u64::try_from(n).ok());
7187        }
7188        if let Some(ts) = &and.tags {
7189            for t in ts {
7190                if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7191                    tags.push((k.clone(), v.clone()));
7192                }
7193            }
7194        }
7195    }
7196    crate::lifecycle::LifecycleFilter {
7197        prefix,
7198        tags,
7199        object_size_greater_than: size_gt,
7200        object_size_less_than: size_lt,
7201    }
7202}
7203
7204fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
7205    let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
7206        Some(LifecycleExpiration {
7207            date: rule.expiration_date.map(chrono_utc_to_timestamp),
7208            days: rule.expiration_days.map(|d| d as i32),
7209            expired_object_delete_marker: None,
7210        })
7211    } else {
7212        None
7213    };
7214    let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
7215        None
7216    } else {
7217        Some(
7218            rule.transitions
7219                .iter()
7220                .map(|t| Transition {
7221                    date: None,
7222                    days: Some(t.days as i32),
7223                    storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
7224                })
7225                .collect(),
7226        )
7227    };
7228    let noncurrent_version_expiration =
7229        rule.noncurrent_version_expiration_days
7230            .map(|d| NoncurrentVersionExpiration {
7231                newer_noncurrent_versions: None,
7232                noncurrent_days: Some(d as i32),
7233            });
7234    let abort_incomplete_multipart_upload =
7235        rule.abort_incomplete_multipart_upload_days
7236            .map(|d| AbortIncompleteMultipartUpload {
7237                days_after_initiation: Some(d as i32),
7238            });
7239    let filter = if rule.filter.tags.is_empty()
7240        && rule.filter.object_size_greater_than.is_none()
7241        && rule.filter.object_size_less_than.is_none()
7242    {
7243        rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7244            and: None,
7245            object_size_greater_than: None,
7246            object_size_less_than: None,
7247            prefix: Some(p.clone()),
7248            tag: None,
7249        })
7250    } else if rule.filter.tags.len() == 1
7251        && rule.filter.prefix.is_none()
7252        && rule.filter.object_size_greater_than.is_none()
7253        && rule.filter.object_size_less_than.is_none()
7254    {
7255        let (k, v) = rule.filter.tags[0].clone();
7256        Some(LifecycleRuleFilter {
7257            and: None,
7258            object_size_greater_than: None,
7259            object_size_less_than: None,
7260            prefix: None,
7261            tag: Some(Tag {
7262                key: Some(k),
7263                value: Some(v),
7264            }),
7265        })
7266    } else {
7267        let tags = if rule.filter.tags.is_empty() {
7268            None
7269        } else {
7270            Some(
7271                rule.filter
7272                    .tags
7273                    .iter()
7274                    .map(|(k, v)| Tag {
7275                        key: Some(k.clone()),
7276                        value: Some(v.clone()),
7277                    })
7278                    .collect(),
7279            )
7280        };
7281        Some(LifecycleRuleFilter {
7282            and: Some(LifecycleRuleAndOperator {
7283                object_size_greater_than: rule
7284                    .filter
7285                    .object_size_greater_than
7286                    .and_then(|n| i64::try_from(n).ok()),
7287                object_size_less_than: rule
7288                    .filter
7289                    .object_size_less_than
7290                    .and_then(|n| i64::try_from(n).ok()),
7291                prefix: rule.filter.prefix.clone(),
7292                tags,
7293            }),
7294            object_size_greater_than: None,
7295            object_size_less_than: None,
7296            prefix: None,
7297            tag: None,
7298        })
7299    };
7300    LifecycleRule {
7301        abort_incomplete_multipart_upload,
7302        expiration,
7303        filter,
7304        id: if rule.id.is_empty() {
7305            None
7306        } else {
7307            Some(rule.id.clone())
7308        },
7309        noncurrent_version_expiration,
7310        noncurrent_version_transitions: None,
7311        prefix: None,
7312        status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7313        transitions,
7314    }
7315}
7316
7317// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7318// `chrono_utc_to_timestamp` are defined earlier in this file for the
7319// tagging/notifications work; the lifecycle DTO converters reuse them.)
7320
7321// ---------------------------------------------------------------------------
7322// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7323//
7324// Kept as a self-contained block at the bottom of the file so it doesn't
7325// touch the existing `S4Service` struct, `new()`, or any of the per-op
7326// handlers above. The hook is wired in by the binary at server-build time
7327// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7328//
7329// Lifecycle:
7330//   1. `SigV4aGate::new(store)` is constructed once at boot from the
7331//      operator-supplied credential directory.
//   2. For each incoming request, `SigV4aGate::pre_route(&req,
//      &requested_region, &canonical_request_bytes)` is invoked BEFORE
//      the request hits the S3 framework. If the request claims SigV4a
//      and verifies, control returns to the framework. Otherwise the
//      request is rejected with the status and error code reported by
//      `SigV4aGateError` (403 `SignatureDoesNotMatch` in the common
//      case; see `s3_error_code` / `http_status` for the others).
7337//   3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7338// ---------------------------------------------------------------------------
7339
/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
///
/// Wraps a [`crate::sigv4a::SharedSigV4aCredentialStore`] and exposes a
/// single `pre_route` entry point that returns `Ok(())` for both
/// "request is plain SigV4 — pass through" and "request is SigV4a and
/// verified", and an `Err(SigV4aGateError)` otherwise. The error's
/// `s3_error_code` / `http_status` give the AWS error code and HTTP
/// status (403, or 400 for malformed requests) to send back.
/// The type derives `Clone`; per the original author's note the inner
/// store is `Arc`-backed, so clones are cheap.
///
/// v0.8.4 #76 (audit H-6): the gate now enforces an `x-amz-date`
/// freshness window (default 15 min, AWS-spec) and a strict credential
/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`), shutting the
/// captured-request replay vector — previously a stolen valid SigV4a
/// signature could be replayed indefinitely (including DELETE).
#[derive(Debug, Clone)]
pub struct SigV4aGate {
    /// Credential store consulted to resolve the access-key-id parsed
    /// from the request's `Authorization` header.
    store: crate::sigv4a::SharedSigV4aCredentialStore,
    /// v0.8.4 #76: how far the request's `x-amz-date` may drift from
    /// the server's clock before being rejected with 403
    /// `RequestTimeTooSkewed`. Matches the AWS S3 spec default of
    /// 15 min when constructed via [`SigV4aGate::new`]; the operator
    /// can override via [`SigV4aGate::with_skew_tolerance`] (CLI flag
    /// `--sigv4a-skew-tolerance-seconds`).
    skew_tolerance: chrono::Duration,
}
7364
7365impl SigV4aGate {
7366    /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7367    pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
7368
7369    #[must_use]
7370    pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
7371        Self {
7372            store,
7373            skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
7374        }
7375    }
7376
7377    /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
7378    /// 15 min). Operators can widen this for high-clock-drift
7379    /// environments or tighten it for compliance regimes that demand
7380    /// stricter freshness.
7381    #[must_use]
7382    pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
7383        self.skew_tolerance = skew;
7384        self
7385    }
7386
7387    /// Read the configured skew tolerance — exposed mostly for test +
7388    /// observability use.
7389    #[must_use]
7390    pub fn skew_tolerance(&self) -> chrono::Duration {
7391        self.skew_tolerance
7392    }
7393
7394    /// Inspect an incoming HTTP request. Behaviour:
7395    ///
7396    /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
7397    ///   prefix) → returns `Ok(())`; the framework's existing SigV4
7398    ///   path handles the request.
7399    /// - SigV4a + valid signature + region match + fresh x-amz-date
7400    ///   → `Ok(())`.
7401    /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
7402    /// - SigV4a + bad signature / region mismatch → `Err` with
7403    ///   `SignatureDoesNotMatch`.
7404    /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
7405    ///   the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
7406    ///   et al.).
7407    ///
7408    /// `canonical_request_bytes` is the SigV4a string-to-sign (or
7409    /// canonical-request bytes; the caller decides) that the framework
7410    /// has already produced for this request. Keeping it as a parameter
7411    /// instead of rebuilding it inside the hook avoids duplicating the
7412    /// canonicalisation logic.
7413    pub fn pre_route<B>(
7414        &self,
7415        req: &http::Request<B>,
7416        requested_region: &str,
7417        canonical_request_bytes: &[u8],
7418    ) -> Result<(), SigV4aGateError> {
7419        self.pre_route_at(
7420            req,
7421            requested_region,
7422            canonical_request_bytes,
7423            chrono::Utc::now(),
7424        )
7425    }
7426
7427    /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
7428    /// tests that need to pin the freshness clock. Production callers
7429    /// use `pre_route` (which calls `chrono::Utc::now()`).
7430    pub fn pre_route_at<B>(
7431        &self,
7432        req: &http::Request<B>,
7433        requested_region: &str,
7434        canonical_request_bytes: &[u8],
7435        now: chrono::DateTime<chrono::Utc>,
7436    ) -> Result<(), SigV4aGateError> {
7437        if !crate::sigv4a::detect(req) {
7438            return Ok(());
7439        }
7440        let auth_hdr = req
7441            .headers()
7442            .get(http::header::AUTHORIZATION)
7443            .and_then(|v| v.to_str().ok())
7444            .ok_or(SigV4aGateError::MissingAuthorization)?;
7445        let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
7446            .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
7447        let region_set = req
7448            .headers()
7449            .get(crate::sigv4a::REGION_SET_HEADER)
7450            .and_then(|v| v.to_str().ok())
7451            .unwrap_or("*");
7452        let key = self
7453            .store
7454            .get(&parsed.access_key_id)
7455            .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
7456        // v0.8.4 #76: snapshot the request headers into a
7457        // lowercase-keyed flat map so `verify_request` can do the
7458        // x-amz-date freshness checks without taking a generic
7459        // `HeaderMap` dep. Cheap because the headers list is tiny.
7460        //
7461        // v0.8.5 #84 (audit H-4): detect duplicate header names while
7462        // we flatten — `HashMap::insert` would silently overwrite the
7463        // first value with the second, mirroring the auth-confusion
7464        // vector the canonical-request builder also defends against.
7465        // Reject upfront so the rest of the gate (freshness check,
7466        // ECDSA verify) never sees a half-truncated header set. We
7467        // detect by checking `contains_key` *before* insertion rather
7468        // than by counting via `headers().get_all`, because the
7469        // upstream `HeaderMap` iteration yields each duplicate entry
7470        // as its own (name, value) pair — the second-seen entry is
7471        // exactly what `contains_key` traps.
7472        let mut header_map: std::collections::HashMap<String, String> =
7473            std::collections::HashMap::with_capacity(req.headers().len());
7474        for (name, value) in req.headers() {
7475            if let Ok(v) = value.to_str() {
7476                let lower = name.as_str().to_ascii_lowercase();
7477                if header_map.contains_key(&lower) {
7478                    return Err(SigV4aGateError::Verify(
7479                        crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
7480                    ));
7481                }
7482                header_map.insert(lower, v.to_string());
7483            }
7484        }
7485        crate::sigv4a::verify_request(
7486            &parsed,
7487            &header_map,
7488            canonical_request_bytes,
7489            key,
7490            region_set,
7491            requested_region,
7492            now,
7493            self.skew_tolerance,
7494        )
7495        .map_err(SigV4aGateError::Verify)?;
7496        Ok(())
7497    }
7498}
7499
/// Failure modes from [`SigV4aGate::pre_route`].
///
/// Each variant maps to an AWS-standard error code
/// (`InvalidAccessKeyId`, `SignatureDoesNotMatch`,
/// `RequestTimeTooSkewed` or `InvalidRequest`) via
/// [`SigV4aGateError::s3_error_code`], and to an HTTP status via
/// [`SigV4aGateError::http_status`]: 400 for malformed-request
/// verification failures, 403 for everything else.
#[derive(Debug, thiserror::Error)]
pub enum SigV4aGateError {
    /// The request was detected as SigV4a but carries no `Authorization`
    /// header that can be read as a string.
    #[error("missing Authorization header")]
    MissingAuthorization,
    /// The `Authorization` header could not be parsed as SigV4a.
    #[error("malformed SigV4a Authorization header")]
    MalformedAuthorization,
    /// The credential store has no entry for the access-key-id (carried
    /// in the payload).
    #[error("unknown SigV4a access-key-id: {0}")]
    UnknownAccessKey(String),
    /// The gate's duplicate-header check or `crate::sigv4a::verify_request`
    /// rejected the request; the underlying error is kept as the source.
    #[error("SigV4a verification failed: {0}")]
    Verify(#[source] crate::sigv4a::SigV4aError),
}
7515
7516impl SigV4aGateError {
7517    /// AWS S3 error code that should accompany the response.
7518    ///
7519    /// v0.8.4 #76 (audit H-6): the freshness check surfaces
7520    /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
7521    /// failures surface as `InvalidRequest` (400); other failures stay
7522    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
7523    /// surface stays AWS-compatible.
7524    #[must_use]
7525    pub fn s3_error_code(&self) -> &'static str {
7526        match self {
7527            Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
7528            Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
7529                "RequestTimeTooSkewed"
7530            }
7531            Self::Verify(
7532                crate::sigv4a::SigV4aError::MissingXAmzDate
7533                | crate::sigv4a::SigV4aError::InvalidDateFormat
7534                | crate::sigv4a::SigV4aError::DateScopeMismatch
7535                | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7536                | crate::sigv4a::SigV4aError::InvalidTerminator
7537                | crate::sigv4a::SigV4aError::WrongService { .. }
7538                | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7539            ) => "InvalidRequest",
7540            _ => "SignatureDoesNotMatch",
7541        }
7542    }
7543
7544    /// HTTP status code to accompany the response. v0.8.4 #76: format
7545    /// errors that are clearly client mistakes (missing / malformed
7546    /// `x-amz-date`, malformed credential scope, wrong service) are
7547    /// surfaced as 400 InvalidRequest; the rest stay 403.
7548    #[must_use]
7549    pub fn http_status(&self) -> http::StatusCode {
7550        match self {
7551            Self::Verify(
7552                crate::sigv4a::SigV4aError::MissingXAmzDate
7553                | crate::sigv4a::SigV4aError::InvalidDateFormat
7554                | crate::sigv4a::SigV4aError::DateScopeMismatch
7555                | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7556                | crate::sigv4a::SigV4aError::InvalidTerminator
7557                | crate::sigv4a::SigV4aError::WrongService { .. }
7558                | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7559            ) => http::StatusCode::BAD_REQUEST,
7560            _ => http::StatusCode::FORBIDDEN,
7561        }
7562    }
7563}
7564
7565#[cfg(test)]
7566mod tests {
7567    use super::*;
7568
7569    #[test]
7570    fn manifest_roundtrip_via_metadata() {
7571        let original = ChunkManifest {
7572            codec: CodecKind::CpuZstd,
7573            original_size: 1234,
7574            compressed_size: 567,
7575            crc32c: 0xdead_beef,
7576        };
7577        let mut meta: Option<Metadata> = None;
7578        write_manifest(&mut meta, &original);
7579        let extracted = extract_manifest(&meta).expect("manifest must round-trip");
7580        assert_eq!(extracted.codec, original.codec);
7581        assert_eq!(extracted.original_size, original.original_size);
7582        assert_eq!(extracted.compressed_size, original.compressed_size);
7583        assert_eq!(extracted.crc32c, original.crc32c);
7584    }
7585
7586    #[test]
7587    fn missing_metadata_yields_none() {
7588        let meta: Option<Metadata> = None;
7589        assert!(extract_manifest(&meta).is_none());
7590    }
7591
7592    #[test]
7593    fn partial_metadata_yields_none() {
7594        let mut meta = Metadata::new();
7595        meta.insert(META_CODEC.into(), "cpu-zstd".into());
7596        let opt = Some(meta);
7597        assert!(extract_manifest(&opt).is_none());
7598    }
7599
7600    #[test]
7601    fn parse_copy_source_range_basic() {
7602        let r = parse_copy_source_range("bytes=10-20").unwrap();
7603        match r {
7604            s3s::dto::Range::Int { first, last } => {
7605                assert_eq!(first, 10);
7606                assert_eq!(last, Some(20));
7607            }
7608            _ => panic!("expected Int range"),
7609        }
7610    }
7611
7612    #[test]
7613    fn parse_copy_source_range_rejects_inverted() {
7614        let err = parse_copy_source_range("bytes=20-10").unwrap_err();
7615        assert!(err.contains("last < first"));
7616    }
7617
7618    #[test]
7619    fn parse_copy_source_range_rejects_missing_prefix() {
7620        let err = parse_copy_source_range("10-20").unwrap_err();
7621        assert!(err.contains("must start with 'bytes='"));
7622    }
7623
7624    #[test]
7625    fn parse_copy_source_range_rejects_open_ended() {
7626        // S3 upload_part_copy spec requires N-M (closed); suffix and
7627        // open-ended forms are not allowed for this header.
7628        assert!(parse_copy_source_range("bytes=10-").is_err());
7629        assert!(parse_copy_source_range("bytes=-10").is_err());
7630    }
7631
7632    // v0.7 #49: safe_object_uri must round-trip every legal S3 key
7633    // (which includes spaces, slashes, control chars, raw UTF-8) into
7634    // a parseable `http::Uri` instead of panicking like the previous
7635    // `format!(...).parse().unwrap()` call sites did.
7636
7637    #[test]
7638    fn safe_object_uri_basic_ascii() {
7639        let uri = safe_object_uri("bucket", "key").expect("ascii must be safe");
7640        assert_eq!(uri.path(), "/bucket/key");
7641    }
7642
7643    #[test]
7644    fn safe_object_uri_encodes_spaces() {
7645        let uri = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
7646        // RFC 3986 path-segment encoding turns ' ' into %20.
7647        assert!(
7648            uri.path().contains("%20"),
7649            "expected percent-encoded space, got {}",
7650            uri.path()
7651        );
7652        assert!(uri.path().starts_with("/bucket/"));
7653    }
7654
7655    #[test]
7656    fn safe_object_uri_preserves_slashes() {
7657        // S3 keys legally contain '/' as a logical path separator —
7658        // the helper must NOT escape it (otherwise the synthetic URI
7659        // changes the perceived hierarchy).
7660        let uri = safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
7661        assert_eq!(uri.path(), "/bucket/key/with/slashes");
7662    }
7663
7664    #[test]
7665    fn safe_object_uri_handles_newline_without_panic() {
7666        // Newlines are control chars in URIs; whether the result is
7667        // Ok (encoded as %0A) or Err (parse rejects), the helper
7668        // MUST NOT panic. Either outcome is acceptable.
7669        let _ = safe_object_uri("bucket", "key\n");
7670    }
7671
7672    #[test]
7673    fn safe_object_uri_handles_null_byte_without_panic() {
7674        let _ = safe_object_uri("bucket", "key\0bad");
7675    }
7676
7677    #[test]
7678    fn safe_object_uri_handles_unicode_without_panic() {
7679        // RTL override, BOM, plain Japanese — none should panic.
7680        let _ = safe_object_uri("bucket", "rtl\u{202E}override");
7681        let _ = safe_object_uri("bucket", "\u{FEFF}bom-key");
7682        let _ = safe_object_uri("bucket", "日本語キー");
7683    }
7684
7685    #[test]
7686    fn safe_object_uri_no_panic_for_every_byte() {
7687        // Exhaustive byte coverage: 0x00..=0xFF as a 1-byte key.
7688        // None of these may panic. (0x80..=0xFF are not valid UTF-8
7689        // by themselves; we go through `String::from_utf8_lossy` so
7690        // the helper sees a real `&str` regardless of the raw byte.)
7691        for b in 0u8..=255 {
7692            let s = String::from_utf8_lossy(&[b]).into_owned();
7693            let _ = safe_object_uri("bucket", &s);
7694        }
7695    }
7696
7697    /// v0.8.1 #58: smoke test for the DEK-handling shape used by the
7698    /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
7699    /// Mirrors the call pattern (generate_dek → length check → copy
7700    /// into stack `[u8; 32]` → reborrow as `&[u8; 32]` for `SseSource`)
7701    /// without spinning up a full `S4Service`.
7702    ///
7703    /// The real assertion this guards against is a regression where
7704    /// the `Zeroizing` wrapper is accidentally dropped before the
7705    /// stack copy lands (e.g. someone refactors to use
7706    /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
7707    /// or where `&**dek` is rewritten in a way that doesn't compile.
7708    #[tokio::test]
7709    async fn kms_dek_lifetime_within_function_scope() {
7710        use crate::kms::{KmsBackend, LocalKms};
7711        use std::collections::HashMap;
7712        use std::path::PathBuf;
7713        use zeroize::Zeroizing;
7714
7715        let mut keks = HashMap::new();
7716        keks.insert("scope".to_string(), [33u8; 32]);
7717        let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);
7718
7719        // Mirror the put_object KMS branch shape exactly.
7720        let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
7721        assert_eq!(dek.len(), 32);
7722        let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
7723        dek_arr.copy_from_slice(&dek);
7724
7725        // The reborrow used at the SseSource construction site —
7726        // mirrors the call-site pattern where `let dek_ref: &[u8; 32]`
7727        // auto-derefs from a `Zeroizing<[u8; 32]>` reference.
7728        let dek_ref: &[u8; 32] = &dek_arr;
7729        // Sanity: the reborrow points at the same bytes.
7730        assert_eq!(dek_ref, &*dek_arr);
7731        // Wrapped key id flows through unchanged.
7732        assert_eq!(wrapped.key_id, "scope");
7733
7734        // At end of scope, both `dek` (Zeroizing<Vec<u8>>) and
7735        // `dek_arr` (Zeroizing<[u8; 32]>) are dropped, wiping the
7736        // backing memory. Cannot directly assert the wipe (would be
7737        // UB to read freed memory), so this test instead enforces
7738        // that the call shape compiles and executes; the wipe itself
7739        // is exercised by the `zeroize` crate's own test suite.
7740    }
7741
7742    /// v0.8.5 #86 (audit M-2): the replication dispatcher must
7743    /// `acquire_owned()` a permit from `replication_semaphore` before
7744    /// kicking off the destination PUT, so a saturated semaphore
7745    /// back-pressures the in-flight queue depth instead of letting it
7746    /// grow without bound. We exercise the field directly (initial
7747    /// permit count, override via `with_replication_max_concurrent`,
7748    /// permit drop on `Drop`) — the full `spawn_replication_if_matched`
7749    /// integration is exercised by the existing replication tests in
7750    /// `tests/feature_e2e.rs` once a `ReplicationManager` is attached.
7751    #[tokio::test]
7752    async fn replication_semaphore_caps_concurrent_dispatchers() {
7753        // Build a minimal `S4Service` directly — no handler path is
7754        // exercised, only the constructor + setter + accessor shape.
7755        let registry = Arc::new(
7756            CodecRegistry::new(CodecKind::Passthrough)
7757                .with(Arc::new(s4_codec::passthrough::Passthrough)),
7758        );
7759        let dispatcher = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
7760            CodecKind::Passthrough,
7761        ));
7762        let s4 = S4Service::new(NoopBackend, registry, dispatcher);
7763
7764        // Default cap matches the documented constant.
7765        assert_eq!(
7766            s4.replication_semaphore().available_permits(),
7767            S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
7768            "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
7769        );
7770
7771        // Override via the builder — replaces the underlying `Semaphore`.
7772        let s4 = s4.with_replication_max_concurrent(2);
7773        assert_eq!(
7774            s4.replication_semaphore().available_permits(),
7775            2,
7776            "with_replication_max_concurrent(2) must expose exactly 2 permits"
7777        );
7778
7779        // Acquiring permits must reduce `available_permits()` and
7780        // dropping them must restore the count — this is the contract
7781        // `spawn_replication_if_matched` relies on for back-pressure.
7782        let sem = Arc::clone(s4.replication_semaphore());
7783        let p1 = sem.clone().acquire_owned().await.expect("permit 1");
7784        let p2 = sem.clone().acquire_owned().await.expect("permit 2");
7785        assert_eq!(
7786            sem.available_permits(),
7787            0,
7788            "two acquired permits must zero `available_permits()`"
7789        );
7790        // A third `try_acquire_owned` must fail — the cap is enforced
7791        // synchronously, no extra spawn slips through.
7792        assert!(
7793            sem.clone().try_acquire_owned().is_err(),
7794            "third acquire must back-pressure: cap was 2"
7795        );
7796        drop(p1);
7797        drop(p2);
7798        assert_eq!(
7799            sem.available_permits(),
7800            2,
7801            "dropping permits must restore cap"
7802        );
7803
7804        // Lower-bound clamp: a 0 cap would deadlock all dispatchers,
7805        // so the setter clamps it to 1 instead of accepting it
7806        // (callers are warned in the CLI doc).
7807        let s4 = s4.with_replication_max_concurrent(0);
7808        assert_eq!(
7809            s4.replication_semaphore().available_permits(),
7810            1,
7811            "cap=0 must be clamped to 1 to avoid total deadlock"
7812        );
7813    }
7814
7815    /// v0.8.5 #86 (audit M-1): the access-log flusher must return a
7816    /// `JoinHandle<()>` that the caller can `abort()` on shutdown
7817    /// without leaving a dangling task. The pre-#86 call site dropped
7818    /// the handle at end-of-block (silently detaching it); the fix is
7819    /// hoisting it into a process-lived `Vec` so the graceful-shutdown
7820    /// branch in `main.rs` can wait for clean exit. This test exercises
7821    /// the `JoinHandle.abort()` shape directly so a future refactor that
7822    /// stops returning the handle (or returns a non-abortable wrapper)
7823    /// trips this regression guard.
7824    #[tokio::test]
7825    async fn flusher_handle_can_be_aborted_cleanly() {
7826        // Stand up a minimal `AccessLog` pointing at a tmp dir so the
7827        // flusher's `create_dir_all` succeeds. The dir is cleaned up
7828        // by the OS / test harness; we don't assert on the contents.
7829        let tmp = std::env::temp_dir().join(format!(
7830            "s4-86-flusher-{}-{}",
7831            std::process::id(),
7832            std::time::SystemTime::now()
7833                .duration_since(std::time::UNIX_EPOCH)
7834                .map(|d| d.as_nanos())
7835                .unwrap_or(0)
7836        ));
7837        let dest = crate::access_log::AccessLogDest { dir: tmp.clone() };
7838        let log = crate::access_log::AccessLog::new(dest);
7839        let handle = log.spawn_flusher(None);
7840        assert!(
7841            !handle.is_finished(),
7842            "freshly-spawned flusher must not yet be finished"
7843        );
7844        handle.abort();
7845        // `await`-ing an aborted handle returns `Err(JoinError)` whose
7846        // `is_cancelled()` is true.
7847        let join_result = handle.await;
7848        assert!(
7849            join_result.is_err(),
7850            "aborted flusher must surface JoinError, got Ok"
7851        );
7852        assert!(
7853            join_result.unwrap_err().is_cancelled(),
7854            "JoinError must report .is_cancelled() = true after abort()"
7855        );
7856        let _ = std::fs::remove_dir_all(&tmp);
7857    }
7858
    /// Stub backend for unit tests that only need *some* `B: S3` to
    /// construct an `S4Service` (currently
    /// `replication_semaphore_caps_concurrent_dispatchers`). Those
    /// tests exercise builder / accessor shape only and never invoke
    /// a handler.
    ///
    /// The `impl` below overrides nothing, so every `S3` operation
    /// uses the trait's provided default, which `s3s` implements as
    /// a `NotImplemented` error.
    struct NoopBackend;

    // Empty on purpose: all methods come from the trait defaults.
    #[async_trait::async_trait]
    impl S3 for NoopBackend {}
7868
7869    /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the
7870    /// dispatcher spawn site must intercept a panicking inner future,
7871    /// log at ERROR, and bump the per-kind counter — instead of letting
7872    /// the panic propagate as a `JoinError` that no operator dashboard
7873    /// scrapes. We exercise the wrapper directly (rather than driving a
7874    /// full `spawn_replication_if_matched` end-to-end, which would
7875    /// require a full `S4Service` + backend) because the wrapper shape
7876    /// is the load-bearing piece — any inner-future swap would still
7877    /// route through the same `AssertUnwindSafe(...).catch_unwind()`
7878    /// closure we want to lock in here.
7879    #[tokio::test]
7880    async fn dispatcher_panic_caught_and_metric_bumped() {
7881        use futures::FutureExt as _;
7882
7883        let handle = crate::metrics::test_metrics_handle();
7884        let kind = "replication";
7885
7886        // Mirror the production wrapper shape verbatim — if the
7887        // production code ever stops using `AssertUnwindSafe.catch_unwind`
7888        // this test shouldn't keep passing on a hand-rolled copy that
7889        // diverged.
7890        let panicking = async {
7891            panic!("simulated dispatcher panic");
7892        };
7893        let result = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
7894        assert!(
7895            result.is_err(),
7896            "catch_unwind must surface the panic instead of swallowing it"
7897        );
7898        // Bump the production counter via the same helper the wrapper
7899        // calls so the rendered output gates on the production code
7900        // path, not a parallel bookkeeping copy.
7901        crate::metrics::record_dispatcher_panic(kind);
7902
7903        let rendered = handle.render();
7904        assert!(
7905            rendered.contains("s4_dispatcher_panics_total"),
7906            "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
7907        );
7908        assert!(
7909            rendered.contains("kind=\"replication\""),
7910            "expected kind=\"replication\" label in metrics output, got: {rendered}"
7911        );
7912    }
7913}