s4_server/service.rs
1//! `s3s::S3` 実装 — `s3s_aws::Proxy` への delegation を default にしつつ、
2//! `put_object` / `get_object` 経路で `s4_codec::CodecRegistry` を呼ぶ。
3//!
4//! ## カバー範囲 (Phase 1 月 2)
5//!
6//! - 圧縮 hook あり: `put_object`, `get_object`
7//! - 純 delegation (圧縮なし): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
8//! `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
9//! `list_objects_v2`, `create_multipart_upload`, `upload_part`,
10//! `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
11//! `list_parts`
12//! - 未対応 (デフォルトで NotImplemented): その他 80+ ops (Tagging / ACL / Lifecycle 等は Phase 2)
13//!
14//! ## アーキテクチャ
15//!
16//! - `S4Service<B>` は backend (B: S3) と `Arc<CodecRegistry>` と `Arc<dyn CodecDispatcher>`
17//! を保持する。`CodecRegistry` 経由で複数 codec を抱えられるので、ひとつの S4 インスタンスが
18//! 複数 codec で書かれた object を透過的に GET できる
19//! - PUT: dispatcher が body の先頭 sample から codec を選び、registry で compress、
20//! manifest を S3 metadata に書いて backend に forward
21//! - GET: backend から取得 → metadata から manifest を復元 → registry.decompress で
22//! manifest 指定の codec で解凍 → 元の bytes を return
23//!
24//! ## 既知の制限事項
25//!
26//! - **Multipart Upload は per-part 圧縮が未実装**: 現状は upload_part を素通し。
27//! Phase 1 月 2 後半で per-part compress + complete_multipart_upload で manifest 集約。
28//! - **PUT body は memory に collect**: max_body_bytes 上限あり (default 5 GiB = S3 単発 PUT 上限)。
29//! Streaming-aware 圧縮は Phase 2。
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39 FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40 write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47 bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50 Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51 pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52 supports_streaming_decompress,
53};
54
/// Maximum number of bytes taken from the head of a PUT body when sampling it.
const SAMPLE_BYTES: usize = 4096;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66 if let Some(secs) = tel.gpu_seconds {
67 crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68 }
69 if tel.oom {
70 crate::metrics::record_gpu_oom(tel.codec);
71 }
72}
73
/// v0.7 #49: percent-encoding set applied by [`safe_object_uri`] to both the
/// bucket name and the object key.
///
/// The set is `percent_encoding::CONTROLS` extended with exactly these ASCII
/// characters: space, `"`, `#`, `<`, `>`, `?`, `` ` ``, `{`, `}`, `|`, `\`,
/// `^`, `[`, `]` and `%`.
///
/// NOTE(review): this is narrower than "everything outside the RFC 3986 §2.3
/// `unreserved` set". Characters such as `!`, `$`, `&`, `'`, `(`, `)`, `*`,
/// `+`, `,`, `;`, `=`, `:` and `@` are not in the set and are emitted
/// verbatim. The listed characters were presumably chosen because `http::Uri`
/// rejects or re-interprets them inside a path (`?` and `#` would start the
/// query / fragment) — confirm against the `http` crate before extending.
///
/// `%` is included so a literal percent sign in a key is escaped instead of
/// being read back as the start of an escape sequence. `/` is deliberately
/// left un-encoded because S3 keys legally use `/` as a logical separator and
/// the synthetic URI relies on the `/{bucket}/{key}` path layout.
const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
    .add(b' ')
    .add(b'"')
    .add(b'#')
    .add(b'<')
    .add(b'>')
    .add(b'?')
    .add(b'`')
    .add(b'{')
    .add(b'}')
    .add(b'|')
    .add(b'\\')
    .add(b'^')
    .add(b'[')
    .add(b']')
    .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110 use percent_encoding::utf8_percent_encode;
111 let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112 let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113 let raw = format!("/{bucket_enc}/{key_enc}");
114 raw.parse::<http::Uri>().map_err(|e| {
115 // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116 // can't be represented in a request URI. The generated
117 // `S3ErrorCode` enum doesn't expose a typed variant for it,
118 // so we round-trip through `from_bytes` which preserves the
119 // canonical wire string while falling back to InvalidArgument
120 // if even that lookup fails (cannot happen at runtime — kept
121 // as a belt-and-suspenders branch so this helper never
122 // panics).
123 let code =
124 S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125 S3Error::with_message(
126 code,
127 format!("object key cannot be encoded as a request URI: {e}"),
128 )
129 })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150 body: &[u8],
151 content_md5_b64: Option<&str>,
152 checksum_crc32_b64: Option<&str>,
153 checksum_crc32c_b64: Option<&str>,
154 checksum_sha1_b64: Option<&str>,
155 checksum_sha256_b64: Option<&str>,
156 checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158 use base64::Engine as _;
159 use md5::Md5;
160 use sha2::Sha256;
161 // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162 // trait methods into scope. Bind anonymously so this `use` is
163 // never flagged as unused while still serving its real purpose.
164 use md5::Digest as _;
165 let b64 = base64::engine::general_purpose::STANDARD;
166 let bad = |what: &str| {
167 let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168 S3Error::with_message(
169 code,
170 format!("client-supplied {what} did not match the received body"),
171 )
172 };
173 if let Some(claimed) = content_md5_b64 {
174 let want = b64.decode(claimed).map_err(|_| {
175 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176 })?;
177 if want.len() != 16 {
178 return Err(S3Error::with_message(
179 S3ErrorCode::InvalidDigest,
180 "Content-MD5 must decode to 16 bytes",
181 ));
182 }
183 let mut h = Md5::new();
184 h.update(body);
185 let got = h.finalize();
186 // `subtle::ConstantTimeEq` would be ideal but the existing
187 // `constant_time_eq` helper in sse.rs is private; use a
188 // straightforward byte compare. The attacker doesn't get to
189 // choose the body retroactively, so a timing oracle here
190 // doesn't help them. `&got[..]` derefs the GenericArray
191 // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192 // generic-array 1.x; CI runs `-D warnings`).
193 if got[..] != *want.as_slice() {
194 return Err(bad("Content-MD5"));
195 }
196 }
197 if let Some(claimed) = checksum_crc32c_b64 {
198 let want = b64.decode(claimed).map_err(|_| {
199 S3Error::with_message(
200 S3ErrorCode::InvalidDigest,
201 "malformed x-amz-checksum-crc32c",
202 )
203 })?;
204 if want.len() != 4 {
205 return Err(S3Error::with_message(
206 S3ErrorCode::InvalidDigest,
207 "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208 ));
209 }
210 let got = crc32c::crc32c(body).to_be_bytes();
211 if got != want.as_slice() {
212 return Err(bad("x-amz-checksum-crc32c"));
213 }
214 }
215 if let Some(claimed) = checksum_sha256_b64 {
216 let want = b64.decode(claimed).map_err(|_| {
217 S3Error::with_message(
218 S3ErrorCode::InvalidDigest,
219 "malformed x-amz-checksum-sha256",
220 )
221 })?;
222 if want.len() != 32 {
223 return Err(S3Error::with_message(
224 S3ErrorCode::InvalidDigest,
225 "x-amz-checksum-sha256 must decode to 32 bytes",
226 ));
227 }
228 let mut h = Sha256::new();
229 h.update(body);
230 let got = h.finalize();
231 if got[..] != *want.as_slice() {
232 return Err(bad("x-amz-checksum-sha256"));
233 }
234 }
235 // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236 // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237 // big-endian value, base64-encoded.
238 if let Some(claimed) = checksum_crc32_b64 {
239 let want = b64.decode(claimed).map_err(|_| {
240 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241 })?;
242 if want.len() != 4 {
243 return Err(S3Error::with_message(
244 S3ErrorCode::InvalidDigest,
245 "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246 ));
247 }
248 let mut h = crc32fast::Hasher::new();
249 h.update(body);
250 let got = h.finalize().to_be_bytes();
251 if got != want.as_slice() {
252 return Err(bad("x-amz-checksum-crc32"));
253 }
254 }
255 // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256 if let Some(claimed) = checksum_sha1_b64 {
257 use sha1::Sha1;
258 let want = b64.decode(claimed).map_err(|_| {
259 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260 })?;
261 if want.len() != 20 {
262 return Err(S3Error::with_message(
263 S3ErrorCode::InvalidDigest,
264 "x-amz-checksum-sha1 must decode to 20 bytes",
265 ));
266 }
267 let mut h = Sha1::new();
268 h.update(body);
269 let got = h.finalize();
270 if got[..] != *want.as_slice() {
271 return Err(bad("x-amz-checksum-sha1"));
272 }
273 }
274 // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275 // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276 // 0xffffffffffffffff, refin / refout true. The reflected
277 // polynomial + 256-entry lookup table are computed lazily on
278 // first call (small enough to inline rather than pull in a
279 // dedicated crc64 crate).
280 if let Some(claimed) = checksum_crc64nvme_b64 {
281 let want = b64.decode(claimed).map_err(|_| {
282 S3Error::with_message(
283 S3ErrorCode::InvalidDigest,
284 "malformed x-amz-checksum-crc64nvme",
285 )
286 })?;
287 if want.len() != 8 {
288 return Err(S3Error::with_message(
289 S3ErrorCode::InvalidDigest,
290 "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291 ));
292 }
293 let got = crc64_nvme(body).to_be_bytes();
294 if got != want.as_slice() {
295 return Err(bad("x-amz-checksum-crc64nvme"));
296 }
297 }
298 Ok(())
299}
300
/// v0.8.12 #128 (MED-C): CRC-64/NVME, the algorithm behind AWS S3's
/// `x-amz-checksum-crc64nvme`.
///
/// Parameters: polynomial `0xad93d23594c93659`, init `0xffffffffffffffff`,
/// refin = refout = true, xorout `0xffffffffffffffff`. Because input and
/// output are reflected, the implementation works on the bit-reversed
/// polynomial and shifts right, one table lookup per input byte.
///
/// The 256-entry lookup table depends only on constants, so it is built at
/// compile time by a `const fn`; no lazy initialisation or synchronisation
/// happens at run time.
///
/// Returns the checksum of `bytes`; the empty slice yields `0`.
fn crc64_nvme(bytes: &[u8]) -> u64 {
    /// Bit-reverse of the NVMe polynomial `0xad93d23594c93659`.
    const POLY_REFLECTED: u64 = 0x9a6c_9329_ac4b_c9b5;

    /// Build the byte-indexed remainder table for the reflected polynomial.
    const fn build_table() -> [u64; 256] {
        let mut table = [0u64; 256];
        let mut byte = 0usize;
        // `while` rather than `for`: iterators are not usable in `const fn`.
        while byte < 256 {
            let mut rem = byte as u64;
            let mut bit = 0;
            while bit < 8 {
                rem = if rem & 1 != 0 {
                    (rem >> 1) ^ POLY_REFLECTED
                } else {
                    rem >> 1
                };
                bit += 1;
            }
            table[byte] = rem;
            byte += 1;
        }
        table
    }

    static TABLE: [u64; 256] = build_table();

    // Start from the all-ones init value, fold in each byte through the
    // table, then apply the all-ones xorout.
    let folded = bytes.iter().fold(!0u64, |crc, &byte| {
        // Truncation to the low byte is intentional: it selects the table row.
        (crc >> 8) ^ TABLE[usize::from((crc as u8) ^ byte)]
    });
    !folded
}
337
/// v0.4 #20: request details captured at the start of a handler, before the
/// request is consumed by the backend call, so the matching `record_access`
/// at end-of-request can fill in the structured access log entry.
struct AccessLogPreamble {
    // Client address as text, when known.
    // NOTE(review): the source of this value (peer address vs. a forwarded
    // header) is not visible here — confirm at the construction site.
    remote_ip: Option<String>,
    // Identity of the caller, when known — presumably the authenticated
    // principal / access key; TODO confirm.
    requester: Option<String>,
    // Request URI as text.
    request_uri: String,
    // Presumably the `User-Agent` header value, when present; TODO confirm.
    user_agent: Option<String>,
}
347
/// S3 service wrapper: holds a backend `B: S3`, the codec registry and the
/// codec dispatcher, plus a set of optional feature managers.
///
/// `S4Service::new` initialises every optional manager to `None` and every
/// flag to its off / default value; features are switched on through the
/// `with_*` builder methods.
pub struct S4Service<B: S3> {
    /// The wrapped S3 backend.
    ///
    /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
    /// dispatcher can clone it into a detached `tokio::spawn` task
    /// (Arc::clone is cheap; backend trait methods take `&self` so no
    /// other handler is affected by the indirection).
    backend: Arc<B>,
    /// Shared codec registry (compress / decompress entry point).
    registry: Arc<CodecRegistry>,
    /// Shared codec dispatcher. Per the module docs it picks the codec for a
    /// PUT from a sample of the body's leading bytes.
    dispatcher: Arc<dyn CodecDispatcher>,
    /// Upper bound, in bytes, on a PUT body. `new` initialises it to
    /// `DEFAULT_MAX_BODY_BYTES`.
    max_body_bytes: usize,
    /// Optional shared policy handle; `None` after `new`.
    /// NOTE(review): presumably the IAM-style bucket policy consulted by the
    /// policy evaluator mentioned on other fields — confirm in `crate::policy`.
    policy: Option<crate::policy::SharedPolicy>,
    /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
    /// to `true` when the listener is wrapped in TLS (or ACME), so policies
    /// gating "deny if not over TLS" can do their job. Defaults to `false`
    /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
    secure_transport: bool,
    /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
    rate_limits: Option<crate::rate_limit::SharedRateLimits>,
    /// v0.4 #20: optional S3-style access log emitter.
    access_log: Option<crate::access_log::SharedAccessLog>,
    /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
    /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
    /// (with the keyring's active key id) after the compress + framing
    /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
    /// frame parsing. A `with_sse_key(...)` call wraps the supplied
    /// key in a 1-slot keyring so single-key (v0.4) operators get the
    /// same behaviour they had before, just on the v2 frame.
    sse_keyring: Option<crate::sse::SharedSseKeyring>,
    /// v0.5 #34: optional first-class versioning state machine. When
    /// `Some(...)`, S4-server itself owns the per-bucket versioning
    /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
    /// list_object_versions / get_bucket_versioning /
    /// put_bucket_versioning handlers consult the manager instead of
    /// passing through. When `None` (default), the legacy
    /// backend-passthrough behaviour applies so existing v0.4
    /// deployments are unaffected until they explicitly call
    /// `with_versioning(...)`.
    versioning: Option<Arc<crate::versioning::VersioningManager>>,
    /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
    /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
    /// generate a fresh DEK via the backend, encrypt the body with it
    /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
    /// S4E4 unwrap the DEK through the same backend before decrypt.
    kms: Option<Arc<dyn crate::kms::KmsBackend>>,
    /// v0.5 #28: KMS key id used when the request omits an explicit
    /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
    /// bucket-default behaviour).
    kms_default_key_id: Option<String>,
    /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
    /// `Some(...)`, `delete_object` and overwrite-style `put_object`
    /// consult the manager and refuse the operation with HTTP 403
    /// `AccessDenied` while the object is locked (Compliance until
    /// expiry, Governance unless the bypass header is set, or any time
    /// a legal hold is on). PUT also auto-applies the bucket-default
    /// retention to brand-new objects when configured. When `None`
    /// (default), the legacy backend-passthrough behaviour applies, so
    /// existing v0.4 deployments are unaffected until they explicitly
    /// call `with_object_lock(...)`.
    object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
    /// v0.6 #38: optional first-class CORS bucket configuration manager.
    /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
    /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
    /// consult the manager instead of passing through to the backend.
    /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
    /// style preflight matching through the same store; the actual HTTP
    /// OPTIONS routing wire-up at the listener level is a follow-up
    /// (s3s framework does not surface OPTIONS as a typed handler).
    cors: Option<Arc<crate::cors::CorsManager>>,
    /// v0.6 #36: optional first-class S3 Inventory manager. When
    /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
    /// configurations and `put_bucket_inventory_configuration` /
    /// `get_bucket_inventory_configuration` /
    /// `list_bucket_inventory_configurations` /
    /// `delete_bucket_inventory_configuration` consult the manager
    /// instead of passing through to the backend. The actual periodic
    /// CSV emission is driven by a tokio task in `main.rs` that calls
    /// `InventoryManager::run_once_for_test` on a fixed cadence; the
    /// service handlers below only deal with config-level CRUD.
    inventory: Option<Arc<crate::inventory::InventoryManager>>,
    /// v0.6 #35: optional first-class S3 bucket-notification manager.
    /// When `Some(...)`, S4-server itself owns per-bucket notification
    /// configurations and `put_bucket_notification_configuration` /
    /// `get_bucket_notification_configuration` consult the manager
    /// instead of passing through to the backend. Successful PUT /
    /// DELETE handlers fire matching destinations on a detached tokio
    /// task (best-effort; see `crate::notifications::dispatch_event`).
    notifications: Option<Arc<crate::notifications::NotificationManager>>,
    /// v0.6 #37: optional first-class S3 Lifecycle configuration
    /// manager. When `Some(...)`, S4-server itself owns per-bucket
    /// lifecycle rules and `put_bucket_lifecycle_configuration` /
    /// `get_bucket_lifecycle_configuration` /
    /// `delete_bucket_lifecycle` consult the manager instead of
    /// passing through to the backend. The actual background scanner
    /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
    /// rule) is a v0.7+ follow-up; the test path
    /// `S4Service::run_lifecycle_once_for_test` exercises the
    /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
    /// the configuration-management half without putting a
    /// half-wired bucket-walk in front of users.
    lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
    /// v0.6 #39: optional first-class object + bucket Tagging manager.
    /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
    /// per-bucket tag state — `PutObjectTagging` /
    /// `GetObjectTagging` / `DeleteObjectTagging` /
    /// `PutBucketTagging` / `GetBucketTagging` /
    /// `DeleteBucketTagging` route through the manager (replacing the
    /// previous backend-passthrough behaviour). `put_object` also
    /// pre-parses the `x-amz-tagging` header / `Tagging` input field
    /// so the IAM policy evaluator can gate on
    /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
    /// On a successful PUT the parsed tags are persisted; on a
    /// successful DELETE the matching tag entry is dropped.
    tagging: Option<Arc<crate::tagging::TagManager>>,
    /// v0.6 #40: optional first-class cross-bucket replication manager.
    /// When `Some(...)`, S4-server itself owns per-bucket replication
    /// rules; `PutBucketReplication` / `GetBucketReplication` /
    /// `DeleteBucketReplication` route through the manager (replacing
    /// the previous backend-passthrough behaviour). On every successful
    /// `put_object` the manager's rule list is consulted; the
    /// highest-priority matching enabled rule wins, the per-key status
    /// is recorded as `Pending`, and the source body and metadata are
    /// handed to a detached tokio task that PUTs to the destination
    /// bucket through the same backend. The replica is stamped with
    /// `x-amz-replication-status: REPLICA` in its metadata; the
    /// source-side status is updated to `Completed` on success or
    /// `Failed` after the 3-attempt retry budget is exhausted (drop
    /// counter bumps in either-side case so dashboards see the loss).
    /// `head_object` / `get_object` echo the recorded status back as
    /// `x-amz-replication-status` so consumers can poll progress.
    /// Limited to single-instance (same `S4Service`) replication; true
    /// cross-region (multi-instance) is a v0.7+ follow-up.
    replication: Option<Arc<crate::replication::ReplicationManager>>,
    /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
    /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
    /// request against a bucket whose MFA-Delete state is `Enabled`
    /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
    /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
    /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
    /// deployments are unaffected until they explicitly call
    /// `with_mfa_delete(...)`.
    mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
    /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
    /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
    /// or be matched against a configured server-managed keyring/KMS).
    /// Set by `--compliance-mode strict` after the boot-time
    /// prerequisite check passes.
    compliance_strict: bool,
    /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
    /// gate. When `Some(...)`, the listener-side middleware (see
    /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
    /// request and short-circuits SigV4a-signed ones — verifying the
    /// signature against the credential store and returning 403
    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
    /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
    /// `None`, the middleware is a no-op so the existing SigV4 path is
    /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
    sigv4a_gate: Option<Arc<SigV4aGate>>,
    /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
    /// SSE / Tagging / Object-Lock context captured at
    /// `CreateMultipartUpload` time through to `UploadPart` /
    /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
    /// store is gateway-internal and idle when no multipart is in
    /// flight. See [`crate::multipart_state`] for rationale.
    multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
    /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
    /// path. `0` (default) → use the legacy buffered S4E2 path
    /// (whole-body AES-GCM tag, GET buffers + verifies before
    /// emitting). Non-zero → use the chunked S4E5 frame so GET can
    /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
    /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
    /// (chunked variants tracked in a follow-up issue).
    sse_chunk_size: usize,
    /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
    /// replication dispatcher in [`Self::spawn_replication_if_matched`].
    /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
    /// rules × slow destination = O(10k) in-flight tokio tasks) could
    /// exhaust process memory before the destination drains. Each
    /// dispatcher spawn `acquire_owned`s one permit and holds it for the
    /// lifetime of the destination PUT + status stamp; once the cap is
    /// reached the dispatcher async-blocks on `acquire_owned()` so the
    /// listener path itself never stalls — only the in-flight replica
    /// queue depth is bounded. Default 1024 (operator-tunable via
    /// `--replication-max-concurrent`).
    replication_semaphore: Arc<tokio::sync::Semaphore>,
    /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
    /// `aws:SourceIp` Condition key only when the operator has
    /// explicitly opted in via `--trust-x-forwarded-for`. Default
    /// (`false`) makes the policy evaluator see `source_ip = None`
    /// for incoming requests, so a public-internet client can no
    /// longer spoof an internal CIDR by setting `X-Forwarded-For`
    /// themselves. Operators behind a trusted reverse proxy that
    /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
    /// listening directly on the public internet leave it off and
    /// gain a clear fail-closed default. A future release plumbs
    /// the TCP peer address through the s3s service trait so we can
    /// validate the forwarded header against a `--trusted-proxies`
    /// CIDR list; until then the boolean opt-in closes the immediate
    /// auth-bypass surface.
    trust_x_forwarded_for: bool,
}
547
548impl<B: S3> S4Service<B> {
/// API upper limit for a single (non-multipart) AWS S3 PUT: 5 GiB.
/// `new` uses this as the initial `max_body_bytes`.
pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;
551
/// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
/// replication dispatcher tasks. `new` sizes the `replication_semaphore`
/// permit pool with this value; see that field's doc for the rationale
/// and the override path.
pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
556
557 pub fn new(
558 backend: B,
559 registry: Arc<CodecRegistry>,
560 dispatcher: Arc<dyn CodecDispatcher>,
561 ) -> Self {
562 Self {
563 backend: Arc::new(backend),
564 registry,
565 dispatcher,
566 max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
567 policy: None,
568 secure_transport: false,
569 rate_limits: None,
570 access_log: None,
571 sse_keyring: None,
572 versioning: None,
573 kms: None,
574 kms_default_key_id: None,
575 object_lock: None,
576 cors: None,
577 inventory: None,
578 notifications: None,
579 lifecycle: None,
580 tagging: None,
581 replication: None,
582 mfa_delete: None,
583 compliance_strict: false,
584 sigv4a_gate: None,
585 multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
586 // v0.8 #52: chunked SSE-S4 disabled by default — opt
587 // in via `S4Service::with_sse_chunk_size(...)` /
588 // `--sse-chunk-size <BYTES>`. Default keeps the legacy
589 // S4E2 buffered path so existing deployments are
590 // bit-for-bit unchanged.
591 sse_chunk_size: 0,
592 // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
593 // replication tasks. Picked to be (a) ample headroom over a
594 // typical steady-state replication rate (the v0.8.3 #66
595 // status-sweep doc cites 1k keys/hour as a "steady" rate, so
596 // even a 100x burst lands well under 1024), (b) small enough
597 // that the worst-case memory pinned by stalled dispatchers
598 // — body bytes + metadata — stays bounded (1024 × 5 MiB
599 // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
600 // wider cross-region fan-out can override via
601 // `--replication-max-concurrent`.
602 replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
603 Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
604 )),
605 // v0.8.11 CRIT-4: default fail-closed — ignore client-
606 // supplied `X-Forwarded-For` until the operator opts in
607 // through `with_trust_x_forwarded_for(true)`.
608 trust_x_forwarded_for: false,
609 }
610 }
611
612 /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
613 /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
614 /// when the gateway sits behind a trusted reverse proxy that
615 /// strips (or rewrites) any client-supplied value. When left
616 /// off (default), the policy evaluator sees `source_ip = None`
617 /// regardless of what the client sends — closing the
618 /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
619 /// bypass.
620 #[must_use]
621 pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
622 self.trust_x_forwarded_for = on;
623 self
624 }
625
626 /// v0.7 #47: attach the SigV4a verify gate. Once set, the
627 /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
628 /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
629 /// verifying it against the supplied credential store and
630 /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
631 /// are unaffected. When the gate is unset (default), the
632 /// middleware skips entirely so existing SigV4 deployments keep
633 /// working.
634 #[must_use]
635 pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
636 self.sigv4a_gate = Some(gate);
637 self
638 }
639
640 /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
641 /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
642 /// consumes the `S4Service` (the listener-side middleware needs
643 /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
644 /// algorithm tokens with "unknown algorithm" — match has to
645 /// happen at the hyper layer instead).
646 #[must_use]
647 pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
648 self.sigv4a_gate.as_ref()
649 }
650
651 /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
652 /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
653 /// the `S4Service`. The background `sweep_stale` task in `main.rs`
654 /// holds this `Arc` and ticks once an hour to drop abandoned
655 /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
656 #[must_use]
657 pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
658 &self.multipart_state
659 }
660
661 /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
662 /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
663 /// through the manager (instead of forwarding to the backend),
664 /// and `put_object`'s `x-amz-tagging` parse path becomes the
665 /// source of `s3:RequestObjectTag/<key>` for the IAM policy
666 /// evaluator. The manager itself is shared via `Arc`.
667 #[must_use]
668 pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
669 self.tagging = Some(mgr);
670 self
671 }
672
673 /// v0.6 #39: borrow the attached tagging manager (test /
674 /// introspection — the snapshotter in `main.rs`, when wired,
675 /// will keep its own `Arc` clone).
676 #[must_use]
677 pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
678 self.tagging.as_ref()
679 }
680
681 /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
682 /// `put_bucket_inventory_configuration` /
683 /// `get_bucket_inventory_configuration` /
684 /// `list_bucket_inventory_configurations` /
685 /// `delete_bucket_inventory_configuration` route through the
686 /// manager. The actual periodic CSV / manifest emission is
687 /// orchestrated by a tokio task started in `main.rs`; the manager
688 /// itself is shared between the handler and the scheduler via
689 /// `Arc`.
690 #[must_use]
691 pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
692 self.inventory = Some(mgr);
693 self
694 }
695
696 /// v0.6 #36: borrow the attached inventory manager (test /
697 /// introspection — the background scheduler in `main.rs` keeps its
698 /// own `Arc` clone, so this accessor is for the test path that
699 /// invokes `run_once_for_test` directly).
700 #[must_use]
701 pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
702 self.inventory.as_ref()
703 }
704
705 /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
706 /// manager. Once set, `put_bucket_lifecycle_configuration` /
707 /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
708 /// route through the manager (replacing the previous backend-
709 /// passthrough behaviour). The actual periodic scanner that walks
710 /// the source bucket and invokes Expiration / Transition /
711 /// NoncurrentExpiration actions is a v0.7+ follow-up — see
712 /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
713 /// path that exercises the evaluator end-to-end.
714 #[must_use]
715 pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
716 self.lifecycle = Some(mgr);
717 self
718 }
719
720 /// v0.6 #37: borrow the attached lifecycle manager (test /
721 /// introspection — the background scheduler in `main.rs` keeps its
722 /// own `Arc` clone, so this accessor is for the test path that
723 /// invokes the evaluator directly).
724 #[must_use]
725 pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
726 self.lifecycle.as_ref()
727 }
728
729 /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
730 /// against a caller-provided list of `(key, age, size, tags)` tuples
731 /// and returns the `(key, action)` pairs that should fire. The actual
732 /// backend invocation (S3.delete_object / metadata rewrite) is left
733 /// to the caller — the unit + E2E tests use this to verify the
734 /// evaluator without spawning the (deferred) background scanner.
735 /// Returns an empty `Vec` when no lifecycle manager is attached or
736 /// no rule matches.
737 #[must_use]
738 pub fn run_lifecycle_once_for_test(
739 &self,
740 bucket: &str,
741 objects: &[crate::lifecycle::EvaluateBatchEntry],
742 ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
743 let Some(mgr) = self.lifecycle.as_ref() else {
744 return Vec::new();
745 };
746 crate::lifecycle::evaluate_batch(mgr, bucket, objects)
747 }
748
749 /// v0.6 #35: attach the in-memory bucket-notification manager. Once
750 /// set, `put_bucket_notification_configuration` /
751 /// `get_bucket_notification_configuration` route through the manager
752 /// (replacing the previous backend-passthrough behaviour); successful
753 /// `put_object` / `delete_object` calls fire matching destinations
754 /// on a detached tokio task via
755 /// `crate::notifications::dispatch_event` (best-effort, fire-and-
756 /// forget — failures bump the manager's `dropped_total` counter and
757 /// log at warn but do NOT fail the originating S3 request).
758 #[must_use]
759 pub fn with_notifications(
760 mut self,
761 mgr: Arc<crate::notifications::NotificationManager>,
762 ) -> Self {
763 self.notifications = Some(mgr);
764 self
765 }
766
767 /// v0.6 #35: borrow the attached notifications manager (test /
768 /// introspection — used by the metrics layer to read
769 /// `dropped_total`).
770 #[must_use]
771 pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
772 self.notifications.as_ref()
773 }
774
775 /// v0.6 #35: internal helper used by the DELETE handlers to fire a
776 /// matching notification on a detached tokio task. No-op when no
777 /// manager is attached or no rule on the bucket matches the given
778 /// (event, key) tuple.
779 fn fire_delete_notification(
780 &self,
781 bucket: &str,
782 key: &str,
783 event: crate::notifications::EventType,
784 version_id: Option<String>,
785 ) {
786 let Some(mgr) = self.notifications.as_ref() else {
787 return;
788 };
789 let dests = mgr.match_destinations(bucket, &event, key);
790 if dests.is_empty() {
791 return;
792 }
793 tokio::spawn(crate::notifications::dispatch_event(
794 Arc::clone(mgr),
795 bucket.to_owned(),
796 key.to_owned(),
797 event,
798 None,
799 None,
800 version_id,
801 format!("S4-{}", uuid::Uuid::new_v4()),
802 ));
803 }
804
805 /// v0.6 #40: attach the in-memory cross-bucket replication manager.
806 /// Once set, `put_bucket_replication` / `get_bucket_replication` /
807 /// `delete_bucket_replication` route through the manager (replacing
808 /// the previous backend-passthrough behaviour); a successful
809 /// `put_object` whose key matches an enabled rule fires a detached
810 /// tokio task that PUTs the same body + metadata to the rule's
811 /// destination bucket, stamping the replica with
812 /// `x-amz-replication-status: REPLICA`. Failures after the retry
813 /// budget bump the manager's `dropped_total` counter and are
814 /// surfaced in the `s4_replication_dropped_total` Prometheus
815 /// counter; successes bump `s4_replication_replicated_total`.
816 #[must_use]
817 pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
818 self.replication = Some(mgr);
819 self
820 }
821
822 /// v0.6 #40: borrow the attached replication manager (test /
823 /// introspection — used by the metrics layer to read
824 /// `dropped_total`).
825 #[must_use]
826 pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
827 self.replication.as_ref()
828 }
829
830 /// v0.6 #40: internal helper used by the PUT handlers to fire a
831 /// detached cross-bucket replication task. No-op when no manager
832 /// is attached, the source backend PUT failed, or no rule on the
833 /// source bucket matches the (key, tags) tuple. The `body` is the
834 /// post-compression / post-encryption `Bytes` that was sent to
835 /// the source backend (refcount-cloned), and `metadata` is the
836 /// metadata map that already includes the manifest /
837 /// `s4-encrypted` markers — the replica decodes through the same
838 /// path. The destination PUT runs through `Arc<B>::put_object`.
839 ///
840 /// ## v0.8.2 #61: generation token + shadow-key destination
841 ///
842 /// `pending_version` is the source-side `PutOutcome` minted by the
843 /// caller's versioning branch (or `None` for unversioned /
844 /// suspended buckets). When `pending_version.versioned_response`
845 /// is `true`, the dispatcher writes the destination under the same
846 /// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
847 /// destination's version chain receives the new version the same
848 /// way `?versionId=` GET resolves it. Closes audit C-1.
849 ///
850 /// The dispatcher also mints a fresh `generation` token before
851 /// spawning, threaded through to [`crate::replication::
852 /// replicate_object`]. Closes audit C-3 — a stale retry of an
853 /// older PUT can no longer overwrite the destination's newer bytes
854 /// because the CAS guard sees the higher stored generation and
855 /// drops its destination write.
856 ///
857 /// ## Asymmetric versioning policy (out of scope)
858 ///
859 /// We assume source + destination buckets share the same
860 /// versioning policy (both Enabled or both Suspended /
861 /// Unversioned). Cross-bucket policy queries would require a
862 /// backend round-trip per replication, which is not worth it for
863 /// the single-instance scope. Operators who configure asymmetric
864 /// versioning will see destination-side `?versionId=` lookups
865 /// miss — documented as out-of-scope until a future per-rule
866 /// `destination_versioning_policy` knob lands.
867 // 8 args is the post-#61 shape: replication needs the
868 // source bucket+key, the canonical tag set for rule-matching,
869 // the post-codec body+metadata for the destination PUT, the
870 // backend-success gate, and the pending version-id for the
871 // shadow-key destination override. A shape struct would just
872 // split the (single) call site so opt for the inline form.
873 #[allow(clippy::too_many_arguments)]
874 fn spawn_replication_if_matched(
875 &self,
876 source_bucket: &str,
877 source_key: &str,
878 request_tags: &Option<crate::tagging::TagSet>,
879 body: &bytes::Bytes,
880 metadata: &Option<std::collections::HashMap<String, String>>,
881 backend_ok: bool,
882 pending_version: Option<&crate::versioning::PutOutcome>,
883 ) where
884 B: Send + Sync + 'static,
885 {
886 if !backend_ok {
887 return;
888 }
889 let Some(mgr) = self.replication.as_ref() else {
890 return;
891 };
892 // Pull the request's tags into the (k, v) shape the matcher
893 // expects. The tagging manager would have the canonical
894 // post-PUT view but at this point in the pipeline it's
895 // already been written above; for the rule-match decision
896 // the request's tags are sufficient (= the tags this PUT
897 // applies, S3 PutObject is full-replace on tags).
898 let object_tags: Vec<(String, String)> = request_tags
899 .as_ref()
900 .map(|ts| ts.iter().cloned().collect())
901 .unwrap_or_default();
902 let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
903 return;
904 };
905 // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
906 // Pending stamp so the stamp itself carries the right
907 // generation (the CAS in `record_status_if_newer` would
908 // otherwise see a `generation=0` Pending and accept any
909 // stale retry).
910 let generation = mgr.next_generation();
911 // Eagerly mark the source key as Pending so a HEAD between
912 // the source PUT returning and the spawned task completing
913 // surfaces the in-flight state. CAS-guarded so a slower
914 // older PUT can't downgrade a newer Completed back to Pending.
915 let _ = mgr.record_status_if_newer(
916 source_bucket,
917 source_key,
918 generation,
919 crate::replication::ReplicationStatus::Pending,
920 );
921 // v0.8.2 #61: derive the destination storage key. For a
922 // versioning-Enabled source the destination receives the
923 // same shadow-key path so a `?versionId=<vid>` GET on the
924 // destination resolves through the same lookup the source
925 // uses. Suspended / Unversioned sources keep the logical
926 // key (= `None` override = dispatcher uses `source_key`).
927 let destination_key_override = pending_version
928 .filter(|pv| pv.versioned_response)
929 .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
930 // v0.8.3 #68 (audit M-1): capture the source object's Object
931 // Lock state so the dispatcher can decorate the destination
932 // PUT with the matching AWS-wire lock headers. Without this,
933 // a Compliance / Governance / legal-hold protected source
934 // would replicate to a destination where DELETE succeeds
935 // (the WORM posture would only hold on the source).
936 let source_lock_state = self
937 .object_lock
938 .as_ref()
939 .and_then(|mgr| mgr.get(source_bucket, source_key));
940 // v0.8.3 #68: hand the destination-side ObjectLockManager to
941 // the dispatcher closure so we can persist the propagated
942 // lock state on successful destination PUT (the destination
943 // PUT below bypasses S4Service::put_object — we drive the
944 // backend directly — so the explicit_lock_mode commit block
945 // in put_object never fires for replicas. We replay it here
946 // against the destination key.)
947 let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
948 let mgr_cl = Arc::clone(mgr);
949 let backend = Arc::clone(&self.backend);
950 let body_cl = body.clone();
951 let metadata_cl = metadata.clone();
952 let source_bucket_cl = source_bucket.to_owned();
953 let source_key_cl = source_key.to_owned();
954 let source_lock_state_for_closure = source_lock_state.clone();
955 let source_bucket_for_warn = source_bucket.to_owned();
956 // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
957 // depth. Acquire happens INSIDE the spawned task (not on the
958 // listener path) so a saturated semaphore back-pressures the
959 // dispatcher pool without stalling the source PUT response —
960 // the source has already returned 200 to the client by the time
961 // the spawn body runs. A failed `acquire_owned` only happens
962 // when the semaphore is closed (we never close it, so the
963 // logged-and-skipped fallback is unreachable in practice).
964 let semaphore = Arc::clone(&self.replication_semaphore);
965 tokio::spawn(async move {
966 let _permit = match semaphore.acquire_owned().await {
967 Ok(p) => p,
968 Err(e) => {
969 tracing::warn!(
970 bucket = %source_bucket_cl,
971 key = %source_key_cl,
972 "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
973 );
974 return;
975 }
976 };
977 let do_put = move |dest_bucket: String,
978 dest_key: String,
979 dest_body: bytes::Bytes,
980 dest_meta: Option<std::collections::HashMap<String, String>>| {
981 let backend = Arc::clone(&backend);
982 let dest_lock_mgr = dest_lock_mgr.clone();
983 let lock_state = source_lock_state_for_closure.clone();
984 let warn_src = source_bucket_for_warn.clone();
985 async move {
986 let req = S3Request {
987 input: PutObjectInput {
988 bucket: dest_bucket.clone(),
989 key: dest_key.clone(),
990 body: Some(bytes_to_blob(dest_body)),
991 metadata: dest_meta,
992 ..Default::default()
993 },
994 method: http::Method::PUT,
995 uri: "/".parse().unwrap(),
996 headers: http::HeaderMap::new(),
997 extensions: http::Extensions::new(),
998 credentials: None,
999 region: None,
1000 service: None,
1001 trailing_headers: None,
1002 };
1003 let put_result = backend
1004 .put_object(req)
1005 .await
1006 .map(|_| ())
1007 .map_err(|e| format!("destination put_object: {e}"));
1008 // v0.8.3 #68: on successful destination PUT,
1009 // persist the propagated lock state into the
1010 // destination's ObjectLockManager so a subsequent
1011 // DELETE on the destination is refused. Three cases:
1012 // - PUT failed → skip (no replica to protect)
1013 // - lock_state None → nothing to propagate
1014 // - dest manager None (operator misconfig)
1015 // → log warn-once + bump skip metric
1016 if put_result.is_ok()
1017 && let Some(state) = lock_state
1018 {
1019 match dest_lock_mgr {
1020 Some(ref mgr) => {
1021 mgr.set(&dest_bucket, &dest_key, state);
1022 }
1023 None => {
1024 crate::replication::warn_lock_propagation_skipped(
1025 &warn_src,
1026 &dest_bucket,
1027 );
1028 }
1029 }
1030 }
1031 put_result
1032 }
1033 };
1034 // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
1035 // `futures::FutureExt::catch_unwind` so a panic inside
1036 // `replicate_object` (or any of the user-supplied closures
1037 // it drives — `do_put`, the destination backend, the lock
1038 // manager) does NOT bubble out of the detached task as a
1039 // `JoinError` that no operator dashboard scrapes. Caught
1040 // panics bump `s4_dispatcher_panics_total{kind="replication"}`
1041 // + log at ERROR with the panic payload, so silent feature
1042 // degradation (= every replication PUT panicking and
1043 // dropping the replica without any visible signal) becomes
1044 // a first-class metric the operator can alert on.
1045 //
1046 // `AssertUnwindSafe` is required because the inner future
1047 // captures `Arc<...>` clones + a `do_put` closure that are
1048 // not `UnwindSafe` by default; the safety contract here is
1049 // "we don't continue using any of those captures after the
1050 // panic" which trivially holds (we drop them and return).
1051 use futures::FutureExt as _;
1052 let dispatcher_kind = "replication";
1053 let fut = crate::replication::replicate_object(
1054 rule,
1055 source_bucket_cl,
1056 source_key_cl,
1057 body_cl,
1058 metadata_cl,
1059 do_put,
1060 mgr_cl,
1061 generation,
1062 destination_key_override,
1063 source_lock_state,
1064 );
1065 if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
1066 let panic_msg = panic
1067 .downcast_ref::<&'static str>()
1068 .copied()
1069 .map(str::to_owned)
1070 .or_else(|| panic.downcast_ref::<String>().cloned())
1071 .unwrap_or_else(|| "(non-string panic payload)".to_owned());
1072 tracing::error!(
1073 kind = dispatcher_kind,
1074 panic_payload = %panic_msg,
1075 "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
1076 );
1077 crate::metrics::record_dispatcher_panic(dispatcher_kind);
1078 }
1079 });
1080 }
1081
1082 /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1083 /// Once set, every DELETE / DELETE-version / delete-marker /
1084 /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1085 /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1086 /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1087 /// where MFA-Delete is `Disabled` (S3 default).
1088 #[must_use]
1089 pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1090 self.mfa_delete = Some(mgr);
1091 self
1092 }
1093
1094 /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1095 /// introspection — used by the snapshot path in `main.rs` to call
1096 /// `to_json` for restart-recoverable state).
1097 #[must_use]
1098 pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1099 self.mfa_delete.as_ref()
1100 }
1101
1102 /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1103 /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1104 /// route through the manager instead of forwarding to the backend,
1105 /// and [`Self::handle_preflight`] becomes useful for the (future)
1106 /// listener-side OPTIONS interceptor.
1107 #[must_use]
1108 pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1109 self.cors = Some(mgr);
1110 self
1111 }
1112
1113 /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1114 #[must_use]
1115 pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1116 self.cors.as_ref()
1117 }
1118
1119 /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1120 /// configured rules and, if a rule matches, return the headers that
1121 /// the (future) listener-side OPTIONS interceptor must put on the
1122 /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1123 /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1124 /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1125 ///
1126 /// Returns `None` when no manager is attached, no config is
1127 /// registered for the bucket, or no rule matches the (origin,
1128 /// method, headers) triple. The caller is responsible for turning
1129 /// `None` into the appropriate 403 response.
1130 ///
1131 /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1132 /// into the hyper-util listener path) is a follow-up — s3s does not
1133 /// surface OPTIONS as a typed S3 handler, so this method is
1134 /// currently call-able only from inside other handlers and tests.
1135 #[must_use]
1136 pub fn handle_preflight(
1137 &self,
1138 bucket: &str,
1139 origin: &str,
1140 method: &str,
1141 request_headers: &[String],
1142 ) -> Option<std::collections::HashMap<String, String>> {
1143 let mgr = self.cors.as_ref()?;
1144 let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1145 let mut h = std::collections::HashMap::new();
1146 // Echo the matched origin back. If the rule used "*" we still
1147 // echo "*" (S3 spec — the spec does not require us to echo the
1148 // *requesting* origin when the wildcard matched).
1149 let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1150 "*".to_string()
1151 } else {
1152 origin.to_string()
1153 };
1154 h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1155 h.insert(
1156 "Access-Control-Allow-Methods".to_string(),
1157 rule.allowed_methods.join(", "),
1158 );
1159 if !rule.allowed_headers.is_empty() {
1160 // For the Allow-Headers response, echo back the rule's
1161 // pattern list verbatim (S3 echoes the configured list,
1162 // including "*" if present). Browsers honour exact-match
1163 // rules.
1164 h.insert(
1165 "Access-Control-Allow-Headers".to_string(),
1166 rule.allowed_headers.join(", "),
1167 );
1168 }
1169 if let Some(secs) = rule.max_age_seconds {
1170 h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1171 }
1172 if !rule.expose_headers.is_empty() {
1173 h.insert(
1174 "Access-Control-Expose-Headers".to_string(),
1175 rule.expose_headers.join(", "),
1176 );
1177 }
1178 Some(h)
1179 }
1180
1181 /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1182 /// SSE indicator (server-side encryption header or SSE-C customer
1183 /// key); requests without one are rejected with 400 InvalidRequest.
1184 /// Boot-time prerequisite checking lives in the binary
1185 /// (`validate_compliance_mode`) so this flag is purely the runtime
1186 /// switch.
1187 #[must_use]
1188 pub fn with_compliance_strict(mut self, on: bool) -> Self {
1189 self.compliance_strict = on;
1190 self
1191 }
1192
1193 /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1194 /// manager. Once set, `delete_object` and overwrite-path
1195 /// `put_object` refuse operations on locked keys with HTTP 403
1196 /// `AccessDenied`; new PUTs to a bucket with a default retention
1197 /// policy auto-create per-object lock state.
1198 #[must_use]
1199 pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1200 self.object_lock = Some(mgr);
1201 self
1202 }
1203
1204 /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1205 /// the lifecycle scanner uses this to skip currently-locked objects
1206 /// before issuing `delete_object`, since an Object Lock always wins
1207 /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1208 /// shape of [`Self::lifecycle_manager`] /
1209 /// [`Self::tag_manager`] — purely additive accessor, no handler
1210 /// behaviour change.
1211 #[must_use]
1212 pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1213 self.object_lock.as_ref()
1214 }
1215
1216 /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1217 /// when a PUT requests SSE-KMS without naming a specific KMS key
1218 /// (operators set this to mirror AWS S3's bucket-default key).
1219 #[must_use]
1220 pub fn with_kms_backend(
1221 mut self,
1222 kms: Arc<dyn crate::kms::KmsBackend>,
1223 default_key_id: Option<String>,
1224 ) -> Self {
1225 self.kms = Some(kms);
1226 self.kms_default_key_id = default_key_id;
1227 self
1228 }
1229
1230 /// v0.5 #34: attach the first-class versioning state machine. Once
1231 /// set, this `S4Service` owns the per-bucket versioning state +
1232 /// per-(bucket, key) version chain; `put_object` / `get_object` /
1233 /// `delete_object` / `list_object_versions` /
1234 /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1235 /// manager instead of passing through to the backend. The backend
1236 /// is still used as the byte store: Suspended / Unversioned buckets
1237 /// keep using `<key>` directly (legacy), Enabled buckets redirect
1238 /// each version's bytes to a shadow key
1239 /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1240 /// PUTs to the same logical key.
1241 #[must_use]
1242 pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1243 self.versioning = Some(mgr);
1244 self
1245 }
1246
1247 /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1248 /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1249 /// in-memory state to the operator's `--versioning-state-file`
1250 /// without restarting the gateway. Mirrors the shape of
1251 /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1252 /// purely additive accessor, no handler behaviour change.
1253 #[must_use]
1254 pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1255 self.versioning.as_ref()
1256 }
1257
1258 /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1259 /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1260 /// CLI flag in `main.rs`. Operators running heavy cross-region
1261 /// fan-out may need to raise this; operators on memory-constrained
1262 /// hosts may need to lower it. The new value replaces the existing
1263 /// `Semaphore` (so calling this after dispatchers are already in
1264 /// flight is fine — the in-flight tasks hold permits from the old
1265 /// semaphore which is dropped when its last permit is released).
1266 /// A `max` of 0 would deadlock all replicas; the value is silently
1267 /// clamped to 1 instead.
1268 #[must_use]
1269 pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1270 let max = max.max(1);
1271 self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1272 self
1273 }
1274
1275 /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1276 /// concurrency permit pool. Tests inspect `available_permits()`
1277 /// after invoking `spawn_replication_if_matched` to verify the
1278 /// dispatcher actually `acquire_owned`s before kicking off the
1279 /// destination PUT.
1280 #[must_use]
1281 pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1282 &self.replication_semaphore
1283 }
1284
1285 /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1286 /// Internally wraps it in a 1-slot keyring with id=1 active, so
1287 /// new objects ride the v0.5 S4E2 frame while previously-written
1288 /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1289 /// fallback path. Operators wanting true rotation should call
1290 /// [`Self::with_sse_keyring`] instead.
1291 #[must_use]
1292 pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1293 let keyring = crate::sse::SseKeyring::new(1, key);
1294 self.sse_keyring = Some(std::sync::Arc::new(keyring));
1295 self
1296 }
1297
1298 /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1299 /// the active key (S4E2 frame stamped with that key's id); GET
1300 /// dispatches on the body's magic — S4E1 falls back to trying every
1301 /// key in the ring (active first) so v0.4 objects survive a
1302 /// migration; S4E2 looks up the explicit key_id from the header.
1303 #[must_use]
1304 pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1305 self.sse_keyring = Some(keyring);
1306 self
1307 }
1308
1309 /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1310 /// (so the matching GET can stream-decrypt chunk-by-chunk
1311 /// instead of buffering the entire body before tag verify).
1312 /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1313 /// disables the path and reverts to the legacy S4E2 buffered
1314 /// frame.
1315 ///
1316 /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1317 /// the chunked envelopes for those flows are a follow-up issue
1318 /// (the customer-key wire surface needs separate version
1319 /// negotiation).
1320 ///
1321 /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1322 /// not also set — the chunked path runs only on the SSE-S4
1323 /// branch of `put_object`.
1324 #[must_use]
1325 pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1326 self.sse_chunk_size = bytes;
1327 self
1328 }
1329
1330 /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1331 /// PUT / GET / DELETE / List handler emits one entry into the
1332 /// emitter's buffer; a background flusher (started separately, see
1333 /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1334 /// rotated `.log` files into the configured directory.
1335 #[must_use]
1336 pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1337 self.access_log = Some(log);
1338 self
1339 }
1340
1341 /// Capture the per-request access-log preamble before the request is
1342 /// consumed by the backend call. Returns `None` if no access logger
1343 /// is configured (cheap early-out so the handler doesn't pay the
1344 /// header-clone cost when access logging is off).
1345 fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1346 self.access_log.as_ref()?;
1347 Some(AccessLogPreamble {
1348 // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1349 // Recording a client-controllable header in the access log
1350 // would poison forensic queries; leave it `None` until the
1351 // operator declares X-Forwarded-For is set by a trusted
1352 // proxy.
1353 remote_ip: if self.trust_x_forwarded_for {
1354 req.headers
1355 .get("x-forwarded-for")
1356 .and_then(|v| v.to_str().ok())
1357 .and_then(|raw| raw.split(',').next())
1358 .map(|s| s.trim().to_owned())
1359 } else {
1360 None
1361 },
1362 requester: Self::principal_of(req).map(str::to_owned),
1363 request_uri: format!("{} {}", req.method, req.uri.path()),
1364 user_agent: req
1365 .headers
1366 .get("user-agent")
1367 .and_then(|v| v.to_str().ok())
1368 .map(str::to_owned),
1369 })
1370 }
1371
1372 /// Internal — called by handlers at end-of-request with a captured
1373 /// preamble. Best-effort: swallows the await fast (clones Arc +
1374 /// pushes), no error propagation back to the request path.
1375 #[allow(clippy::too_many_arguments)]
1376 async fn record_access(
1377 &self,
1378 preamble: Option<AccessLogPreamble>,
1379 operation: &'static str,
1380 bucket: &str,
1381 key: Option<&str>,
1382 http_status: u16,
1383 bytes_sent: u64,
1384 object_size: u64,
1385 total_time_ms: u64,
1386 error_code: Option<&str>,
1387 ) {
1388 let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1389 return;
1390 };
1391 log.record(crate::access_log::AccessLogEntry {
1392 time: std::time::SystemTime::now(),
1393 bucket: bucket.to_owned(),
1394 remote_ip: p.remote_ip,
1395 requester: p.requester,
1396 operation,
1397 key: key.map(str::to_owned),
1398 request_uri: p.request_uri,
1399 http_status,
1400 error_code: error_code.map(str::to_owned),
1401 bytes_sent,
1402 object_size,
1403 total_time_ms,
1404 user_agent: p.user_agent,
1405 })
1406 .await;
1407 }
1408
1409 /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1410 /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1411 /// throttle-checked before the policy gate; throttled requests return
1412 /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1413 /// `s4_rate_limit_throttled_total{principal,bucket}`.
1414 #[must_use]
1415 pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1416 self.rate_limits = Some(rl);
1417 self
1418 }
1419
1420 /// Helper used by request handlers to apply the rate limit. Returns
1421 /// `Ok(())` when allowed (or no rate limiter is configured), or a
1422 /// `SlowDown` S3Error otherwise.
1423 fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1424 let Some(rl) = self.rate_limits.as_ref() else {
1425 return Ok(());
1426 };
1427 let principal_id = Self::principal_of(req);
1428 if !rl.check(principal_id, bucket) {
1429 crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1430 return Err(S3Error::with_message(
1431 S3ErrorCode::SlowDown,
1432 format!("rate-limited: bucket={bucket}"),
1433 ));
1434 }
1435 Ok(())
1436 }
1437
1438 /// Tell the policy evaluator that the listener is reached over TLS
1439 /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1440 /// resolves to `true`. Defaults to `false`.
1441 #[must_use]
1442 pub fn with_secure_transport(mut self, on: bool) -> Self {
1443 self.secure_transport = on;
1444 self
1445 }
1446
1447 #[must_use]
1448 pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1449 self.max_body_bytes = n;
1450 self
1451 }
1452
1453 /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1454 /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1455 /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1456 /// When `None` (the default), no policy enforcement happens.
1457 #[must_use]
1458 pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1459 self.policy = Some(policy);
1460 self
1461 }
1462
1463 /// Pull the SigV4 access key id off the request's credentials, if any.
1464 /// Used as the `principal_id` for policy evaluation.
1465 fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1466 req.credentials.as_ref().map(|c| c.access_key.as_str())
1467 }
1468
1469 /// v0.3 #13: build the per-request policy context from the incoming
1470 /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1471 /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1472 /// production deployments are behind an LB / reverse proxy that sets
1473 /// this), `aws:CurrentTime` from the system clock, and
1474 /// `aws:SecureTransport` from the per-listener TLS flag.
1475 fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1476 let user_agent = req
1477 .headers
1478 .get("user-agent")
1479 .and_then(|v| v.to_str().ok())
1480 .map(str::to_owned);
1481 // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1482 // header. Trusting it unconditionally lets any public-internet
1483 // request claim it came from a trusted CIDR (e.g.
1484 // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1485 // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1486 // We now only consume the header when the operator has
1487 // declared "this gateway sits behind a trusted reverse proxy
1488 // that scrubs client-supplied values" via
1489 // `with_trust_x_forwarded_for(true)` /
1490 // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1491 // `None`, which fails closed for IP-allowlist Allow rules
1492 // and fails open for IP-blocklist Deny rules — operators
1493 // who need either case behind a public listener must opt in
1494 // or move the gate to the reverse proxy. The leftmost
1495 // comma-separated token is the originator per the
1496 // `X-Forwarded-For: client, proxy1, proxy2` convention.
1497 let source_ip = if self.trust_x_forwarded_for {
1498 req.headers
1499 .get("x-forwarded-for")
1500 .and_then(|v| v.to_str().ok())
1501 .and_then(|raw| raw.split(',').next())
1502 .and_then(|s| s.trim().parse().ok())
1503 } else {
1504 None
1505 };
1506 crate::policy::RequestContext {
1507 source_ip,
1508 user_agent,
1509 request_time: Some(std::time::SystemTime::now()),
1510 secure_transport: self.secure_transport,
1511 existing_object_tags: None,
1512 request_object_tags: None,
1513 extra: Default::default(),
1514 }
1515 }
1516
1517 /// Helper used by request handlers to enforce the optional policy.
1518 /// Returns `Ok(())` when allowed (or no policy is configured), or an
1519 /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1520 /// counter on deny.
1521 fn enforce_policy<I>(
1522 &self,
1523 req: &S3Request<I>,
1524 action: &'static str,
1525 bucket: &str,
1526 key: Option<&str>,
1527 ) -> S3Result<()> {
1528 self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1529 }
1530
1531 /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1532 /// caller plumb tag context (existing-on-object + on-request) into
1533 /// the policy evaluator. Both arguments default to `None`, in
1534 /// which case the resulting `RequestContext` is identical to
1535 /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1536 /// with tags this is a transparent no-op.
1537 fn enforce_policy_with_extra<I>(
1538 &self,
1539 req: &S3Request<I>,
1540 action: &'static str,
1541 bucket: &str,
1542 key: Option<&str>,
1543 request_tags: Option<&crate::tagging::TagSet>,
1544 existing_tags: Option<&crate::tagging::TagSet>,
1545 ) -> S3Result<()> {
1546 let Some(policy) = self.policy.as_ref() else {
1547 return Ok(());
1548 };
1549 let principal_id = Self::principal_of(req);
1550 let mut ctx = self.request_context(req);
1551 if let Some(t) = request_tags {
1552 ctx.request_object_tags = Some(t.clone());
1553 }
1554 if let Some(t) = existing_tags {
1555 ctx.existing_object_tags = Some(t.clone());
1556 }
1557 let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1558 if decision.allow {
1559 Ok(())
1560 } else {
1561 crate::metrics::record_policy_denial(action, bucket);
1562 tracing::info!(
1563 action,
1564 bucket,
1565 key = ?key,
1566 principal = ?principal_id,
1567 source_ip = ?ctx.source_ip,
1568 user_agent = ?ctx.user_agent,
1569 secure_transport = ctx.secure_transport,
1570 matched_sid = ?decision.matched_sid,
1571 effect = ?decision.matched_effect,
1572 "S4 policy denied request"
1573 );
1574 Err(S3Error::with_message(
1575 S3ErrorCode::AccessDenied,
1576 format!("denied by S4 policy: {action} on bucket={bucket}"),
1577 ))
1578 }
1579 }
1580
1581 /// テスト用: backend を取り戻す (test helper、production では使わない).
1582 /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1583 /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1584 /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1585 /// (test 用途専用 helper の caller 契約を維持)。
1586 pub fn into_backend(self) -> B {
1587 Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1588 panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1589 })
1590 }
1591
1592 /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1593 /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1594 async fn partial_range_get(
1595 &self,
1596 req: &S3Request<GetObjectInput>,
1597 plan: s4_codec::index::RangePlan,
1598 client_start: u64,
1599 client_end_exclusive: u64,
1600 total_original: u64,
1601 get_start: Instant,
1602 ) -> S3Result<S3Response<GetObjectOutput>> {
1603 // 必要 byte 範囲だけを backend に partial GET
1604 let backend_range = s3s::dto::Range::Int {
1605 first: plan.byte_start,
1606 last: Some(plan.byte_end_exclusive - 1),
1607 };
1608 let backend_input = GetObjectInput {
1609 bucket: req.input.bucket.clone(),
1610 key: req.input.key.clone(),
1611 range: Some(backend_range),
1612 ..Default::default()
1613 };
1614 let backend_req = S3Request {
1615 input: backend_input,
1616 method: req.method.clone(),
1617 uri: req.uri.clone(),
1618 headers: req.headers.clone(),
1619 extensions: http::Extensions::new(),
1620 credentials: req.credentials.clone(),
1621 region: req.region.clone(),
1622 service: req.service.clone(),
1623 trailing_headers: None,
1624 };
1625 let mut backend_resp = self.backend.get_object(backend_req).await?;
1626 let blob = backend_resp.output.body.take().ok_or_else(|| {
1627 S3Error::with_message(
1628 S3ErrorCode::InternalError,
1629 "backend partial GET returned empty body",
1630 )
1631 })?;
1632 let bytes = collect_blob(blob, self.max_body_bytes)
1633 .await
1634 .map_err(internal("collect partial body"))?;
1635
1636 // frame parse + decompress
1637 let mut combined = BytesMut::new();
1638 for frame in FrameIter::new(bytes) {
1639 let (header, payload) = frame.map_err(|e| {
1640 S3Error::with_message(
1641 S3ErrorCode::InternalError,
1642 format!("partial-range frame parse: {e}"),
1643 )
1644 })?;
1645 let chunk_manifest = ChunkManifest {
1646 codec: header.codec,
1647 original_size: header.original_size,
1648 compressed_size: header.compressed_size,
1649 crc32c: header.crc32c,
1650 };
1651 let decompressed = self
1652 .registry
1653 .decompress(payload, &chunk_manifest)
1654 .await
1655 .map_err(internal("partial-range decompress"))?;
1656 combined.extend_from_slice(&decompressed);
1657 }
1658 let combined = combined.freeze();
1659 let sliced = combined
1660 .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1661
1662 // response 組立て
1663 let returned_size = sliced.len() as u64;
1664 backend_resp.output.content_length = Some(returned_size as i64);
1665 backend_resp.output.content_range = Some(format!(
1666 "bytes {client_start}-{}/{total_original}",
1667 client_end_exclusive - 1
1668 ));
1669 backend_resp.output.checksum_crc32 = None;
1670 backend_resp.output.checksum_crc32c = None;
1671 backend_resp.output.checksum_crc64nvme = None;
1672 backend_resp.output.checksum_sha1 = None;
1673 backend_resp.output.checksum_sha256 = None;
1674 backend_resp.output.e_tag = None;
1675 backend_resp.output.body = Some(bytes_to_blob(sliced));
1676 backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1677
1678 let elapsed = get_start.elapsed();
1679 crate::metrics::record_get(
1680 "partial",
1681 plan.byte_end_exclusive - plan.byte_start,
1682 returned_size,
1683 elapsed.as_secs_f64(),
1684 true,
1685 );
1686 info!(
1687 op = "get_object",
1688 bucket = %req.input.bucket,
1689 key = %req.input.key,
1690 bytes_in = plan.byte_end_exclusive - plan.byte_start,
1691 bytes_out = returned_size,
1692 total_object_size = total_original,
1693 range = true,
1694 path = "sidecar-partial",
1695 latency_ms = elapsed.as_millis() as u64,
1696 "S4 partial Range GET via sidecar index"
1697 );
1698 Ok(backend_resp)
1699 }
1700
1701 /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
1702 /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
1703 /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
1704 async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
1705 let bytes = encode_index(index);
1706 let len = bytes.len() as i64;
1707 let sidecar = sidecar_key(key);
1708 // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
1709 // the (already legally-arbitrary) S3 key produces something we
1710 // cannot encode at all, drop the sidecar PUT (the GET path
1711 // falls back to a full read on a missing sidecar) instead of
1712 // panicking on `parse().unwrap()`.
1713 let uri = match safe_object_uri(bucket, &sidecar) {
1714 Ok(u) => u,
1715 Err(e) => {
1716 tracing::warn!(
1717 bucket,
1718 key,
1719 "S4 write_sidecar skipped (key not URI-encodable): {e}"
1720 );
1721 return;
1722 }
1723 };
1724 let put_input = PutObjectInput {
1725 bucket: bucket.into(),
1726 key: sidecar,
1727 body: Some(bytes_to_blob(bytes)),
1728 content_length: Some(len),
1729 content_type: Some("application/x-s4-index".into()),
1730 ..Default::default()
1731 };
1732 let put_req = S3Request {
1733 input: put_input,
1734 method: http::Method::PUT,
1735 uri,
1736 headers: http::HeaderMap::new(),
1737 extensions: http::Extensions::new(),
1738 credentials: None,
1739 region: None,
1740 service: None,
1741 trailing_headers: None,
1742 };
1743 if let Err(e) = self.backend.put_object(put_req).await {
1744 tracing::warn!(
1745 bucket,
1746 key,
1747 "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
1748 );
1749 }
1750 }
1751
1752 /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
1753 /// describes the current backend object before we trust its frame
1754 /// offsets for a partial Range GET. The sidecar carries the source
1755 /// `etag` and `compressed_size` that were observed at PUT time; we
1756 /// HEAD the backend object and compare.
1757 ///
1758 /// Decision matrix:
1759 /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
1760 /// that wasn't stamped) → return `true` (best-effort, preserves
1761 /// pre-v0.8.4 behaviour for existing on-disk sidecars).
1762 /// - HEAD fails → return `false` (we can't tell either way; full GET
1763 /// path will surface the real backend error to the client).
1764 /// - HEAD ETag matches → `true`.
1765 /// - HEAD ETag differs OR HEAD size differs from
1766 /// `source_compressed_size` → `false` (sidecar stale or attacker-
1767 /// written; fall back to full GET).
1768 async fn sidecar_version_binding_ok(
1769 &self,
1770 bucket: &str,
1771 key: &str,
1772 index: &FrameIndex,
1773 ) -> bool {
1774 let Some(ref expected_etag) = index.source_etag else {
1775 // Legacy sidecar without the v0.8.4 #73 H-2 binding —
1776 // back-compat: trust it (the partial fetch is the same
1777 // best-effort path that v0.8.3 and earlier shipped).
1778 return true;
1779 };
1780 let head_input = HeadObjectInput {
1781 bucket: bucket.into(),
1782 key: key.into(),
1783 ..Default::default()
1784 };
1785 let uri = match safe_object_uri(bucket, key) {
1786 Ok(u) => u,
1787 Err(_) => return false,
1788 };
1789 let head_req = S3Request {
1790 input: head_input,
1791 method: http::Method::HEAD,
1792 uri,
1793 headers: http::HeaderMap::new(),
1794 extensions: http::Extensions::new(),
1795 credentials: None,
1796 region: None,
1797 service: None,
1798 trailing_headers: None,
1799 };
1800 let head = match self.backend.head_object(head_req).await {
1801 Ok(r) => r.output,
1802 Err(e) => {
1803 tracing::debug!(
1804 bucket,
1805 key,
1806 "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
1807 );
1808 return false;
1809 }
1810 };
1811 // ETag is a strong-vs-weak enum; we compare on the unwrapped string
1812 // form (matches what the PUT path stamped — see below).
1813 let live_etag = head.e_tag.as_ref().map(|t| t.value());
1814 if live_etag != Some(expected_etag.as_str()) {
1815 tracing::debug!(
1816 bucket,
1817 key,
1818 "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
1819 expected_etag,
1820 live_etag,
1821 );
1822 return false;
1823 }
1824 if let Some(expected_size) = index.source_compressed_size
1825 && let Some(live_size) = head.content_length
1826 && live_size as u64 != expected_size
1827 {
1828 tracing::debug!(
1829 bucket,
1830 key,
1831 "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
1832 expected_size,
1833 live_size,
1834 );
1835 return false;
1836 }
1837 true
1838 }
1839
1840 /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
1841 async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
1842 let sidecar = sidecar_key(key);
1843 // v0.7 #49: same encode-or-bail treatment as write_sidecar.
1844 let uri = safe_object_uri(bucket, &sidecar).ok()?;
1845 let get_input = GetObjectInput {
1846 bucket: bucket.into(),
1847 key: sidecar,
1848 ..Default::default()
1849 };
1850 let get_req = S3Request {
1851 input: get_input,
1852 method: http::Method::GET,
1853 uri,
1854 headers: http::HeaderMap::new(),
1855 extensions: http::Extensions::new(),
1856 credentials: None,
1857 region: None,
1858 service: None,
1859 trailing_headers: None,
1860 };
1861 let resp = self.backend.get_object(get_req).await.ok()?;
1862 let blob = resp.output.body?;
1863 let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
1864 decode_index(bytes).ok()
1865 }
1866
1867 /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
1868 ///
1869 /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
1870 /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
1871 /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
1872 async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
1873 let mut out = BytesMut::new();
1874 // v0.8.15 H-h: cap the *aggregate* decoded output. Each
1875 // individual frame is already bounded by
1876 // `validate_decompress_manifest` (default 5 GiB per frame),
1877 // but a forged multi-frame body can declare many frames
1878 // each near the limit — without an object-level ceiling, a
1879 // single GET could pin tens of GiB of plaintext in
1880 // `BytesMut::extend_from_slice`. Use the gateway's
1881 // `max_body_bytes` (same cap that bounds PUT bodies) so a
1882 // GET can never produce more plaintext than a PUT can ever
1883 // legitimately have stored.
1884 let aggregate_cap = self.max_body_bytes;
1885 let mut produced: usize = 0;
1886 for frame in FrameIter::new(bytes) {
1887 let (header, payload) = frame.map_err(|e| {
1888 S3Error::with_message(
1889 S3ErrorCode::InternalError,
1890 format!("multipart frame parse: {e}"),
1891 )
1892 })?;
1893 let chunk_manifest = ChunkManifest {
1894 codec: header.codec,
1895 original_size: header.original_size,
1896 compressed_size: header.compressed_size,
1897 crc32c: header.crc32c,
1898 };
1899 // v0.8.15 H-h: pre-flight check on the declared
1900 // `original_size` so a forged manifest claiming a frame
1901 // that would push us past the cap is rejected before we
1902 // start decoding. Defence-in-depth alongside the
1903 // post-decode `produced` check below.
1904 if (produced as u64).saturating_add(header.original_size) > aggregate_cap as u64 {
1905 return Err(S3Error::with_message(
1906 S3ErrorCode::InternalError,
1907 format!(
1908 "multipart aggregate output exceeds cap: would reach \
1909 {produced_total} bytes after this frame, cap is {aggregate_cap}",
1910 produced_total = (produced as u64).saturating_add(header.original_size),
1911 ),
1912 ));
1913 }
1914 let decompressed = self
1915 .registry
1916 .decompress(payload, &chunk_manifest)
1917 .await
1918 .map_err(internal("multipart frame decompress"))?;
1919 produced = produced.saturating_add(decompressed.len());
1920 if produced > aggregate_cap {
1921 return Err(S3Error::with_message(
1922 S3ErrorCode::InternalError,
1923 format!(
1924 "multipart aggregate output exceeded cap: {produced} bytes \
1925 emitted, cap is {aggregate_cap}"
1926 ),
1927 ));
1928 }
1929 out.extend_from_slice(&decompressed);
1930 }
1931 Ok(out.freeze())
1932 }
1933}
1934
1935/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
1936/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
1937/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
1938/// reject the other variants for parity with AWS.
1939fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
1940 let rest = s
1941 .strip_prefix("bytes=")
1942 .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
1943 let (a, b) = rest
1944 .split_once('-')
1945 .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
1946 let first: u64 = a
1947 .parse()
1948 .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
1949 let last: u64 = b
1950 .parse()
1951 .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
1952 if last < first {
1953 return Err(format!("CopySourceRange last < first: {s:?}"));
1954 }
1955 Ok(s3s::dto::Range::Int {
1956 first,
1957 last: Some(last),
1958 })
1959}
1960
/// v0.5 #34: build the backend storage key for a (logical key,
/// version-id) pair on a bucket with versioning Enabled.
///
/// The result is `<key>.__s4ver__/<version_id>`. The `__s4ver__/` infix
/// was chosen because:
/// - it is not a substring of natural `.s4index` / `.s4ver` keys, so
///   listing filters get no false-positive collisions;
/// - the directory-style separator keeps the S3 console "browse by
///   prefix" UX intact (versions roll up under one virtual folder per
///   object);
/// - it stays human-readable in debug logs and `aws s3 ls`.
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST
/// filter keys containing `.__s4ver__/` out of their results so that
/// customers never see the internal shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    [key, ".__s4ver__/", version_id].concat()
}
1977
/// Report whether `key` contains the marker substring produced by
/// [`versioned_shadow_key`]. A cheap string scan, shared by the
/// list_objects filter and the GET passthrough check.
fn is_versioning_shadow_key(key: &str) -> bool {
    const SHADOW_MARKER: &str = ".__s4ver__/";
    key.contains(SHADOW_MARKER)
}
1983
/// v0.6 #42: wall-clock seconds since the UNIX epoch.
///
/// Passed to `mfa::check_mfa` so the TOTP verifier shares the client
/// authenticator app's notion of "now". Should the clock ever read
/// earlier than 1970 (not expected in practice), `0` is returned so the
/// verifier rejects the code rather than this function panicking.
fn current_unix_secs() -> u64 {
    let now = std::time::SystemTime::now();
    match now.duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
1995
1996/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
1997///
1998/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
1999/// (S3 spec for MFA Delete: every gating failure surfaces as
2000/// `AccessDenied`, not a separate `MFA*` code).
2001/// - `Malformed` → `400 InvalidRequest` (the request itself is
2002/// syntactically broken, not a permission issue).
2003fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
2004 match e {
2005 crate::mfa::MfaError::Missing => S3Error::with_message(
2006 S3ErrorCode::AccessDenied,
2007 "MFA token required for this operation",
2008 ),
2009 crate::mfa::MfaError::Malformed => {
2010 S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
2011 }
2012 crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
2013 S3ErrorCode::AccessDenied,
2014 "MFA serial does not match configured device",
2015 ),
2016 crate::mfa::MfaError::InvalidCode => {
2017 S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
2018 }
2019 }
2020}
2021
2022fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
2023 metadata
2024 .as_ref()
2025 .and_then(|m| m.get(META_MULTIPART))
2026 .map(|v| v == "true")
2027 .unwrap_or(false)
2028}
2029
// Metadata keys under which the single-chunk manifest is stored
// (written by `write_manifest`, read back by `extract_manifest`).
const META_CODEC: &str = "s4-codec";
const META_ORIGINAL_SIZE: &str = "s4-original-size";
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object that was written through Multipart Upload using the
/// per-part frame format. The GET path checks this flag to decide
/// whether to run the frame parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks a single-PUT object that was written in the S4F2 framed
/// format. Older v0.1 single-PUT objects are raw compressed bytes and lack
/// this flag. The GET path checks it to route the body through the framed
/// path (the same `FrameIter` parse used for multipart).
const META_FRAMED: &str = "s4-framed";
2041
2042fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2043 metadata
2044 .as_ref()
2045 .and_then(|m| m.get(META_FRAMED))
2046 .map(|v| v == "true")
2047 .unwrap_or(false)
2048}
2049
2050/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2051fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2052 metadata
2053 .as_ref()
2054 .and_then(|m| m.get("s4-encrypted"))
2055 .map(|v| v == "aes-256-gcm")
2056 .unwrap_or(false)
2057}
2058
2059/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2060/// contract is "all three or none" — partial sets are a 400.
2061///
2062/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2063/// no encryption), `Ok(Some(material))` on validated client key, and
2064/// `Err` for malformed or partial inputs.
2065fn extract_sse_c_material(
2066 algorithm: &Option<String>,
2067 key: &Option<String>,
2068 md5: &Option<String>,
2069) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2070 match (algorithm, key, md5) {
2071 (None, None, None) => Ok(None),
2072 (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2073 .map(Some)
2074 .map_err(sse_c_error_to_s3),
2075 _ => Err(S3Error::with_message(
2076 S3ErrorCode::InvalidRequest,
2077 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2078 )),
2079 }
2080}
2081
2082/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2083/// Returns the key-id to wrap under, falling back to the gateway default.
2084fn extract_kms_key_id(
2085 sse: &Option<ServerSideEncryption>,
2086 sse_kms_key_id: &Option<String>,
2087 gateway_default: Option<&str>,
2088) -> Option<String> {
2089 let asks_for_kms = sse
2090 .as_ref()
2091 .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2092 .unwrap_or(false);
2093 if !asks_for_kms {
2094 return None;
2095 }
2096 sse_kms_key_id
2097 .clone()
2098 .or_else(|| gateway_default.map(str::to_owned))
2099}
2100
2101/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2102/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2103/// transient KMS outage (503). Other variants are 500 InternalError.
2104fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2105 use crate::kms::KmsError as K;
2106 match e {
2107 K::KeyNotFound { key_id } => S3Error::with_message(
2108 S3ErrorCode::InvalidArgument,
2109 format!("KMS key not found: {key_id}"),
2110 ),
2111 K::BackendUnavailable { message } => S3Error::with_message(
2112 S3ErrorCode::ServiceUnavailable,
2113 format!("KMS backend unavailable: {message}"),
2114 ),
2115 other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2116 }
2117}
2118
2119/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2120/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2121/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2122fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2123 use crate::sse::SseError as E;
2124 match e {
2125 E::WrongCustomerKey => S3Error::with_message(
2126 S3ErrorCode::AccessDenied,
2127 "SSE-C key does not match the key used at PUT time",
2128 ),
2129 E::InvalidCustomerKey { reason } => {
2130 S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2131 }
2132 E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2133 S3ErrorCode::InvalidArgument,
2134 format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2135 ),
2136 E::CustomerKeyRequired => S3Error::with_message(
2137 S3ErrorCode::InvalidRequest,
2138 "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2139 ),
2140 E::CustomerKeyUnexpected => S3Error::with_message(
2141 S3ErrorCode::InvalidRequest,
2142 "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2143 ),
2144 other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2145 }
2146}
2147
2148fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2149 let m = metadata.as_ref()?;
2150 let codec = m
2151 .get(META_CODEC)
2152 .and_then(|s| s.parse::<CodecKind>().ok())?;
2153 let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2154 let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2155 let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2156 Some(ChunkManifest {
2157 codec,
2158 original_size,
2159 compressed_size,
2160 crc32c,
2161 })
2162}
2163
2164fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2165 let meta = metadata.get_or_insert_with(Default::default);
2166 meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2167 meta.insert(
2168 META_ORIGINAL_SIZE.into(),
2169 manifest.original_size.to_string(),
2170 );
2171 meta.insert(
2172 META_COMPRESSED_SIZE.into(),
2173 manifest.compressed_size.to_string(),
2174 );
2175 meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2176}
2177
2178fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2179 move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2180}
2181
2182/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2183/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2184/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2185/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2186/// message that includes the original error context.
2187fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2188 use crate::select::SelectError;
2189 match e {
2190 SelectError::Parse(msg) => S3Error::with_message(
2191 S3ErrorCode::InvalidRequest,
2192 format!("SQL parse error: {msg}"),
2193 ),
2194 SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2195 S3ErrorCode::InvalidRequest,
2196 format!("unsupported SQL feature: {msg}"),
2197 ),
2198 SelectError::RowEval(msg) => S3Error::with_message(
2199 S3ErrorCode::InvalidRequest,
2200 format!("SQL row evaluation error: {msg}"),
2201 ),
2202 SelectError::InputFormat(msg) => S3Error::with_message(
2203 S3ErrorCode::InvalidRequest,
2204 format!("{fmt} input format error: {msg}"),
2205 ),
2206 }
2207}
2208
2209/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2210/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2211/// (including missing) is treated as `false`.
2212fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2213 headers
2214 .get("x-amz-bypass-governance-retention")
2215 .and_then(|v| v.to_str().ok())
2216 .map(|s| s.eq_ignore_ascii_case("true"))
2217 .unwrap_or(false)
2218}
2219
2220/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2221/// as an RFC3339 string and re-parsing through `chrono`. The string format
2222/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2223/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2224/// or the value is outside `chrono`'s supported range.
2225fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2226 let mut buf = Vec::new();
2227 ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2228 .ok()?;
2229 let s = std::str::from_utf8(&buf).ok()?;
2230 chrono::DateTime::parse_from_rfc3339(s)
2231 .ok()
2232 .map(|dt| dt.with_timezone(&chrono::Utc))
2233}
2234
2235/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2236/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2237fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2238 // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2239 // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2240 // unexpected happens — we never produce malformed strings, so this
2241 // branch is unreachable in practice.
2242 let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2243 Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2244}
2245
2246/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2247/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2248/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2249/// the field optional but always populates it on response.
2250fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2251 set.iter()
2252 .map(|(k, v)| Tag {
2253 key: Some(k.clone()),
2254 value: Some(v.clone()),
2255 })
2256 .collect()
2257}
2258
2259/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2260/// keys / values become empty strings (mirrors AWS, which rejects
2261/// `<Key/>` with InvalidTag at the parser layer; downstream
2262/// `TagSet::validate` then enforces our size limits).
2263fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2264 let pairs = tags
2265 .iter()
2266 .map(|t| {
2267 (
2268 t.key.clone().unwrap_or_default(),
2269 t.value.clone().unwrap_or_default(),
2270 )
2271 })
2272 .collect();
2273 crate::tagging::TagSet::from_pairs(pairs)
2274}
2275
2276/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2277/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2278/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2279pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2280 if total == 0 {
2281 return Err("cannot range-get zero-length object".into());
2282 }
2283 match range {
2284 s3s::dto::Range::Int { first, last } => {
2285 let start = *first;
2286 let end_inclusive = match last {
2287 Some(l) => (*l).min(total - 1),
2288 None => total - 1,
2289 };
2290 if start > end_inclusive || start >= total {
2291 return Err(format!(
2292 "range bytes={start}-{:?} out of object size {total}",
2293 last
2294 ));
2295 }
2296 Ok((start, end_inclusive + 1))
2297 }
2298 s3s::dto::Range::Suffix { length } => {
2299 let len = (*length).min(total);
2300 Ok((total - len, total))
2301 }
2302 }
2303}
2304
2305#[async_trait::async_trait]
2306impl<B: S3> S3 for S4Service<B> {
2307 // === 圧縮を挟む path (PUT) ===
2308 #[tracing::instrument(
2309 name = "s4.put_object",
2310 skip(self, req),
2311 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2312 )]
2313 async fn put_object(
2314 &self,
2315 mut req: S3Request<PutObjectInput>,
2316 ) -> S3Result<S3Response<PutObjectOutput>> {
2317 let put_start = Instant::now();
2318 let put_bucket = req.input.bucket.clone();
2319 let put_key = req.input.key.clone();
2320 // v0.8.15 M-1: reject user PUTs targeting reserved sidecar
2321 // names (`<key>.s4index`). Without this gate, a user
2322 // uploading `report.s4index` would have their object silently
2323 // hidden from `ListObjectsV2` (the list filter strips the
2324 // `.s4index` suffix) and risk being deleted by the sidecar-
2325 // cleanup path on a sibling DeleteObject. Fail fast with the
2326 // AWS-canonical `InvalidObjectName` code.
2327 if s4_codec::index::is_reserved_sidecar_key(&put_key) {
2328 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
2329 .unwrap_or(S3ErrorCode::InvalidArgument);
2330 return Err(S3Error::with_message(
2331 code,
2332 format!(
2333 "object key {put_key:?} is reserved (suffix `{}` is used for S4 internal \
2334 sidecars); pick a different key",
2335 s4_codec::index::SIDECAR_SUFFIX,
2336 ),
2337 ));
2338 }
2339 let access_preamble = self.access_log_preamble(&req);
2340 self.enforce_rate_limit(&req, &put_bucket)?;
2341 // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2342 // the IAM policy gate sees the request's tags via
2343 // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2344 // resolved from the Tagging manager (when wired) so
2345 // `s3:ExistingObjectTag/<key>` works on overwrite.
2346 let request_tags: Option<crate::tagging::TagSet> = req
2347 .input
2348 .tagging
2349 .as_deref()
2350 .map(crate::tagging::parse_tagging_header)
2351 .transpose()
2352 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2353 let existing_tags: Option<crate::tagging::TagSet> = self
2354 .tagging
2355 .as_ref()
2356 .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2357 self.enforce_policy_with_extra(
2358 &req,
2359 "s3:PutObject",
2360 &put_bucket,
2361 Some(&put_key),
2362 request_tags.as_ref(),
2363 existing_tags.as_ref(),
2364 )?;
2365 // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2366 // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2367 // bucket PUTs are exempt because they materialise a fresh
2368 // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2369 // locked version's bytes are untouched. The check mirrors the
2370 // delete path (Compliance never bypassable, Governance via the
2371 // bypass header, legal hold never).
2372 if let Some(mgr) = self.object_lock.as_ref()
2373 && let Some(state) = mgr.get(&put_bucket, &put_key)
2374 {
2375 let bucket_versioned_enabled = self
2376 .versioning
2377 .as_ref()
2378 .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2379 .unwrap_or(false);
2380 if !bucket_versioned_enabled {
2381 let bypass = parse_bypass_governance_header(&req.headers);
2382 let now = chrono::Utc::now();
2383 if !state.can_delete(now, bypass) {
2384 crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2385 return Err(S3Error::with_message(
2386 S3ErrorCode::AccessDenied,
2387 "Access Denied because object protected by object lock",
2388 ));
2389 }
2390 }
2391 }
2392 // v0.5 #30: per-PUT explicit retention / legal hold (S3
2393 // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2394 // `x-amz-object-lock-legal-hold`). Captured before the body
2395 // moves into the backend; persisted into the manager only on
2396 // backend success below.
2397 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2398 .input
2399 .object_lock_mode
2400 .as_ref()
2401 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2402 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2403 .input
2404 .object_lock_retain_until_date
2405 .as_ref()
2406 .and_then(timestamp_to_chrono_utc);
2407 let explicit_legal_hold_on: Option<bool> = req
2408 .input
2409 .object_lock_legal_hold_status
2410 .as_ref()
2411 .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2412 if let Some(blob) = req.input.body.take() {
2413 // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2414 // compress fast path、そうでなければ従来の collect-then-compress。
2415 let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2416 .await
2417 .map_err(internal("peek put sample"))?;
2418 let sample_len = sample.len().min(SAMPLE_BYTES);
2419 // v0.8 #56: pass the request's Content-Length (when present) so
2420 // the sampling dispatcher can promote large objects to a GPU
2421 // codec. Chunked transfers (no Content-Length) keep CPU.
2422 let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2423 let kind = self
2424 .dispatcher
2425 .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2426 .await;
2427
2428 // Passthrough buys nothing from S4F2 wrapping (no compression =
2429 // no per-chunk frame to skip past) and the +28-byte header
2430 // overhead breaks size-sensitive callers that expect a true
2431 // pass-through. So passthrough always uses the legacy raw-blob
2432 // path; only compressing codecs go through the framed path.
2433 //
2434 // v0.8.14 follow-up to #127 MED-B: the previous attempt
2435 // forced the buffered path whenever the client supplied
2436 // any whole-body checksum so `verify_client_body_checksums`
2437 // could run. Modern AWS SDKs auto-add an
2438 // `x-amz-checksum-crc32` trailer by default, which made
2439 // every SDK PUT lose the streaming-framed path and
2440 // therefore lose its sidecar — silent data path
2441 // regression caught by
2442 // `range_get_falls_back_to_full_when_sidecar_etag_stale`
2443 // and `upload_part_copy_propagates_source_version_id`
2444 // on the MinIO E2E job. The streaming PUT path now
2445 // passes through unchanged; client-supplied checksums on
2446 // streaming PUTs are NOT verified (same fail-open as
2447 // pre-v0.8.12). The buffered PUT branch and UploadPart
2448 // do verify, which covers the buffered upload case the
2449 // HIGH-12 audit was scoped to. True streaming verify
2450 // (tee-into-hasher on the chained input) remains the
2451 // tracked follow-up.
2452 let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2453 let (compressed, manifest, is_framed) = if use_framed {
2454 // streaming fast path: input は memory に collect しない
2455 let chained = chain_sample_with_rest(sample, rest_stream);
2456 debug!(
2457 bucket = ?req.input.bucket,
2458 key = ?req.input.key,
2459 codec = kind.as_str(),
2460 path = "streaming-framed",
2461 "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2462 );
2463 // v0.4 #16: pick the chunk size based on the request's
2464 // Content-Length when known, falling back to the 4 MiB
2465 // default for chunked transfers.
2466 let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2467 // v0.8.4 #73 M2: pass the request's Content-Length so
2468 // streaming_compress_to_frames can fail-fast on a mid-PUT
2469 // truncation (client disconnect after sending half the
2470 // body). `None` is the chunked-Transfer-Encoding case
2471 // where the upstream genuinely doesn't know the size and
2472 // the backend's framing layer is the only truncation
2473 // signal we have.
2474 let expected_input_size =
2475 req.input.content_length.and_then(|n| u64::try_from(n).ok());
2476 let (body, manifest) = streaming_compress_to_frames(
2477 chained,
2478 Arc::clone(&self.registry),
2479 kind,
2480 chunk_size,
2481 expected_input_size,
2482 )
2483 .await
2484 .map_err(|e| match e {
2485 s4_codec::CodecError::TruncatedStream { expected, got } => {
2486 // 400 IncompleteBody: client advertised N bytes
2487 // but disconnected after `got`. Mirrors AWS S3's
2488 // canonical error code for the same shape so SDK
2489 // retries kick in instead of treating the PUT as
2490 // a successful upload of a half-body.
2491 S3Error::with_message(
2492 S3ErrorCode::IncompleteBody,
2493 format!("PUT body truncated: expected {expected} bytes, got {got}"),
2494 )
2495 }
2496 // v0.8.15 M-4: 400
2497 // `RequestBodyLengthMismatch` for over-length
2498 // bodies. AWS S3 returns this when the declared
2499 // `Content-Length` is smaller than the wire body;
2500 // S4 used to silently accept the surplus bytes.
2501 // `IncompleteBody` is the closest typed variant
2502 // in the s3s enum — we widen the message so the
2503 // SDK / curl side sees the shape unambiguously.
2504 s4_codec::CodecError::OverlengthStream { expected, got } => {
2505 let code = S3ErrorCode::from_bytes(b"RequestBodyLengthMismatch")
2506 .unwrap_or(S3ErrorCode::IncompleteBody);
2507 S3Error::with_message(
2508 code,
2509 format!(
2510 "PUT body length mismatch: Content-Length declared {expected} \
2511 bytes, body carried at least {got}"
2512 ),
2513 )
2514 }
2515 other => internal("streaming framed compress")(other),
2516 })?;
2517 (body, manifest, true)
2518 } else {
2519 // GPU codec 等で streaming-aware でないものは bytes-buffered path
2520 // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
2521 let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
2522 .await
2523 .map_err(internal("collect put body (buffered path)"))?;
                // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
                // checksum algorithms against the received body on
                // the buffered path. The streaming-framed branch
                // above no longer redirects here when a checksum
                // header is present (the #127 MED-B redirect was
                // reverted in v0.8.14), so client-supplied checksums
                // are verified only for PUTs that take this buffered
                // branch.
2530 verify_client_body_checksums(
2531 &bytes,
2532 req.input.content_md5.as_deref(),
2533 req.input.checksum_crc32.as_deref(),
2534 req.input.checksum_crc32c.as_deref(),
2535 req.input.checksum_sha1.as_deref(),
2536 req.input.checksum_sha256.as_deref(),
2537 req.input.checksum_crc64nvme.as_deref(),
2538 )?;
2539 debug!(
2540 bucket = ?req.input.bucket,
2541 key = ?req.input.key,
2542 bytes = bytes.len(),
2543 codec = kind.as_str(),
2544 path = "buffered",
2545 "S4 put_object: compressing (buffered, raw blob)"
2546 );
2547 // v0.8 #55: telemetry-returning compress so we can stamp
2548 // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
2549 // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
2550 // CPU codecs come back with `gpu_seconds = None` and the
2551 // stamp helper short-circuits — no extra cost on CPU path.
2552 let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
2553 stamp_gpu_compress_telemetry(&tel);
2554 let (body, m) = compress_res.map_err(internal("registry compress"))?;
2555 (body, m, false)
2556 };
2557
2558 write_manifest(&mut req.input.metadata, &manifest);
2559 if is_framed {
2560 // v0.2 #4: framed body であることを GET 側に伝える meta flag。
2561 req.input
2562 .metadata
2563 .get_or_insert_with(Default::default)
2564 .insert(META_FRAMED.into(), "true".into());
2565 }
2566 // 重要: content_length を圧縮後サイズで更新する。
2567 // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
2568 // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
2569 req.input.content_length = Some(compressed.len() as i64);
2570 // body を書き換えたので、客側が送ってきた original body 用の
2571 // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
2572 // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
2573 // ChunkManifest.crc32c で担保している。
2574 req.input.checksum_algorithm = None;
2575 req.input.checksum_crc32 = None;
2576 req.input.checksum_crc32c = None;
2577 req.input.checksum_crc64nvme = None;
2578 req.input.checksum_sha1 = None;
2579 req.input.checksum_sha256 = None;
2580 req.input.content_md5 = None;
2581 let original_size = manifest.original_size;
2582 let compressed_size = manifest.compressed_size;
2583 let codec_label = manifest.codec.as_str();
2584 // (sidecar_index is built below, after the SSE-mode
2585 // extraction, so v0.8.12 HIGH-10 can short-circuit the
2586 // build when the on-disk bytes are about to be encrypted.)
2587 // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
2588 // Precedence:
2589 // - SSE-C headers present → per-request customer key (S4E3)
2590 // - server-managed keyring configured → active key (S4E2)
2591 // - neither → no encryption (raw compressed body)
2592 // The `s4-encrypted: aes-256-gcm` metadata flag is set in
2593 // both encrypted modes; the on-disk frame magic distinguishes
2594 // S4E1 / S4E2 / S4E3 so GET picks the right decrypt path.
2595 // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
2596 // so the encryption headers are NOT forwarded to the
2597 // backend. S4 owns the encrypt-then-store contract; if we
2598 // leave the headers in place, real S3-compat backends
2599 // (MinIO / AWS) try to apply their own SSE on top and
2600 // either reject (MinIO requires HTTPS for SSE-C) or fail
2601 // (MinIO has no KMS configured). MemoryBackend ignored
2602 // these so mock tests passed.
2603 let sse_c_alg = req.input.sse_customer_algorithm.take();
2604 let sse_c_key = req.input.sse_customer_key.take();
2605 let sse_c_md5 = req.input.sse_customer_key_md5.take();
2606 let sse_header = req.input.server_side_encryption.take();
2607 let sse_kms_key = req.input.ssekms_key_id.take();
2608 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
2609 // v0.5 #28: SSE-KMS request? Resolves to None unless the
2610 // request asks for `aws:kms` AND a key id is available
2611 // (explicit header or gateway default). When set, we'll
2612 // generate a per-object DEK below.
2613 let kms_key_id = extract_kms_key_id(
2614 &sse_header,
2615 &sse_kms_key,
2616 self.kms_default_key_id.as_deref(),
2617 );
2618 // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
2619 // pre-encrypt `compressed` body, but the bytes the
2620 // backend stores when any SSE mode is active are
2621 // *post-encrypt* (different length, different layout).
2622 // A Range GET on an SSE-encrypted object would slice the
2623 // ciphertext at the stale offsets, hand the wrong bytes
2624 // to the frame parser, and 500. Suppress the sidecar
2625 // entirely when SSE is going to be applied below;
2626 // encrypted-object Range GET falls back to the buffered
2627 // path (decrypt full body → frame parse → slice), trading
2628 // partial-fetch performance for correctness. An
2629 // encryption-aware sidecar format is a follow-up issue.
2630 let will_encrypt =
2631 sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
2632 let sidecar_index = if is_framed && !will_encrypt {
2633 s4_codec::index::build_index_from_body(&compressed).ok()
2634 } else {
2635 None
2636 };
2637 // v0.5 #32: in compliance-strict mode, every PUT must
2638 // declare SSE — either client-supplied (SSE-C), KMS, or by
2639 // virtue of a server-side keyring being configured (which
2640 // applies SSE-S4 to every PUT automatically). Requests that
2641 // would otherwise land as plain compressed bytes are
2642 // rejected with 400 InvalidRequest.
2643 if self.compliance_strict
2644 && sse_c_material.is_none()
2645 && kms_key_id.is_none()
2646 && self.sse_keyring.is_none()
2647 && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
2648 {
2649 return Err(S3Error::with_message(
2650 S3ErrorCode::InvalidRequest,
2651 "compliance-mode strict: PUT must include x-amz-server-side-encryption \
2652 (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
2653 ));
2654 }
            // SSE-C and SSE-KMS are mutually exclusive on a single PUT
            // (AWS S3 returns 400 InvalidArgument). Reject the request
            // instead of silently preferring one mode over the other.
2657 if sse_c_material.is_some() && kms_key_id.is_some() {
2658 return Err(S3Error::with_message(
2659 S3ErrorCode::InvalidArgument,
2660 "SSE-C and SSE-KMS cannot be used together on the same PUT",
2661 ));
2662 }
2663 // KMS path needs to call generate_dek().await before the
2664 // body_to_send branch; capture the result here.
2665 //
2666 // v0.8.1 #58: the plaintext DEK lives in three places
2667 // during one PUT:
2668 //
2669 // 1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
2670 // — wiped when the binding `dek` falls out of scope at
2671 // the end of this `if`-arm.
2672 // 2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
2673 // — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
2674 // the outer `kms_wrap` `Option` is dropped at the end
2675 // of `put_object`.
2676 // 3. AES-GCM internal key state inside the `aes-gcm`
2677 // crate during `encrypt_with_source` — out of scope
2678 // for this fix; tracked separately in v0.8.2.
2679 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
2680 if let Some(ref key_id) = kms_key_id {
2681 let kms = self.kms.as_ref().ok_or_else(|| {
2682 S3Error::with_message(
2683 S3ErrorCode::InvalidRequest,
2684 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
2685 )
2686 })?;
2687 // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
2688 // works unchanged via `Deref<Target=Vec<u8>>`.
2689 let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
2690 if dek.len() != 32 {
2691 return Err(S3Error::with_message(
2692 S3ErrorCode::InternalError,
2693 format!(
2694 "KMS backend returned a DEK of {} bytes (expected 32)",
2695 dek.len()
2696 ),
2697 ));
2698 }
2699 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
2700 zeroize::Zeroizing::new([0u8; 32]);
2701 dek_arr.copy_from_slice(&dek);
2702 // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
2703 // end of this scope, wiping the heap allocation.
2704 Some((dek_arr, wrapped))
2705 } else {
2706 None
2707 };
2708 // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
2709 // alongside `s4-encrypted` so HEAD (which doesn't fetch the
2710 // body) can echo the correct `x-amz-server-side-encryption`
2711 // value. Without this, HEAD on an SSE-KMS object would not
2712 // echo `aws:kms` because the frame magic is only available
2713 // on the body (which HEAD doesn't read).
2714 let body_to_send = if let Some(ref m) = sse_c_material {
2715 let meta = req.input.metadata.get_or_insert_with(Default::default);
2716 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2717 meta.insert("s4-sse-type".into(), "AES256".into());
2718 meta.insert(
2719 "s4-sse-c-key-md5".into(),
2720 base64::engine::general_purpose::STANDARD.encode(m.key_md5),
2721 );
2722 crate::sse::encrypt_with_source(
2723 &compressed,
2724 crate::sse::SseSource::CustomerKey {
2725 key: &m.key,
2726 key_md5: &m.key_md5,
2727 },
2728 )
2729 } else if let Some((ref dek, ref wrapped)) = kms_wrap {
2730 let meta = req.input.metadata.get_or_insert_with(Default::default);
2731 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2732 meta.insert("s4-sse-type".into(), "aws:kms".into());
2733 meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
2734 // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
2735 // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
2736 // `&T` here via `Deref<Target=T>`, so the binding picks
2737 // up the inner array reference without copying. The array
2738 // stays in the `Zeroizing` wrapper that owns it and gets
2739 // wiped when `kms_wrap` drops at the end of `put_object`.
2740 let dek_ref: &[u8; 32] = dek;
2741 crate::sse::encrypt_with_source(
2742 &compressed,
2743 crate::sse::SseSource::Kms {
2744 dek: dek_ref,
2745 wrapped,
2746 },
2747 )
2748 } else if let Some(keyring) = self.sse_keyring.as_ref() {
2749 // SSE-S4 is server-driven transparent encryption; the
2750 // client didn't ask for SSE. We stamp `s4-encrypted`
2751 // (internal flag the GET path needs) but deliberately
2752 // do NOT stamp `s4-sse-type` — that lights up the HEAD
2753 // echo of `x-amz-server-side-encryption: AES256`,
2754 // which would falsely advertise AWS-style SSE-S3
2755 // semantics the operator didn't request.
2756 let meta = req.input.metadata.get_or_insert_with(Default::default);
2757 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2758 // v0.8 #52: when `--sse-chunk-size > 0` is configured,
2759 // emit the chunked S4E5 frame so the matching GET can
2760 // stream-decrypt instead of buffering 5 GiB before
2761 // emitting a byte. Falls back to the buffered S4E2
2762 // frame at chunk_size=0 (default) so existing
2763 // deployments are bit-for-bit unchanged.
2764 if self.sse_chunk_size > 0 {
2765 crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
2766 .map_err(|e| {
2767 S3Error::with_message(
2768 S3ErrorCode::InternalError,
2769 format!("SSE-S4 chunked encrypt failed: {e}"),
2770 )
2771 })?
2772 } else {
2773 crate::sse::encrypt_v2(&compressed, keyring)
2774 }
2775 } else {
2776 compressed.clone()
2777 };
2778 // v0.6 #40: capture the about-to-be-sent body + metadata so
2779 // the replication dispatcher (run after the source PUT
2780 // succeeds) can hand the same backend bytes to the
2781 // destination bucket. `Bytes` clone is cheap (refcounted).
2782 let replication_body = body_to_send.clone();
2783 let replication_metadata = req.input.metadata.clone();
2784 // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
2785 // makes the body longer than the post-compression bytes
2786 // (header + nonce + tag overhead). The earlier
2787 // content_length stamp at compressed.len() is now stale, so
2788 // re-stamp from the actual bytes about to be sent or the
2789 // backend (real S3 / MinIO) rejects with
2790 // `StreamLengthMismatch`. MemoryBackend never validated
2791 // this, which is why mock-only tests passed.
2792 req.input.content_length = Some(body_to_send.len() as i64);
2793 req.input.body = Some(bytes_to_blob(body_to_send));
2794 // v0.5 #34: pre-allocate a version-id when the bucket is
2795 // Enabled, then redirect the backend storage key to the
2796 // shadow path so older versions survive newer PUTs.
2797 // Suspended / Unversioned buckets keep using the plain
2798 // `<key>` (S3 spec: Suspended overwrites the same backend
2799 // object). Pre-allocation (instead of recording after PUT)
2800 // ensures the shadow key + the response's
2801 // `x-amz-version-id` use the same vid.
2802 let pending_version: Option<crate::versioning::PutOutcome> = self
2803 .versioning
2804 .as_ref()
2805 .map(|mgr| mgr.state(&put_bucket))
2806 .map(|state| match state {
2807 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
2808 version_id: crate::versioning::VersioningManager::new_version_id(),
2809 versioned_response: true,
2810 },
2811 crate::versioning::VersioningState::Suspended
2812 | crate::versioning::VersioningState::Unversioned => {
2813 crate::versioning::PutOutcome {
2814 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
2815 versioned_response: false,
2816 }
2817 }
2818 });
2819 if let Some(ref pv) = pending_version
2820 && pv.versioned_response
2821 {
2822 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
2823 }
2824 // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
2825 // the move into `req.input` is consumed by the backend call.
2826 // The sidecar's `source_compressed_size` is checked against
2827 // the live HEAD `Content-Length` on Range GET to detect a
2828 // backend-side mutation.
2829 let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2830 let mut backend_resp = self.backend.put_object(req).await;
2831 if let Some(mut idx) = sidecar_index
2832 && let Ok(ref resp) = backend_resp
2833 && idx.entries.len() > 1
2834 {
2835 // 1 chunk しかない (small object) なら sidecar は意味がない (=
2836 // partial fetch しても full body と同じ範囲) ので省略。
2837 // Sidecar は user-visible key で書く (latest version の
2838 // partial fetch path 用)。Old versions の Range GET は今 task
2839 // の scope 外 (full read fallback でも意味的には正しい)。
2840 //
                // v0.8.4 #73 H-2: stamp the version-binding fields the
                // GET path needs to detect a stale / attacker-written
                // sidecar. ETag comes from the backend's PUT response —
                // when missing (some backends don't return an ETag) no
                // identifier is synthesized here: `source_etag` is
                // stored as `None`, and the GET-side HEAD is expected
                // to see the same absent ETag (None vs None) and treat
                // the pair as consistent.
2849 let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
2850 idx.source_etag = source_etag;
2851 idx.source_compressed_size = backend_object_size;
2852 self.write_sidecar(&put_bucket, &put_key, &idx).await;
2853 }
2854 // v0.5 #34: commit the new version into the manager only on
2855 // backend success. Use the pre-allocated vid so the response
2856 // header and the chain entry agree.
2857 if let (Some(mgr), Some(pv), Ok(resp)) = (
2858 self.versioning.as_ref(),
2859 pending_version.as_ref(),
2860 backend_resp.as_mut(),
2861 ) {
2862 let etag = resp
2863 .output
2864 .e_tag
2865 .clone()
2866 .map(ETag::into_value)
2867 .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
2868 let now = chrono::Utc::now();
2869 mgr.commit_put_with_version(
2870 &put_bucket,
2871 &put_key,
2872 crate::versioning::VersionEntry {
2873 version_id: pv.version_id.clone(),
2874 etag,
2875 size: original_size,
2876 is_delete_marker: false,
2877 created_at: now,
2878 },
2879 );
2880 if pv.versioned_response {
2881 resp.output.version_id = Some(pv.version_id.clone());
2882 }
2883 }
2884 // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
2885 // so the client knows the server actually applied the
2886 // requested algorithm and which key fingerprint matched.
2887 if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
2888 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
2889 resp.output.sse_customer_key_md5 =
2890 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
2891 }
2892 // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
2893 // the backend returned (AWS KMS returns the ARN even when
2894 // the request used an alias).
2895 if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
2896 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
2897 ServerSideEncryption::AWS_KMS,
2898 ));
2899 resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
2900 }
2901 // v0.5 #30: persist any per-PUT explicit retention / legal
2902 // hold the client supplied, then auto-apply the bucket
2903 // default (no-op when state is already populated). The
2904 // explicit fields take precedence — the bucket-default
2905 // helper bails out as soon as it sees any retention.
2906 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
2907 if explicit_lock_mode.is_some()
2908 || explicit_retain_until.is_some()
2909 || explicit_legal_hold_on.is_some()
2910 {
2911 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
2912 if let Some(m) = explicit_lock_mode {
2913 state.mode = Some(m);
2914 }
2915 if let Some(u) = explicit_retain_until {
2916 state.retain_until = Some(u);
2917 }
2918 if let Some(lh) = explicit_legal_hold_on {
2919 state.legal_hold_on = lh;
2920 }
2921 mgr.set(&put_bucket, &put_key, state);
2922 }
2923 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
2924 }
2925 let _ = (original_size, compressed_size); // mute unused warnings
2926 let elapsed = put_start.elapsed();
2927 crate::metrics::record_put(
2928 codec_label,
2929 original_size,
2930 compressed_size,
2931 elapsed.as_secs_f64(),
2932 backend_resp.is_ok(),
2933 );
2934 // v0.4 #20: structured access-log entry (best-effort).
2935 self.record_access(
2936 access_preamble,
2937 "REST.PUT.OBJECT",
2938 &put_bucket,
2939 Some(&put_key),
2940 if backend_resp.is_ok() { 200 } else { 500 },
2941 compressed_size,
2942 original_size,
2943 elapsed.as_millis() as u64,
2944 backend_resp.as_ref().err().map(|e| e.code().as_str()),
2945 )
2946 .await;
2947 info!(
2948 op = "put_object",
2949 bucket = %put_bucket,
2950 key = %put_key,
2951 codec = codec_label,
2952 bytes_in = original_size,
2953 bytes_out = compressed_size,
2954 ratio = format!(
2955 "{:.3}",
2956 if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
2957 ),
2958 latency_ms = elapsed.as_millis() as u64,
2959 ok = backend_resp.is_ok(),
2960 "S4 put completed"
2961 );
2962 // v0.6 #35: fire bucket-notification destinations (best-effort,
2963 // detached). Skipped when no manager is attached or when the
2964 // bucket has no rule matching `s3:ObjectCreated:Put` for this
2965 // key.
2966 if backend_resp.is_ok()
2967 && let Some(mgr) = self.notifications.as_ref()
2968 {
2969 let dests = mgr.match_destinations(
2970 &put_bucket,
2971 &crate::notifications::EventType::ObjectCreatedPut,
2972 &put_key,
2973 );
2974 if !dests.is_empty() {
2975 let etag = backend_resp
2976 .as_ref()
2977 .ok()
2978 .and_then(|r| r.output.e_tag.clone())
2979 .map(ETag::into_value);
2980 let version_id = pending_version
2981 .as_ref()
2982 .filter(|pv| pv.versioned_response)
2983 .map(|pv| pv.version_id.clone());
2984 tokio::spawn(crate::notifications::dispatch_event(
2985 Arc::clone(mgr),
2986 put_bucket.clone(),
2987 put_key.clone(),
2988 crate::notifications::EventType::ObjectCreatedPut,
2989 Some(original_size),
2990 etag,
2991 version_id,
2992 format!("S4-{}", uuid::Uuid::new_v4()),
2993 ));
2994 }
2995 }
2996 // v0.6 #39: persist parsed `x-amz-tagging` tags into the
2997 // tagging manager on a successful PUT. AWS PutObject's
2998 // tagging is a full-replace operation (not a merge), so
2999 // any pre-existing entry for `(bucket, key)` is overwritten.
3000 if backend_resp.is_ok()
3001 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3002 {
3003 mgr.put_object_tags(&put_bucket, &put_key, tags);
3004 }
3005 // v0.6 #40: cross-bucket replication fire-point. On
3006 // successful source PUT, consult the replication manager;
3007 // when an enabled rule matches, mark the source key
3008 // `Pending` and spawn a detached task that PUTs the same
3009 // backend bytes + metadata to the rule's destination
3010 // bucket. The dispatcher itself records `Completed` /
3011 // `Failed` and bumps the drop counter on retry-budget
3012 // exhaustion.
3013 self.spawn_replication_if_matched(
3014 &put_bucket,
3015 &put_key,
3016 &request_tags,
3017 &replication_body,
3018 &replication_metadata,
3019 backend_resp.is_ok(),
3020 pending_version.as_ref(),
3021 );
3022 return backend_resp;
3023 }
3024 // Body-less PUT (rare: zero-length object). Mirror the body-full
3025 // versioning hooks so list_object_versions / GET-by-version still see
3026 // empty-body objects in the chain.
3027 let pending_version: Option<crate::versioning::PutOutcome> = self
3028 .versioning
3029 .as_ref()
3030 .map(|mgr| mgr.state(&put_bucket))
3031 .map(|state| match state {
3032 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3033 version_id: crate::versioning::VersioningManager::new_version_id(),
3034 versioned_response: true,
3035 },
3036 _ => crate::versioning::PutOutcome {
3037 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3038 versioned_response: false,
3039 },
3040 });
3041 if let Some(ref pv) = pending_version
3042 && pv.versioned_response
3043 {
3044 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3045 }
3046 let mut backend_resp = self.backend.put_object(req).await;
3047 if let (Some(mgr), Some(pv), Ok(resp)) = (
3048 self.versioning.as_ref(),
3049 pending_version.as_ref(),
3050 backend_resp.as_mut(),
3051 ) {
3052 let etag = resp
3053 .output
3054 .e_tag
3055 .clone()
3056 .map(ETag::into_value)
3057 .unwrap_or_default();
3058 let now = chrono::Utc::now();
3059 mgr.commit_put_with_version(
3060 &put_bucket,
3061 &put_key,
3062 crate::versioning::VersionEntry {
3063 version_id: pv.version_id.clone(),
3064 etag,
3065 size: 0,
3066 is_delete_marker: false,
3067 created_at: now,
3068 },
3069 );
3070 if pv.versioned_response {
3071 resp.output.version_id = Some(pv.version_id.clone());
3072 }
3073 }
3074 // v0.5 #30: same explicit-then-default lock-state commit as the
3075 // body-bearing branch above, so a zero-length PUT also picks up
3076 // bucket-default retention.
3077 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3078 if explicit_lock_mode.is_some()
3079 || explicit_retain_until.is_some()
3080 || explicit_legal_hold_on.is_some()
3081 {
3082 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3083 if let Some(m) = explicit_lock_mode {
3084 state.mode = Some(m);
3085 }
3086 if let Some(u) = explicit_retain_until {
3087 state.retain_until = Some(u);
3088 }
3089 if let Some(lh) = explicit_legal_hold_on {
3090 state.legal_hold_on = lh;
3091 }
3092 mgr.set(&put_bucket, &put_key, state);
3093 }
3094 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3095 }
3096 // v0.6 #35: same notification fire-point as the body-bearing PUT
3097 // branch above (zero-length objects still match `ObjectCreated:Put`
3098 // rules per the AWS event taxonomy).
3099 if backend_resp.is_ok()
3100 && let Some(mgr) = self.notifications.as_ref()
3101 {
3102 let dests = mgr.match_destinations(
3103 &put_bucket,
3104 &crate::notifications::EventType::ObjectCreatedPut,
3105 &put_key,
3106 );
3107 if !dests.is_empty() {
3108 let etag = backend_resp
3109 .as_ref()
3110 .ok()
3111 .and_then(|r| r.output.e_tag.clone())
3112 .map(ETag::into_value);
3113 let version_id = pending_version
3114 .as_ref()
3115 .filter(|pv| pv.versioned_response)
3116 .map(|pv| pv.version_id.clone());
3117 tokio::spawn(crate::notifications::dispatch_event(
3118 Arc::clone(mgr),
3119 put_bucket.clone(),
3120 put_key.clone(),
3121 crate::notifications::EventType::ObjectCreatedPut,
3122 Some(0),
3123 etag,
3124 version_id,
3125 format!("S4-{}", uuid::Uuid::new_v4()),
3126 ));
3127 }
3128 }
3129 // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3130 // (zero-length) PUT branch too — same shape as the body-bearing
3131 // branch above.
3132 if backend_resp.is_ok()
3133 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3134 {
3135 mgr.put_object_tags(&put_bucket, &put_key, tags);
3136 }
3137 // v0.6 #40: cross-bucket replication for the zero-length PUT
3138 // branch — same shape as the body-bearing branch above.
3139 // v0.8.2 #61: pass `pending_version` so a versioned source's
3140 // destination receives the same shadow-key path.
3141 self.spawn_replication_if_matched(
3142 &put_bucket,
3143 &put_key,
3144 &request_tags,
3145 &bytes::Bytes::new(),
3146 &None,
3147 backend_resp.is_ok(),
3148 pending_version.as_ref(),
3149 );
3150 backend_resp
3151 }
3152
// === Decompression path (GET) ===
3154 #[tracing::instrument(
3155 name = "s4.get_object",
3156 skip(self, req),
3157 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3158 )]
3159 async fn get_object(
3160 &self,
3161 mut req: S3Request<GetObjectInput>,
3162 ) -> S3Result<S3Response<GetObjectOutput>> {
3163 let get_start = Instant::now();
3164 let get_bucket = req.input.bucket.clone();
3165 let get_key = req.input.key.clone();
3166 self.enforce_rate_limit(&req, &get_bucket)?;
3167 self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3168 // Range request の事前検出 (decompress 後 slice する path に使う)。
3169 let range_request = req.input.range.take();
3170 // v0.5 #27: pull SSE-C material from the input headers before
3171 // the request is moved into the backend. A header parse error
3172 // fails fast (no body fetch). The material is consumed below
3173 // when decrypting an S4E3-framed body; the SSE-C headers on
3174 // `req.input` are cleared so the backend doesn't see them.
3175 let sse_c_alg = req.input.sse_customer_algorithm.take();
3176 let sse_c_key = req.input.sse_customer_key.take();
3177 let sse_c_md5 = req.input.sse_customer_key_md5.take();
3178 let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3179
3180 // v0.5 #34: route the GET through the VersioningManager when
3181 // attached AND the bucket is in a versioning-aware state.
3182 // Resolves which version to fetch (explicit `?versionId=` query
3183 // param vs. chain latest), translates a delete-marker into 404
3184 // NoSuchKey, and rewrites the backend storage key to the shadow
3185 // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3186 // versions. `resolved_version_id` is stamped onto the response
3187 // so clients see a coherent `x-amz-version-id` header.
3188 //
3189 // When the bucket is Unversioned (or no manager attached), the
3190 // chain-resolution step is skipped and the request flows
3191 // through the existing single-key path unchanged.
3192 let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3193 Some(mgr)
3194 if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3195 {
3196 let req_vid = req.input.version_id.take();
3197 let entry = match req_vid.as_deref() {
3198 Some(vid) => {
3199 mgr.lookup_version(&get_bucket, &get_key, vid)
3200 .ok_or_else(|| {
3201 S3Error::with_message(
3202 S3ErrorCode::NoSuchVersion,
3203 format!("no such version: {vid}"),
3204 )
3205 })?
3206 }
3207 None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3208 S3Error::with_message(
3209 S3ErrorCode::NoSuchKey,
3210 format!("no such key: {get_key}"),
3211 )
3212 })?,
3213 };
3214 if entry.is_delete_marker {
3215 // S3 spec: GET without versionId on a
3216 // delete-marker latest → 404 NoSuchKey + the
3217 // response carries `x-amz-delete-marker: true`.
3218 // GET with explicit versionId pointing at a delete
3219 // marker → 405 MethodNotAllowed; we surface
3220 // NoSuchKey here for both since s3s collapses them
3221 // into the same not-found error path.
3222 return Err(S3Error::with_message(
3223 S3ErrorCode::NoSuchKey,
3224 format!("delete marker is the current version of {get_key}"),
3225 ));
3226 }
3227 if entry.version_id != crate::versioning::NULL_VERSION_ID {
3228 req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3229 }
3230 Some(entry.version_id)
3231 }
3232 _ => None,
3233 };
3234
3235 // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3236 // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3237 // 必要 frame だけを backend に Range GET し帯域節約する。
3238 //
3239 // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3240 // verify the source object hasn't been overwritten / mutated since
3241 // the sidecar was stamped. The sidecar carries the backend ETag
3242 // captured at PUT time (`source_etag`); a HEAD against the current
3243 // backend object tells us the live ETag. If they disagree we treat
3244 // the sidecar as stale and fall through to the full-GET path —
3245 // returning the wrong frames for a Range request would surface as
3246 // a CRC mismatch deeper in the stack but would also potentially
3247 // disclose unrelated frames if a hostile operator wrote the
3248 // sidecar themselves. Fail-open to "full read" is the safe default.
3249 //
3250 // Legacy v1 sidecars (no `source_etag` populated) keep the old
3251 // best-effort behaviour so existing on-disk indexes don't suddenly
3252 // start missing the partial-fetch path.
3253 if let Some(ref r) = range_request
3254 && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3255 && self
3256 .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3257 .await
3258 {
3259 let total = index.total_original_size();
3260 let (start, end_exclusive) = match resolve_range(r, total) {
3261 Ok(v) => v,
3262 Err(e) => {
3263 return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3264 }
3265 };
3266 if let Some(plan) = index.lookup_range(start, end_exclusive) {
3267 return self
3268 .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3269 .await;
3270 }
3271 }
3272 let mut resp = self.backend.get_object(req).await?;
3273 // v0.5 #34: stamp the resolved version-id so the client sees a
3274 // coherent `x-amz-version-id` header (only for chains owned by
3275 // the manager — Unversioned buckets / no-manager paths never
3276 // set this).
3277 if let Some(ref vid) = resolved_version_id {
3278 resp.output.version_id = Some(vid.clone());
3279 }
3280 let is_multipart = is_multipart_object(&resp.output.metadata);
3281 let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3282 // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3283 // multipart と同じ path に流す。
3284 let needs_frame_parse = is_multipart || is_framed_v2;
3285 let manifest_opt = extract_manifest(&resp.output.metadata);
3286
3287 if !needs_frame_parse && manifest_opt.is_none() {
3288 // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3289 debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3290 return Ok(resp);
3291 }
3292
3293 if let Some(blob) = resp.output.body.take() {
3294 // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3295 // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3296 // before any frame parse / streaming decompress. Encrypted
3297 // bodies are opaque to the codec; this also forces the
3298 // buffered path because AES-GCM needs the full body for tag
3299 // verify. SSE-C uses the per-request customer key, SSE-S4
3300 // falls back to the configured keyring.
3301 let blob = if is_sse_encrypted(&resp.output.metadata) {
3302 let body = collect_blob(blob, self.max_body_bytes)
3303 .await
3304 .map_err(internal("collect SSE-encrypted body"))?;
3305 // v0.5 #28: peek the frame magic to route the right
3306 // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3307 // through the KMS backend (async). S4E1/E2/E3 take
3308 // the sync path (keyring or customer key).
3309 //
3310 // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3311 // SSE-S4 frames take the *streaming* path — we hand
3312 // the response body a per-chunk verify-and-emit
3313 // Stream so the client sees chunk 0 plaintext after
3314 // one chunk-worth of AES-GCM verify (vs. waiting
3315 // for the whole body's tag), and the gateway no
3316 // longer needs to materialize the full plaintext
3317 // in memory before responding. SSE-C is out of
3318 // scope for the chunked path (chunked S4E3 is a
3319 // follow-up), so this branch requires the SSE-S4
3320 // keyring to be wired and `get_sse_c_material` to
3321 // be absent — otherwise we surface a clear
3322 // misconfiguration error instead of silently
3323 // falling through to the buffered chunked path.
3324 // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3325 // only correct when the decrypted body IS the user's
3326 // plaintext as-stored. If the object went through the
3327 // codec (compressed) or carries S4F2 frames, returning
3328 // the decrypt stream directly hands the client
3329 // compressed / framed bytes. Restrict the early-return
3330 // to codec=Passthrough + non-framed objects; everything
3331 // else falls through to the buffered path, which
3332 // decrypt-buffers S4E5/S4E6 via
3333 // `decrypt_chunked_buffered_default` and then runs the
3334 // existing decompress pipeline.
3335 let chunked_streaming_safe = !needs_frame_parse
3336 && manifest_opt
3337 .as_ref()
3338 .map(|m| m.codec == CodecKind::Passthrough)
3339 .unwrap_or(false);
3340 if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3341 && get_sse_c_material.is_none()
3342 && chunked_streaming_safe
3343 {
3344 let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3345 S3Error::with_message(
3346 S3ErrorCode::InvalidRequest,
3347 "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
3348 )
3349 })?;
3350 let body_len = body.len() as u64;
3351 let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
3352 // Stream is `'static` (the keyring borrow is
3353 // consumed up front; the cipher lives inside
3354 // the stream state — see decrypt_chunked_stream
3355 // doc), so we can move it straight into a
3356 // StreamingBlob without lifetime gymnastics.
3357 use futures::StreamExt;
3358 let mapped = stream.map(|r| {
3359 r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
3360 });
3361 use s3s::dto::StreamingBlob;
3362 resp.output.body = Some(StreamingBlob::wrap(mapped));
3363 // Plaintext content_length is unknown until all
3364 // chunks have been verified; null it out so the
3365 // ByteStream wrapper reports `unknown` to the
3366 // HTTP layer (which then emits chunked transfer-
3367 // encoding) rather than lying about the size.
3368 resp.output.content_length = None;
3369 // The backend's checksums + ETag describe the
3370 // encrypted body (S4E5/S4E6 wire format), not
3371 // the plaintext we're about to stream — clear them
3372 // so the AWS SDK doesn't fail the GET with a
3373 // ChecksumMismatch on a successful round-trip.
3374 // Mirrors the streaming-zstd path at L1180-1185.
3375 resp.output.checksum_crc32 = None;
3376 resp.output.checksum_crc32c = None;
3377 resp.output.checksum_crc64nvme = None;
3378 resp.output.checksum_sha1 = None;
3379 resp.output.checksum_sha256 = None;
3380 resp.output.e_tag = None;
3381 let elapsed = get_start.elapsed();
3382 crate::metrics::record_get(
3383 "sse-s4-chunked",
3384 body_len,
3385 body_len,
3386 elapsed.as_secs_f64(),
3387 true,
3388 );
3389 return Ok(resp);
3390 }
3391 let plain = match crate::sse::peek_magic(&body) {
3392 Some("S4E4") => {
3393 let kms = self.kms.as_ref().ok_or_else(|| {
3394 S3Error::with_message(
3395 S3ErrorCode::InvalidRequest,
3396 "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3397 )
3398 })?;
3399 let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
3400 crate::sse::decrypt_with_kms(&body, kms_ref)
3401 .await
3402 .map_err(|e| match e {
3403 crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
3404 other => S3Error::with_message(
3405 S3ErrorCode::InternalError,
3406 format!("SSE-KMS decrypt failed: {other}"),
3407 ),
3408 })?
3409 }
3410 _ => {
3411 if let Some(ref m) = get_sse_c_material {
3412 crate::sse::decrypt(
3413 &body,
3414 crate::sse::SseSource::CustomerKey {
3415 key: &m.key,
3416 key_md5: &m.key_md5,
3417 },
3418 )
3419 .map_err(sse_c_error_to_s3)?
3420 } else {
3421 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
3422 S3Error::with_message(
3423 S3ErrorCode::InvalidRequest,
3424 "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
3425 )
3426 })?;
3427 crate::sse::decrypt(&body, keyring).map_err(|e| {
3428 S3Error::with_message(
3429 S3ErrorCode::InternalError,
3430 format!("SSE-S4 decrypt failed: {e}"),
3431 )
3432 })?
3433 }
3434 }
3435 };
3436 // v0.5 #28: parse out the on-disk wrapped DEK's key id
3437 // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
3438 if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
3439 && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
3440 {
3441 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3442 ServerSideEncryption::AWS_KMS,
3443 ));
3444 resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
3445 }
3446 bytes_to_blob(plain)
3447 } else if let Some(ref m) = get_sse_c_material {
3448 // Client sent SSE-C headers for an unencrypted object —
3449 // mirror AWS S3's 400 InvalidRequest.
3450 let _ = m;
3451 return Err(sse_c_error_to_s3(
3452 crate::sse::SseError::CustomerKeyUnexpected,
3453 ));
3454 } else {
3455 blob
3456 };
3457 // v0.5 #27: SSE-C echo on success — algorithm + key MD5
3458 // tell the client that the supplied key was the one used.
3459 if let Some(ref m) = get_sse_c_material {
3460 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3461 resp.output.sse_customer_key_md5 =
3462 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3463 }
3464 // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
3465 // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
3466 // codec が streaming-aware なら body を chunk-by-chunk で decompress して
3467 // 即座に client に流す。
3468 //
3469 // ただし Range request 時は streaming できない (slice するため total bytes
3470 // が必要) → buffered path に fall through。
3471 if range_request.is_none()
3472 && !needs_frame_parse
3473 && let Some(ref m) = manifest_opt
3474 && supports_streaming_decompress(m.codec)
3475 && m.codec == CodecKind::CpuZstd
3476 {
3477 // v0.8.4 #73 H-1: wrap the decompressor output in a
3478 // rolling-CRC32C verifier so a tampered ciphertext (or a
3479 // backend-side corruption that the zstd decoder happens
3480 // to "successfully" decode into wrong bytes) surfaces as
3481 // a streaming error tail at EOF instead of silently
3482 // delivering corrupt plaintext to the client. The wrap
3483 // is a pure pass-through during the body — no extra
3484 // buffering, TTFB unaffected — and the integrity
3485 // decision lands at the last chunk.
3486 let decompressed_blob = cpu_zstd_decompress_stream(blob);
3487 let verified_reader = Crc32cVerifyingReader::new(
3488 blob_to_async_read(decompressed_blob),
3489 m.crc32c,
3490 m.original_size,
3491 );
3492 let verified_blob = async_read_to_blob(verified_reader);
3493 resp.output.content_length = Some(m.original_size as i64);
3494 resp.output.checksum_crc32 = None;
3495 resp.output.checksum_crc32c = None;
3496 resp.output.checksum_crc64nvme = None;
3497 resp.output.checksum_sha1 = None;
3498 resp.output.checksum_sha256 = None;
3499 resp.output.e_tag = None;
3500 resp.output.body = Some(verified_blob);
3501 let elapsed = get_start.elapsed();
3502 crate::metrics::record_get(
3503 m.codec.as_str(),
3504 m.compressed_size,
3505 m.original_size,
3506 elapsed.as_secs_f64(),
3507 true,
3508 );
3509 info!(
3510 op = "get_object",
3511 bucket = %get_bucket,
3512 key = %get_key,
3513 codec = m.codec.as_str(),
3514 bytes_in = m.compressed_size,
3515 bytes_out = m.original_size,
3516 path = "streaming",
3517 setup_latency_ms = elapsed.as_millis() as u64,
3518 "S4 get started (streaming)"
3519 );
3520 return Ok(resp);
3521 }
3522 // Passthrough: そのまま流す (Range なしの場合のみ streaming)
3523 if range_request.is_none()
3524 && !needs_frame_parse
3525 && let Some(ref m) = manifest_opt
3526 && m.codec == CodecKind::Passthrough
3527 {
3528 resp.output.content_length = Some(m.original_size as i64);
3529 resp.output.checksum_crc32 = None;
3530 resp.output.checksum_crc32c = None;
3531 resp.output.checksum_crc64nvme = None;
3532 resp.output.checksum_sha1 = None;
3533 resp.output.checksum_sha256 = None;
3534 resp.output.e_tag = None;
3535 resp.output.body = Some(blob);
3536 debug!("S4 get_object: passthrough streaming");
3537 return Ok(resp);
3538 }
3539
3540 // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
3541 let bytes = collect_blob(blob, self.max_body_bytes)
3542 .await
3543 .map_err(internal("collect get body"))?;
3544
3545 let decompressed = if needs_frame_parse {
3546 // multipart objects と framed-v2 single-PUT objects は同じ
3547 // S4F2 frame 列なので decompress_multipart で統一処理
3548 self.decompress_multipart(bytes).await?
3549 } else {
3550 let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
3551 self.registry
3552 .decompress(bytes, manifest)
3553 .await
3554 .map_err(internal("registry decompress"))?
3555 };
3556
3557 // Range request があれば slice。なければ full body を返す。
3558 let total_size = decompressed.len() as u64;
3559 let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
3560 let (start, end) = resolve_range(r, total_size)
3561 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
3562 let sliced = decompressed.slice(start as usize..end as usize);
3563 resp.output.content_range = Some(format!(
3564 "bytes {start}-{}/{total_size}",
3565 end.saturating_sub(1)
3566 ));
3567 (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
3568 } else {
3569 (decompressed, None)
3570 };
3571 // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
3572 // 圧縮 size のままだと downstream が body を途中で切ってしまう)
3573 resp.output.content_length = Some(final_bytes.len() as i64);
3574 // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
3575 // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
3576 // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
3577 // (manifest 内 / frame 内) で integrity を保証する設計にする。
3578 resp.output.checksum_crc32 = None;
3579 resp.output.checksum_crc32c = None;
3580 resp.output.checksum_crc64nvme = None;
3581 resp.output.checksum_sha1 = None;
3582 resp.output.checksum_sha256 = None;
3583 resp.output.e_tag = None;
3584 let returned_size = final_bytes.len() as u64;
3585 let codec_label = manifest_opt
3586 .as_ref()
3587 .map(|m| m.codec.as_str())
3588 .unwrap_or("multipart");
3589 resp.output.body = Some(bytes_to_blob(final_bytes));
3590 if let Some(status) = status_override {
3591 resp.status = Some(status);
3592 }
3593 let elapsed = get_start.elapsed();
3594 crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
3595 info!(
3596 op = "get_object",
3597 bucket = %get_bucket,
3598 key = %get_key,
3599 codec = codec_label,
3600 bytes_out = returned_size,
3601 total_object_size = total_size,
3602 range = range_request.is_some(),
3603 path = "buffered",
3604 latency_ms = elapsed.as_millis() as u64,
3605 "S4 get completed (buffered)"
3606 );
3607 }
3608 // v0.6 #40: echo the recorded `x-amz-replication-status` so
3609 // consumers can poll progress (PENDING / COMPLETED / FAILED).
3610 if let Some(mgr) = self.replication.as_ref()
3611 && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
3612 {
3613 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3614 status.as_aws_str().to_owned(),
3615 ));
3616 }
3617 Ok(resp)
3618 }
3619
3620 // === passthrough delegations ===
3621 async fn head_bucket(
3622 &self,
3623 req: S3Request<HeadBucketInput>,
3624 ) -> S3Result<S3Response<HeadBucketOutput>> {
3625 self.backend.head_bucket(req).await
3626 }
3627 async fn list_buckets(
3628 &self,
3629 req: S3Request<ListBucketsInput>,
3630 ) -> S3Result<S3Response<ListBucketsOutput>> {
3631 self.backend.list_buckets(req).await
3632 }
3633 async fn create_bucket(
3634 &self,
3635 req: S3Request<CreateBucketInput>,
3636 ) -> S3Result<S3Response<CreateBucketOutput>> {
3637 self.backend.create_bucket(req).await
3638 }
3639 async fn delete_bucket(
3640 &self,
3641 req: S3Request<DeleteBucketInput>,
3642 ) -> S3Result<S3Response<DeleteBucketOutput>> {
3643 self.backend.delete_bucket(req).await
3644 }
3645 async fn head_object(
3646 &self,
3647 req: S3Request<HeadObjectInput>,
3648 ) -> S3Result<S3Response<HeadObjectOutput>> {
3649 // v0.6 #40: capture bucket/key before req is consumed so the
3650 // replication-status echo can look the entry up.
3651 let head_bucket = req.input.bucket.clone();
3652 let head_key = req.input.key.clone();
3653 let mut resp = self.backend.head_object(req).await?;
3654 if let Some(manifest) = extract_manifest(&resp.output.metadata) {
3655 // 客側には decompress 後の意味のある content_length / checksum を返す。
3656 // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
3657 // (S4 は manifest 内の crc32c で integrity を担保する)。
3658 resp.output.content_length = Some(manifest.original_size as i64);
3659 resp.output.checksum_crc32 = None;
3660 resp.output.checksum_crc32c = None;
3661 resp.output.checksum_crc64nvme = None;
3662 resp.output.checksum_sha1 = None;
3663 resp.output.checksum_sha256 = None;
3664 resp.output.e_tag = None;
3665 }
3666 // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
3667 // / FAILED) so consumers can poll progress without a GET.
3668 if let Some(mgr) = self.replication.as_ref()
3669 && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
3670 {
3671 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3672 status.as_aws_str().to_owned(),
3673 ));
3674 }
3675 // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
3676 // and pipelines see the same posture they got on PUT. The PUT
3677 // path stamps `s4-sse-type` metadata for exactly this — HEAD
3678 // doesn't fetch the body, so it can't peek frame magic.
3679 if let Some(meta) = resp.output.metadata.as_ref()
3680 && let Some(sse_type) = meta.get("s4-sse-type")
3681 {
3682 {
3683 match sse_type.as_str() {
3684 "aws:kms" => {
3685 resp.output.server_side_encryption = Some(
3686 ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
3687 );
3688 if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
3689 resp.output.ssekms_key_id = Some(key_id.clone());
3690 }
3691 }
3692 _ => {
3693 resp.output.server_side_encryption = Some(
3694 ServerSideEncryption::from_static(ServerSideEncryption::AES256),
3695 );
3696 if let Some(md5) = meta.get("s4-sse-c-key-md5") {
3697 resp.output.sse_customer_algorithm =
3698 Some(crate::sse::SSE_C_ALGORITHM.into());
3699 resp.output.sse_customer_key_md5 = Some(md5.clone());
3700 }
3701 }
3702 }
3703 }
3704 }
3705 Ok(resp)
3706 }
3707 async fn delete_object(
3708 &self,
3709 mut req: S3Request<DeleteObjectInput>,
3710 ) -> S3Result<S3Response<DeleteObjectOutput>> {
3711 let bucket = req.input.bucket.clone();
3712 let key = req.input.key.clone();
3713 self.enforce_rate_limit(&req, &bucket)?;
3714 self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
3715 // v0.6 #42: MFA Delete enforcement. When the bucket has
3716 // MFA-Delete = Enabled, every DELETE / DELETE-version /
3717 // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
3718 // 6-digit TOTP). Runs *before* the WORM / versioning routers so
3719 // a missing token is denied for free regardless of which delete
3720 // path the request would otherwise take.
3721 if let Some(mgr) = self.mfa_delete.as_ref()
3722 && mgr.is_enabled(&bucket)
3723 {
3724 let header = req.input.mfa.as_deref();
3725 if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
3726 crate::metrics::record_mfa_delete_denial(&bucket);
3727 return Err(mfa_error_to_s3(e));
3728 }
3729 }
3730 // v0.5 #30: refuse the delete while a WORM lock is in effect.
3731 // Compliance can never be bypassed; Governance can be overridden
3732 // via `x-amz-bypass-governance-retention: true`; legal hold
3733 // never. The check happens before the versioning router so a
3734 // locked object can't be soft-deleted (delete-marker push) on an
3735 // Enabled bucket either — S3 spec says lock applies to all
3736 // delete forms.
3737 if let Some(mgr) = self.object_lock.as_ref()
3738 && let Some(state) = mgr.get(&bucket, &key)
3739 {
3740 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
3741 // v0.8.12 HIGH-7 fix: the bypass header alone used to be
3742 // enough to override Governance retention. AWS spec
3743 // requires the caller hold `s3:BypassGovernanceRetention`
3744 // for the target ARN; without that, the header is
3745 // silently ignored (not an error — it lines up with how
3746 // AWS' canonical behaviour treats unprivileged callers).
3747 let bypass_allowed = if bypass_header {
3748 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
3749 .is_ok()
3750 } else {
3751 false
3752 };
3753 let now = chrono::Utc::now();
3754 if !state.can_delete(now, bypass_allowed) {
3755 crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
3756 return Err(S3Error::with_message(
3757 S3ErrorCode::AccessDenied,
3758 "Access Denied because object protected by object lock",
3759 ));
3760 }
3761 }
3762 // v0.5 #34: route DELETE through the VersioningManager when the
3763 // bucket is in a versioning-aware state.
3764 //
3765 // - Enabled bucket, no version_id → push a delete marker into
3766 // the chain. NO backend object is touched (older versions
3767 // stay reachable via specific-version GET).
3768 // - Enabled / Suspended bucket, with version_id → physical
3769 // delete. Backend bytes at the shadow key (or `<key>` for
3770 // `null`) are removed; chain entry is dropped. If the deleted
3771 // entry was a delete marker, no backend bytes exist for it
3772 // (record-only).
3773 // - Suspended bucket, no version_id → push a "null" delete
3774 // marker (S3 spec); backend bytes at `<key>` are physically
3775 // removed (same as legacy).
3776 // - Unversioned bucket → fall through to legacy passthrough.
3777 if let Some(mgr) = self.versioning.as_ref() {
3778 let state = mgr.state(&bucket);
3779 if state != crate::versioning::VersioningState::Unversioned {
3780 let req_vid = req.input.version_id.take();
3781 if let Some(vid) = req_vid {
3782 // Specific-version DELETE: touch backend bytes only
3783 // when the entry was a real version (not a delete
3784 // marker, which has no backend bytes).
3785 let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
3786 let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
3787 key.clone()
3788 } else {
3789 versioned_shadow_key(&key, &vid)
3790 };
3791 let was_real_version = outcome
3792 .as_ref()
3793 .map(|o| !o.is_delete_marker)
3794 .unwrap_or(false);
3795 if was_real_version {
3796 // Best-effort backend cleanup; missing bytes
3797 // are not an error (e.g. shadow key already
3798 // GC'd).
3799 let backend_input = DeleteObjectInput {
3800 bucket: bucket.clone(),
3801 key: backend_target,
3802 ..Default::default()
3803 };
3804 let backend_req = S3Request {
3805 input: backend_input,
3806 method: http::Method::DELETE,
3807 uri: req.uri.clone(),
3808 headers: req.headers.clone(),
3809 extensions: http::Extensions::new(),
3810 credentials: req.credentials.clone(),
3811 region: req.region.clone(),
3812 service: req.service.clone(),
3813 trailing_headers: None,
3814 };
3815 let _ = self.backend.delete_object(backend_req).await;
3816 }
3817 let mut output = DeleteObjectOutput {
3818 version_id: Some(vid.clone()),
3819 ..Default::default()
3820 };
3821 if let Some(o) = outcome.as_ref()
3822 && o.is_delete_marker
3823 {
3824 output.delete_marker = Some(true);
3825 }
3826 // v0.6 #35: specific-version DELETE always counts as
3827 // a hard `ObjectRemoved:Delete` event (the chain
3828 // entry, marker or not, is gone after this call).
3829 self.fire_delete_notification(
3830 &bucket,
3831 &key,
3832 crate::notifications::EventType::ObjectRemovedDelete,
3833 Some(vid.clone()),
3834 );
3835 return Ok(S3Response::new(output));
3836 }
3837 // No version_id: record a delete marker (state-aware).
3838 let outcome = mgr.record_delete(&bucket, &key);
3839 if state == crate::versioning::VersioningState::Suspended {
3840 // Suspended buckets also evict the prior `<key>`
3841 // bytes (the previous null version is gone too).
3842 let backend_input = DeleteObjectInput {
3843 bucket: bucket.clone(),
3844 key: key.clone(),
3845 ..Default::default()
3846 };
3847 let backend_req = S3Request {
3848 input: backend_input,
3849 method: http::Method::DELETE,
3850 uri: req.uri.clone(),
3851 headers: req.headers.clone(),
3852 extensions: http::Extensions::new(),
3853 credentials: req.credentials.clone(),
3854 region: req.region.clone(),
3855 service: req.service.clone(),
3856 trailing_headers: None,
3857 };
3858 let _ = self.backend.delete_object(backend_req).await;
3859 }
3860 let output = DeleteObjectOutput {
3861 delete_marker: Some(true),
3862 version_id: outcome.version_id.clone(),
3863 ..Default::default()
3864 };
3865 // v0.6 #35: versioned bucket DELETE without a version-id
3866 // creates a delete marker — the dedicated AWS event
3867 // taxonomy entry. Suspended-state buckets also push a
3868 // (null) marker, so the same event fires there.
3869 self.fire_delete_notification(
3870 &bucket,
3871 &key,
3872 crate::notifications::EventType::ObjectRemovedDeleteMarker,
3873 outcome.version_id,
3874 );
3875 return Ok(S3Response::new(output));
3876 }
3877 }
3878 // Legacy / Unversioned path: physical delete on the backend +
3879 // best-effort sidecar cleanup (mirrors v0.4 behaviour).
3880 let resp = self.backend.delete_object(req).await?;
3881 // v0.5 #30: drop any per-object lock state once the delete has
3882 // succeeded so the freed key can be re-armed by a future PUT
3883 // under the bucket default. Reaching here implies the lock had
3884 // already passed `can_delete` above, so this is purely cleanup.
3885 if let Some(mgr) = self.object_lock.as_ref() {
3886 mgr.clear(&bucket, &key);
3887 }
3888 // v0.6 #39: drop any object-level tag set on physical delete —
3889 // the freed key starts a fresh tag history if a future PUT
3890 // re-creates it. (Versioned-delete branches above return early
3891 // and do NOT touch tags, mirroring AWS where tag state is
3892 // attached to the logical key, not the version chain.)
3893 if let Some(mgr) = self.tagging.as_ref() {
3894 mgr.delete_object_tags(&bucket, &key);
3895 }
3896 let sidecar = sidecar_key(&key);
3897 // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
3898 // can't be encoded into a request URI — the primary delete
3899 // already succeeded and a stale sidecar is harmless (Range GET
3900 // re-validates the underlying object on next read).
3901 if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
3902 let sidecar_input = DeleteObjectInput {
3903 bucket: bucket.clone(),
3904 key: sidecar,
3905 ..Default::default()
3906 };
3907 let sidecar_req = S3Request {
3908 input: sidecar_input,
3909 method: http::Method::DELETE,
3910 uri,
3911 headers: http::HeaderMap::new(),
3912 extensions: http::Extensions::new(),
3913 credentials: None,
3914 region: None,
3915 service: None,
3916 trailing_headers: None,
3917 };
3918 let _ = self.backend.delete_object(sidecar_req).await;
3919 }
3920 // v0.6 #35: legacy unversioned-bucket hard delete fires the
3921 // canonical `ObjectRemoved:Delete` event.
3922 self.fire_delete_notification(
3923 &bucket,
3924 &key,
3925 crate::notifications::EventType::ObjectRemovedDelete,
3926 None,
3927 );
3928 Ok(resp)
3929 }
3930 async fn delete_objects(
3931 &self,
3932 req: S3Request<DeleteObjectsInput>,
3933 ) -> S3Result<S3Response<DeleteObjectsOutput>> {
3934 // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
3935 // when MFA-Delete is on the bucket, a missing / invalid token
3936 // fails the entire DeleteObjects request, not per-object).
3937 if let Some(mgr) = self.mfa_delete.as_ref()
3938 && mgr.is_enabled(&req.input.bucket)
3939 {
3940 let header = req.input.mfa.as_deref();
3941 if let Err(e) =
3942 crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
3943 {
3944 crate::metrics::record_mfa_delete_denial(&req.input.bucket);
3945 return Err(mfa_error_to_s3(e));
3946 }
3947 }
3948 // v0.8.11 CRIT-3 fix: route every entry through the gated
3949 // per-object `delete_object` path so Object Lock, IAM policy,
3950 // versioning, tagging, sidecar cleanup and notification fan-
3951 // out all fire for batch DELETE. The previous
3952 // `self.backend.delete_objects(req).await` straight-through
3953 // bypassed every gate, so a `legal_hold=on` key listed inside
3954 // a DeleteObjects XML was happily removed.
3955 //
3956 // S3 spec note: DeleteObjects is "best-effort per object" —
3957 // a failure on one key surfaces as an `Errors` entry without
3958 // aborting the rest of the batch. Quiet-mode suppresses the
3959 // `Deleted` list (errors are still reported). We honour both.
3960 let bucket = req.input.bucket.clone();
3961 let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
3962 let mfa_header = req.input.mfa.clone();
3963 let quiet = req.input.delete.quiet.unwrap_or(false);
3964 let mut deleted: Vec<DeletedObject> = Vec::new();
3965 let mut errors: Vec<s3s::dto::Error> = Vec::new();
3966 for ident in req.input.delete.objects.iter() {
3967 let key = ident.key.clone();
3968 let version_id = ident.version_id.clone();
3969 let per_input = DeleteObjectInput {
3970 bucket: bucket.clone(),
3971 key: key.clone(),
3972 version_id: version_id.clone(),
3973 bypass_governance_retention: Some(bypass_governance),
3974 mfa: mfa_header.clone(),
3975 ..Default::default()
3976 };
3977 let per_uri = match safe_object_uri(&bucket, &key) {
3978 Ok(u) => u,
3979 Err(_) => {
3980 errors.push(s3s::dto::Error {
3981 code: Some("InvalidArgument".to_owned()),
3982 key: Some(key),
3983 message: Some("object key is not URI-encodable".to_owned()),
3984 version_id,
3985 });
3986 continue;
3987 }
3988 };
3989 let per_req = S3Request {
3990 input: per_input,
3991 method: http::Method::DELETE,
3992 uri: per_uri,
3993 headers: req.headers.clone(),
3994 extensions: http::Extensions::new(),
3995 credentials: req.credentials.clone(),
3996 region: req.region.clone(),
3997 service: req.service.clone(),
3998 trailing_headers: None,
3999 };
4000 match self.delete_object(per_req).await {
4001 Ok(resp) => {
4002 let out = resp.output;
4003 // DeleteObjectOutput doesn't surface a separate
4004 // `delete_marker_version_id`; the marker's version
4005 // id is whatever `version_id` carries (when the
4006 // versioning manager pushed a delete-marker, that
4007 // field already holds the marker's vid).
4008 let vid = out.version_id.clone().or(version_id);
4009 deleted.push(DeletedObject {
4010 key: Some(key),
4011 version_id: vid.clone(),
4012 delete_marker: out.delete_marker,
4013 delete_marker_version_id: vid,
4014 });
4015 }
4016 Err(e) => {
4017 let code_str = e.code().as_str().to_owned();
4018 let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
4019 errors.push(s3s::dto::Error {
4020 code: Some(code_str),
4021 key: Some(key),
4022 message: Some(msg),
4023 version_id,
4024 });
4025 }
4026 }
4027 }
4028 let output = DeleteObjectsOutput {
4029 deleted: if quiet || deleted.is_empty() {
4030 None
4031 } else {
4032 Some(deleted)
4033 },
4034 errors: if errors.is_empty() {
4035 None
4036 } else {
4037 Some(errors)
4038 },
4039 ..Default::default()
4040 };
4041 Ok(S3Response::new(output))
4042 }
4043 async fn copy_object(
4044 &self,
4045 mut req: S3Request<CopyObjectInput>,
4046 ) -> S3Result<S3Response<CopyObjectOutput>> {
4047 // copy is conceptually "GetObject src + PutObject dst" — enforce both.
4048 let dst_bucket = req.input.bucket.clone();
4049 let dst_key = req.input.key.clone();
4050 // v0.8.15 M-1: same reserved-name guard as `put_object`. A
4051 // copy whose destination would land at `<x>.s4index` carries
4052 // the same listing / cleanup hazards.
4053 if s4_codec::index::is_reserved_sidecar_key(&dst_key) {
4054 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
4055 .unwrap_or(S3ErrorCode::InvalidArgument);
4056 return Err(S3Error::with_message(
4057 code,
4058 format!(
4059 "destination key {dst_key:?} is reserved (suffix `{}` is used for S4 \
4060 internal sidecars)",
4061 s4_codec::index::SIDECAR_SUFFIX,
4062 ),
4063 ));
4064 }
4065 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
4066 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4067 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
4068 }
4069 // S4-aware copy: source object に s4-* metadata がある場合、それを
4070 // destination に確実に preserve する。
4071 //
4072 // - MetadataDirective::COPY (default): backend が source metadata を
4073 // そのまま copy するので S4 metadata も自動で渡る。介入不要
4074 // - MetadataDirective::REPLACE: 客が指定した metadata で source を
4075 // 上書き → s4-* metadata が消えると destination は decompress 不能に
4076 // なる (silent corruption)。S4 が source metadata を HEAD で取得し、
4077 // s4-* fields を input.metadata に強制 merge する
4078 let needs_merge = req
4079 .input
4080 .metadata_directive
4081 .as_ref()
4082 .map(|d| d.as_str() == MetadataDirective::REPLACE)
4083 .unwrap_or(false);
4084 if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4085 let head_input = HeadObjectInput {
4086 bucket: bucket.to_string(),
4087 key: key.to_string(),
4088 ..Default::default()
4089 };
4090 let head_req = S3Request {
4091 input: head_input,
4092 method: req.method.clone(),
4093 uri: req.uri.clone(),
4094 headers: req.headers.clone(),
4095 extensions: http::Extensions::new(),
4096 credentials: req.credentials.clone(),
4097 region: req.region.clone(),
4098 service: req.service.clone(),
4099 trailing_headers: None,
4100 };
4101 if let Ok(head) = self.backend.head_object(head_req).await
4102 && let Some(src_meta) = head.output.metadata.as_ref()
4103 {
4104 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4105 // v0.8.15 M-2: drop ANY client-supplied `s4-*` key
4106 // first. The reserved `s4-*` namespace describes the
4107 // wire format the codec layer relies on
4108 // (`s4-original-size`, `s4-crc32c`, `s4-codec`,
4109 // `s4-multipart`, `s4-framed`, plus the SSE flags
4110 // `s4-encrypted` / `s4-sse-type` / `s4-sse-c-key-md5`
4111 // / `s4-sse-kms-key-id`). The pre-M-2 code used
4112 // `or_insert_with` which *preferred* the client's
4113 // value — a malicious client could
4114 // `aws s3 cp s3://src s3://dst
4115 // --metadata-directive REPLACE
4116 // --metadata 's4-original-size=5368709120'`
4117 // and persuade S4 to misread the body on the next
4118 // GET (silent data corruption or DoS through
4119 // mis-sized buffer alloc). Strip the namespace and
4120 // force the source values back in.
4121 dest_meta.retain(|k, _| !k.to_ascii_lowercase().starts_with("s4-"));
4122 for key in [
4123 META_CODEC,
4124 META_ORIGINAL_SIZE,
4125 META_COMPRESSED_SIZE,
4126 META_CRC32C,
4127 META_MULTIPART,
4128 META_FRAMED,
4129 ] {
4130 if let Some(v) = src_meta.get(key) {
4131 dest_meta.insert(key.to_string(), v.clone());
4132 }
4133 }
4134 // SSE markers are equally reserved — propagate any
4135 // source flags so a copy of an encrypted object stays
4136 // marked as encrypted at the destination.
4137 for sse_key in [
4138 "s4-encrypted",
4139 "s4-sse-type",
4140 "s4-sse-c-key-md5",
4141 "s4-sse-kms-key-id",
4142 ] {
4143 if let Some(v) = src_meta.get(sse_key) {
4144 dest_meta.insert(sse_key.to_string(), v.clone());
4145 }
4146 }
4147 debug!(
4148 src_bucket = %bucket,
4149 src_key = %key,
4150 "S4 copy_object: replaced client s4-* metadata with source values across REPLACE directive (v0.8.15 M-2)"
4151 );
4152 }
4153 }
4154 self.backend.copy_object(req).await
4155 }
4156 async fn list_objects(
4157 &self,
4158 req: S3Request<ListObjectsInput>,
4159 ) -> S3Result<S3Response<ListObjectsOutput>> {
4160 self.enforce_rate_limit(&req, &req.input.bucket)?;
4161 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4162 let mut resp = self.backend.list_objects(req).await?;
4163 // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4164 // — v0.5 #34) を顧客から隠す。
4165 if let Some(contents) = resp.output.contents.as_mut() {
4166 contents.retain(|o| {
4167 o.key
4168 .as_ref()
4169 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4170 .unwrap_or(true)
4171 });
4172 }
4173 Ok(resp)
4174 }
4175 async fn list_objects_v2(
4176 &self,
4177 req: S3Request<ListObjectsV2Input>,
4178 ) -> S3Result<S3Response<ListObjectsV2Output>> {
4179 self.enforce_rate_limit(&req, &req.input.bucket)?;
4180 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4181 let mut resp = self.backend.list_objects_v2(req).await?;
4182 if let Some(contents) = resp.output.contents.as_mut() {
4183 let before = contents.len();
4184 contents.retain(|o| {
4185 o.key
4186 .as_ref()
4187 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4188 .unwrap_or(true)
4189 });
4190 // key_count も補正 (S3 spec compliance)
4191 if let Some(kc) = resp.output.key_count.as_mut() {
4192 *kc -= (before - contents.len()) as i32;
4193 }
4194 }
4195 Ok(resp)
4196 }
4197 /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4198 /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4199 /// attached AND the bucket is in a versioning-aware state, build
4200 /// the `Versions` / `DeleteMarkers` arrays directly from the
4201 /// in-memory chain (paginated + ordered the S3 way: key asc,
4202 /// version newest-first inside each key). Otherwise fall back to
4203 /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4204 async fn list_object_versions(
4205 &self,
4206 req: S3Request<ListObjectVersionsInput>,
4207 ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4208 self.enforce_rate_limit(&req, &req.input.bucket)?;
4209 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4210 // v0.5 #34: VersioningManager-owned path.
4211 if let Some(mgr) = self.versioning.as_ref()
4212 && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4213 {
4214 let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4215 let page = mgr.list_versions(
4216 &req.input.bucket,
4217 req.input.prefix.as_deref(),
4218 req.input.key_marker.as_deref(),
4219 req.input.version_id_marker.as_deref(),
4220 max_keys,
4221 );
4222 let versions: Vec<ObjectVersion> = page
4223 .versions
4224 .into_iter()
4225 .map(|e| ObjectVersion {
4226 key: Some(e.key),
4227 version_id: Some(e.version_id),
4228 is_latest: Some(e.is_latest),
4229 e_tag: Some(ETag::Strong(e.etag)),
4230 size: Some(e.size as i64),
4231 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4232 ..Default::default()
4233 })
4234 .collect();
4235 let delete_markers: Vec<DeleteMarkerEntry> = page
4236 .delete_markers
4237 .into_iter()
4238 .map(|e| DeleteMarkerEntry {
4239 key: Some(e.key),
4240 version_id: Some(e.version_id),
4241 is_latest: Some(e.is_latest),
4242 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4243 ..Default::default()
4244 })
4245 .collect();
4246 let output = ListObjectVersionsOutput {
4247 name: Some(req.input.bucket.clone()),
4248 prefix: req.input.prefix.clone(),
4249 key_marker: req.input.key_marker.clone(),
4250 version_id_marker: req.input.version_id_marker.clone(),
4251 max_keys: req.input.max_keys,
4252 versions: if versions.is_empty() {
4253 None
4254 } else {
4255 Some(versions)
4256 },
4257 delete_markers: if delete_markers.is_empty() {
4258 None
4259 } else {
4260 Some(delete_markers)
4261 },
4262 is_truncated: Some(page.is_truncated),
4263 next_key_marker: page.next_key_marker,
4264 next_version_id_marker: page.next_version_id_marker,
4265 ..Default::default()
4266 };
4267 return Ok(S3Response::new(output));
4268 }
4269 // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4270 let mut resp = self.backend.list_object_versions(req).await?;
4271 if let Some(versions) = resp.output.versions.as_mut() {
4272 versions.retain(|v| {
4273 v.key
4274 .as_ref()
4275 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4276 .unwrap_or(true)
4277 });
4278 }
4279 if let Some(markers) = resp.output.delete_markers.as_mut() {
4280 markers.retain(|m| {
4281 m.key
4282 .as_ref()
4283 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4284 .unwrap_or(true)
4285 });
4286 }
4287 Ok(resp)
4288 }
4289
/// `CreateMultipartUpload`: gate the request, mark the object as an S4
/// multipart object, and capture the per-upload recipe for later use.
///
/// Steps, in order:
/// 1. Reject reserved sidecar keys, then enforce `s3:PutObject` and the
///    rate limit.
/// 2. Stamp `META_MULTIPART` / `META_CODEC` into the object metadata.
/// 3. Take the SSE request fields off the input (they are not forwarded to
///    the backend) and derive the upload's SSE mode from them.
/// 4. Parse and take the `Tagging` field; read the Object Lock fields.
/// 5. Call the backend; if it returned an `upload_id`, store the captured
///    context in `self.multipart_state` under that id.
/// 6. Echo the SSE-C / SSE-KMS response fields.
///
/// # Errors
///
/// Returns an error for a reserved key, a policy or rate-limit denial,
/// whatever `extract_sse_c_material` rejects, SSE-C combined with SSE-KMS
/// (`InvalidArgument`), SSE-KMS without a configured KMS
/// (`InvalidRequest`), an unparsable `Tagging` value (`InvalidArgument`),
/// and any backend error.
async fn create_multipart_upload(
    &self,
    mut req: S3Request<CreateMultipartUploadInput>,
) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
    // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
    // the destination is conceptually about to host a new object,
    // matching what `put_object` enforces. Without this, a bucket
    // policy denying `s3:PutObject` was bypassable simply by switching
    // the client to the multipart wire path.
    let mp_bucket = req.input.bucket.clone();
    let mp_key = req.input.key.clone();
    // v0.8.15 M-1: reserved-name guard on the multipart entry too.
    if s4_codec::index::is_reserved_sidecar_key(&mp_key) {
        let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
            .unwrap_or(S3ErrorCode::InvalidArgument);
        return Err(S3Error::with_message(
            code,
            format!(
                "object key {mp_key:?} is reserved (suffix `{}` is used for S4 internal \
                 sidecars)",
                s4_codec::index::SIDECAR_SUFFIX,
            ),
        ));
    }
    self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
    self.enforce_rate_limit(&req, &mp_bucket)?;
    // Multipart objects are written as per-part compressed frames. Set a
    // flag in the object metadata so that GET knows to run the frame
    // parser. The codec recorded here is the registry's default kind.
    // NOTE(review): the original comment said per-part codec selection is
    // Phase 2, yet `upload_part` already picks a codec per part via the
    // dispatcher — presumably this value is only an object-level default;
    // confirm.
    let codec_kind = self.registry.default_kind();
    let meta = req.input.metadata.get_or_insert_with(Default::default);
    meta.insert(META_MULTIPART.into(), "true".into());
    meta.insert(META_CODEC.into(), codec_kind.as_str().into());
    // v0.8 #54 BUG-10 fix: take() the SSE request fields off
    // `req.input` so they are NOT forwarded to the backend on
    // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
    // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
    // SSE-KMS with "KMS not configured" when the headers reach it.
    // S4 owns the encrypt-then-store contract; we capture the
    // recipe in `multipart_state` here and apply it on Complete.
    let sse_c_alg = req.input.sse_customer_algorithm.take();
    let sse_c_key = req.input.sse_customer_key.take();
    let sse_c_md5 = req.input.sse_customer_key_md5.take();
    let sse_header = req.input.server_side_encryption.take();
    let sse_kms_key = req.input.ssekms_key_id.take();
    // Strip the encryption-context too — leaving it would make
    // MinIO try to validate it against a non-existent KMS key.
    let _ = req.input.ssekms_encryption_context.take();
    let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
    let kms_key_id = extract_kms_key_id(
        &sse_header,
        &sse_kms_key,
        self.kms_default_key_id.as_deref(),
    );
    // SSE-C / SSE-KMS exclusivity (mirrors the check in put_object).
    if sse_c_material.is_some() && kms_key_id.is_some() {
        return Err(S3Error::with_message(
            S3ErrorCode::InvalidArgument,
            "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
        ));
    }
    // Mode precedence: SSE-C, then SSE-KMS, then SSE-S4 (keyring present),
    // otherwise no encryption.
    let sse_mode = if let Some(ref m) = sse_c_material {
        // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
        // 32-byte key in `Zeroizing` so abandoned uploads (or
        // normal Complete/Abort) wipe the key bytes on drop. The
        // `key_md5` is the public fingerprint and stays as a
        // bare `[u8; 16]`.
        crate::multipart_state::MultipartSseMode::SseC {
            key: zeroize::Zeroizing::new(m.key),
            key_md5: m.key_md5,
        }
    } else if let Some(ref kid) = kms_key_id {
        // KMS pre-flight: fail at Create rather than at Complete if
        // the gateway has no KMS backend wired (mirrors the
        // corresponding put_object check).
        if self.kms.is_none() {
            return Err(S3Error::with_message(
                S3ErrorCode::InvalidRequest,
                "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
            ));
        }
        crate::multipart_state::MultipartSseMode::SseKms {
            key_id: kid.clone(),
        }
    } else if self.sse_keyring.is_some() {
        // SSE-S4: server-driven transparent encryption. Activates
        // whenever the gateway has a keyring configured AND the
        // client didn't pick a different SSE mode.
        crate::multipart_state::MultipartSseMode::SseS4
    } else {
        crate::multipart_state::MultipartSseMode::None
    };
    // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
    // single-PUT path does this on PutObject; the multipart path
    // captures it now and commits via TagManager on Complete.
    let request_tags: Option<crate::tagging::TagSet> = req
        .input
        .tagging
        .as_deref()
        .map(crate::tagging::parse_tagging_header)
        .transpose()
        .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
    // Strip the `Tagging` field off the input so the backend
    // doesn't try to apply it (no-op on MinIO but keeps the wire
    // clean).
    let _ = req.input.tagging.take();
    // Object Lock recipe (BUG-7 — captured here, applied on Complete).
    // These fields are only read, not taken off the input.
    let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
        .input
        .object_lock_mode
        .as_ref()
        .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
    let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
        .input
        .object_lock_retain_until_date
        .as_ref()
        .and_then(timestamp_to_chrono_utc);
    // Legal hold counts as requested only when the status is "ON"
    // (compared case-insensitively).
    let explicit_legal_hold_on: bool = req
        .input
        .object_lock_legal_hold_status
        .as_ref()
        .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
        .unwrap_or(false);
    // Cloned again because `req` is moved into the backend call below.
    let bucket = req.input.bucket.clone();
    let key = req.input.key.clone();
    debug!(
        bucket = %bucket,
        key = %key,
        codec = codec_kind.as_str(),
        sse = ?sse_mode,
        "S4 create_multipart_upload: marking object for per-part compression"
    );
    let mut resp = self.backend.create_multipart_upload(req).await?;
    // Stash the per-upload context only after the backend handed
    // us an upload_id (failed Creates leave nothing in the store).
    if let Some(upload_id) = resp.output.upload_id.as_ref() {
        self.multipart_state.put(
            upload_id,
            crate::multipart_state::MultipartUploadContext {
                bucket,
                key,
                sse: sse_mode.clone(),
                tags: request_tags,
                object_lock_mode: explicit_lock_mode,
                object_lock_retain_until: explicit_retain_until,
                object_lock_legal_hold: explicit_legal_hold_on,
            },
        );
    }
    // SSE-C / SSE-KMS response echo (mirrors put_object).
    match &sse_mode {
        crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
            resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
            resp.output.sse_customer_key_md5 =
                Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
        }
        crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
            resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
                ServerSideEncryption::AWS_KMS,
            ));
            resp.output.ssekms_key_id = Some(key_id.clone());
        }
        _ => {}
    }
    Ok(resp)
}
4456
/// `UploadPart`: gate the request, validate SSE-C headers against the
/// upload's captured context, then compress the part body into a single
/// frame and forward it to the backend.
///
/// Steps, in order:
/// 1. Enforce `s3:PutObject` and the rate limit.
/// 2. Look up the upload context in `self.multipart_state` and validate or
///    strip the SSE-C request headers accordingly.
/// 3. If a body is present: collect it (bounded by `self.max_body_bytes`),
///    verify the client-supplied checksums, pick a codec from a leading
///    sample, compress, wrap in a frame, optionally pad, and replace the
///    request body, content length and checksum fields.
/// 4. Forward to the backend.
///
/// # Errors
///
/// Returns an error for a policy or rate-limit denial; `InvalidArgument`
/// when the part's SSE-C key differs from the one captured at Create;
/// `InvalidRequest` when SSE-C headers are missing or partial on an SSE-C
/// upload, or present on a non-SSE-C upload; whatever
/// `verify_client_body_checksums` rejects; errors mapped through
/// `internal(..)` for body collection or compression failures; and any
/// backend error.
async fn upload_part(
    &self,
    mut req: S3Request<UploadPartInput>,
) -> S3Result<S3Response<UploadPartOutput>> {
    // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
    // `put_object` / `create_multipart_upload`. Even though
    // Create already passed the gate, a bucket policy that
    // *revokes* `s3:PutObject` mid-flight should stop further
    // parts.
    let part_bucket = req.input.bucket.clone();
    let part_key = req.input.key.clone();
    self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
    self.enforce_rate_limit(&req, &part_bucket)?;
    // Each part is compressed and forwarded with a frame header in front.
    // On GET, `decompress_multipart` walks the frames in order and
    // decompresses them.
    // **per-part codec dispatch**: the dispatcher picks the codec from a
    // sample of the leading body bytes, so a parquet-style mixed-content
    // multipart upload can use the best codec for each part (an
    // integer-column part → Bitcomp, a text-column part → zstd, etc.).
    //
    // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
    // context captured by `create_multipart_upload` and (a) strip
    // any SSE-C request headers off `req.input` so the backend
    // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
    // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
    // observe that an upload context exists for `upload_id`. The
    // actual encrypt happens once at `complete_multipart_upload`
    // time on the assembled body (the per-part-encrypt approach
    // would require a matching multi-segment decrypt path on GET;
    // encrypting the whole assembled body keeps the GET path's
    // `is_sse_encrypted` branch in get_object working
    // unchanged).
    let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
    // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
    // The AWS S3 spec requires the same SSE-C key headers on
    // every UploadPart and rejects mismatches with 400. Prior to
    // #62 we silently stripped the headers (BUG-10 fix) without
    // validating them, allowing a client to send part 1 under
    // key-A and part 2 under key-B; both got stored, then
    // re-encrypted with key-A on Complete — the client thinks
    // part 2 is under key-B but a GET with key-B would in fact
    // hit the part-1 ciphertext that was actually encrypted with
    // key-A. That would either decrypt successfully (silent
    // corruption: client lost track of which key encrypts what)
    // or fail in a confusing way. Validate the per-part headers
    // now: a key mismatch is rejected with 400 InvalidArgument,
    // omitted or partially supplied headers with 400 InvalidRequest.
    if let Some(ref ctx) = sse_ctx {
        if let crate::multipart_state::MultipartSseMode::SseC {
            key_md5: ctx_md5, ..
        } = &ctx.sse
        {
            // Taking the fields removes them from `req.input` on every
            // path below, including the successful one.
            let alg = req.input.sse_customer_algorithm.take();
            let key_b64 = req.input.sse_customer_key.take();
            let md5_b64 = req.input.sse_customer_key_md5.take();
            match (alg, key_b64, md5_b64) {
                (Some(a), Some(k), Some(m)) => {
                    // Parse + validate; if the per-part headers
                    // are themselves malformed (algorithm not
                    // AES256, MD5 mismatch, key not 32 bytes)
                    // surface the same 400 the single-PUT path
                    // would. Then compare the parsed MD5 to the
                    // upload-context's MD5; mismatch is a
                    // different-key UploadPart and must reject.
                    let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
                        .map_err(sse_c_error_to_s3)?;
                    if part_material.key_md5 != *ctx_md5 {
                        return Err(S3Error::with_message(
                            S3ErrorCode::InvalidArgument,
                            "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
                        ));
                    }
                    // OK — same key as Create. Headers are
                    // already taken off `req.input` so the
                    // backend never sees them.
                }
                (None, None, None) => {
                    // AWS S3 spec: SSE-C headers MUST be replayed
                    // on every UploadPart of an SSE-C multipart.
                    // Real-S3 returns 400 InvalidRequest in this
                    // case; mirror that.
                    return Err(S3Error::with_message(
                        S3ErrorCode::InvalidRequest,
                        "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
                    ));
                }
                _ => {
                    // Partial header set (e.g. algorithm + key
                    // but no MD5) — same handling as the
                    // single-PUT `extract_sse_c_material` helper.
                    return Err(S3Error::with_message(
                        S3ErrorCode::InvalidRequest,
                        "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
                    ));
                }
            }
        } else {
            // CreateMultipartUpload was non-SSE-C (None / SseS4 /
            // SseKms). A part that arrives carrying SSE-C headers
            // is either a confused client or an attempt to
            // smuggle SSE-C around the gateway-internal SSE
            // recipe. Reject with 400 InvalidRequest rather than
            // silently strip — the strip would let the client
            // believe the part was encrypted under their key
            // when in fact the upload's encryption recipe is
            // whatever the Create captured.
            if req.input.sse_customer_algorithm.is_some()
                || req.input.sse_customer_key.is_some()
                || req.input.sse_customer_key_md5.is_some()
            {
                return Err(S3Error::with_message(
                    S3ErrorCode::InvalidRequest,
                    "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
                ));
            }
        }
    } else {
        // No upload context registered (gateway crashed between
        // Create and Part, or pre-#62 abandoned-upload restore).
        // We can't check key consistency in this case — strip
        // the headers and let the request through unchanged so
        // the backend's `NoSuchUpload` reply (or whatever it
        // chooses to do) flows back to the client.
        let _ = req.input.sse_customer_algorithm.take();
        let _ = req.input.sse_customer_key.take();
        let _ = req.input.sse_customer_key_md5.take();
    }
    // Rebinding under a `_`-prefixed name: the context is not read again
    // below.
    let _sse_ctx = sse_ctx;
    // A request without a body skips the compression block and is
    // forwarded as-is.
    if let Some(blob) = req.input.body.take() {
        let bytes = collect_blob(blob, self.max_body_bytes)
            .await
            .map_err(internal("collect upload_part body"))?;
        // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
        // checksum algorithms against the received part body.
        verify_client_body_checksums(
            &bytes,
            req.input.content_md5.as_deref(),
            req.input.checksum_crc32.as_deref(),
            req.input.checksum_crc32c.as_deref(),
            req.input.checksum_sha1.as_deref(),
            req.input.checksum_sha256.as_deref(),
            req.input.checksum_crc64nvme.as_deref(),
        )?;
        // The codec is chosen from at most `SAMPLE_BYTES` leading bytes.
        let sample_len = bytes.len().min(SAMPLE_BYTES);
        // v0.8 #56: full part body is already in memory here; use its
        // length as the size hint so the dispatcher can promote to GPU
        // if it's big enough.
        let codec_kind = self
            .dispatcher
            .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
            .await;
        // Captured before `bytes` is moved into the compressor.
        let original_size = bytes.len() as u64;
        // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
        // The telemetry is stamped before the compress result is checked.
        let (compress_res, tel) = self
            .registry
            .compress_with_telemetry(bytes, codec_kind)
            .await;
        stamp_gpu_compress_telemetry(&tel);
        let (compressed, manifest) =
            compress_res.map_err(internal("registry compress part"))?;
        let header = FrameHeader {
            codec: codec_kind,
            original_size,
            compressed_size: compressed.len() as u64,
            crc32c: manifest.crc32c,
        };
        let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
        write_frame(&mut framed, header, &compressed);
        // v0.2 #5: heuristic-based padding skip for likely-final parts.
        //
        // AWS SDK / aws-cli / boto3 always send the final (and only the
        // final) part below the configured part_size. So if the raw user
        // part is already smaller than S3's 5 MiB multipart minimum, this
        // is overwhelmingly likely to be the final part — and the final
        // part is exempt from S3's size constraint. Skipping padding here
        // saves up to ~5 MiB per object on highly compressible workloads.
        //
        // If a misbehaving client sends a tiny **non-final** part, S3
        // itself rejects with EntityTooSmall at CompleteMultipartUpload —
        // identical outcome to a vanilla S3 PUT, just earlier than
        // padding-then-complete would catch it.
        let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
        if !likely_final {
            pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
        }
        let framed_bytes = framed.freeze();
        let new_len = framed_bytes.len() as i64;
        // The same wire-compatibility issue exists for multipart: the body
        // was rewritten, so the content length is updated and the
        // client-supplied checksums are cleared.
        req.input.content_length = Some(new_len);
        req.input.checksum_algorithm = None;
        req.input.checksum_crc32 = None;
        req.input.checksum_crc32c = None;
        req.input.checksum_crc64nvme = None;
        req.input.checksum_sha1 = None;
        req.input.checksum_sha256 = None;
        req.input.content_md5 = None;
        req.input.body = Some(bytes_to_blob(framed_bytes));
        debug!(
            part_number = ?req.input.part_number,
            upload_id = ?req.input.upload_id,
            original_size,
            framed_size = new_len,
            "S4 upload_part: framed compressed payload"
        );
    }
    self.backend.upload_part(req).await
}
4663 async fn complete_multipart_upload(
4664 &self,
4665 mut req: S3Request<CompleteMultipartUploadInput>,
4666 ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
4667 let bucket = req.input.bucket.clone();
4668 let key = req.input.key.clone();
4669 let upload_id = req.input.upload_id.clone();
4670 // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
4671 // commit point for the multipart-assembled object).
4672 self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
4673 self.enforce_rate_limit(&req, &bucket)?;
4674 // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
4675 // at Complete time. Without this an attacker with PutObject
4676 // permission could `CreateMultipartUpload` against a key
4677 // that's currently under retention / legal hold and silently
4678 // overwrite it on Complete (the single-PUT path runs the
4679 // same check at L2007). Compliance retention is never
4680 // bypassable; Governance only with explicit IAM permission
4681 // (HIGH-7 gate below).
4682 if let Some(mgr) = self.object_lock.as_ref()
4683 && let Some(state) = mgr.get(&bucket, &key)
4684 {
4685 // CompleteMultipartUpload doesn't carry the bypass header
4686 // (the s3s DTO matches AWS' wire schema). A locked key
4687 // therefore cannot be overwritten by Complete regardless
4688 // of caller permission — operators who need to break a
4689 // Governance lock do it via PutObjectRetention before
4690 // calling Complete.
4691 let now = chrono::Utc::now();
4692 if !state.can_delete(now, false) {
4693 crate::metrics::record_policy_denial("s3:PutObject", &bucket);
4694 return Err(S3Error::with_message(
4695 S3ErrorCode::AccessDenied,
4696 "Access Denied because target key is protected by object lock",
4697 ));
4698 }
4699 }
4700 // v0.8.1 #59: serialise concurrent Complete invocations on the
4701 // same `(bucket, key)`. The race window the lock closes is the
4702 // GET-assembled-body → encrypt → PUT-encrypted-body triple
4703 // below (BUG-5 fix); without serialisation, two Completes for
4704 // different `upload_id` but the same logical key could each
4705 // read the other's plaintext assembled body and overwrite the
4706 // peer's encrypted result. The guard is held to function exit
4707 // (drop on `Ok` / `Err`), covering version-id mint, object-
4708 // lock apply, tagging persist, and replication enqueue too.
4709 let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
4710 let _completion_guard = completion_lock.lock().await;
4711 // v0.8 #54 — fetch the per-upload context captured on Create.
4712 // `None` means an abandoned / unknown upload_id (gateway
4713 // crashed between Create and Complete, or pre-v0.8 state
4714 // restore); we still let the backend do its thing for
4715 // transparency, but we can't apply any SSE / version / lock /
4716 // tag / replication post-processing because we never captured
4717 // the recipe.
4718 let ctx = self.multipart_state.get(upload_id.as_str());
4719 // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
4720 // — some clients (boto3 / aws-sdk-cpp older versions) replay
4721 // the SSE-C triple on Complete too, and MinIO will choke if
4722 // they reach the backend.
4723 let _ = req.input.sse_customer_algorithm.take();
4724 let _ = req.input.sse_customer_key.take();
4725 let _ = req.input.sse_customer_key_md5.take();
4726 let mut resp = self.backend.complete_multipart_upload(req).await?;
4727 // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
4728 // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
4729 // partial fetch path が利用可能になる (Range request の帯域節約)。
4730 // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
4731 // できれば爆速になるので 1 回の cost は payback される
4732 //
4733 // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
4734 // the SSE encrypt re-PUT + versioning shadow-key rewrite +
4735 // replication source-bytes capture, so we GET once and reuse
4736 // the bytes for every post-processing step.
4737 let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
4738 let get_input = GetObjectInput {
4739 bucket: bucket.clone(),
4740 key: key.clone(),
4741 ..Default::default()
4742 };
4743 let get_req = S3Request {
4744 input: get_input,
4745 method: http::Method::GET,
4746 uri,
4747 headers: http::HeaderMap::new(),
4748 extensions: http::Extensions::new(),
4749 credentials: None,
4750 region: None,
4751 service: None,
4752 trailing_headers: None,
4753 };
4754 match self.backend.get_object(get_req).await {
4755 Ok(get_resp) => match get_resp.output.body {
4756 Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
4757 None => None,
4758 },
4759 Err(e) => {
4760 // v0.8.4 #71 (C-1 audit fix): a silent
4761 // `Err(_) => None` here is a SSE plaintext
4762 // leak. The post-processing block below only
4763 // runs the SSE re-encrypt branch when
4764 // `assembled_body.is_some()`, so swallowing a
4765 // backend error skipped the encrypt step and
4766 // left the multipart object on disk as
4767 // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
4768 // configured buckets. Same root-cause family
4769 // as v0.8 BUG-5; this branch closes the
4770 // remaining read-side window.
4771 //
4772 // We distinguish two cases:
4773 // - `NoSuchKey`: the object is genuinely
4774 // missing post-Complete. This is rare and
4775 // typically races with a concurrent
4776 // DeleteObject; there is nothing to re-
4777 // encrypt and no SSE markers to honour, so
4778 // falling through to the legacy
4779 // `assembled_body = None` path is safe.
4780 // - everything else (5xx, network, auth,
4781 // etc.): we must FAIL the Complete so the
4782 // client can retry. Returning Ok with
4783 // `assembled_body = None` would silently
4784 // skip the SSE re-encrypt and leave the
4785 // backend bytes plaintext.
4786 if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
4787 tracing::warn!(
4788 bucket = %bucket,
4789 key = %key,
4790 "multipart Complete: backend GET returned NoSuchKey; \
4791 skipping post-processing (object likely raced with DeleteObject)"
4792 );
4793 None
4794 } else {
4795 tracing::error!(
4796 bucket = %bucket,
4797 key = %key,
4798 error = %e,
4799 "multipart Complete: backend GET failed; failing the Complete \
4800 so the client retries (silent fall-through would skip SSE \
4801 re-encrypt and store plaintext)"
4802 );
4803 return Err(internal("multipart Complete: backend body fetch failed")(e));
4804 }
4805 }
4806 }
4807 } else {
4808 None
4809 };
4810 // Sidecar build (existing behaviour, gated on assembled body).
4811 //
4812 // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
4813 // going to SSE-encrypt the assembled body before re-PUT (the
4814 // single-PUT path applies the same suppression at L2271).
4815 // Stale offsets into the pre-encrypt body would break Range
4816 // GET on the encrypted on-disk bytes. `ctx.sse != None`
4817 // covers all three SSE modes captured at Create time.
4818 let mp_will_encrypt = ctx
4819 .as_ref()
4820 .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
4821 .unwrap_or(false);
4822 if let Some(ref body) = assembled_body
4823 && !mp_will_encrypt
4824 && let Ok(mut index) = build_index_from_body(body)
4825 {
4826 // v0.8.15 H-g: stamp the source-ETag / source-compressed-size
4827 // binding on the multipart sidecar. The single-PUT path
4828 // does this at L2519-L2521 via the backend's PUT response,
4829 // but Complete returns its own ETag (an opaque manifest
4830 // hash) so we have to HEAD the freshly-completed object
4831 // to pick up what backend actually wrote, then bind the
4832 // sidecar to those values. Without the binding, a
4833 // subsequent backend-side mutation (lifecycle rewrite,
4834 // out-of-band CopyObject) wouldn't trip the staleness
4835 // check on the next Range GET — the GET would happily
4836 // slice the new bytes at the old sidecar offsets, with
4837 // silent data corruption.
4838 if let Ok(uri) = safe_object_uri(&bucket, &key) {
4839 let head_req = S3Request {
4840 input: HeadObjectInput {
4841 bucket: bucket.clone(),
4842 key: key.clone(),
4843 ..Default::default()
4844 },
4845 method: http::Method::HEAD,
4846 uri,
4847 headers: http::HeaderMap::new(),
4848 extensions: http::Extensions::new(),
4849 credentials: None,
4850 region: None,
4851 service: None,
4852 trailing_headers: None,
4853 };
4854 if let Ok(head) = self.backend.head_object(head_req).await {
4855 index.source_etag = head.output.e_tag.as_ref().map(|t| t.value().to_string());
4856 index.source_compressed_size = head
4857 .output
4858 .content_length
4859 .and_then(|n| u64::try_from(n).ok());
4860 }
4861 // HEAD failure is non-fatal — the sidecar still works
4862 // as a v1-style best-effort fast path; the Range GET
4863 // simply falls back to a full read on any consistency
4864 // signal.
4865 }
4866 self.write_sidecar(&bucket, &key, &index).await;
4867 }
4868 // From here on, post-processing depends on the context —
4869 // short-circuit when the upload had no captured recipe
4870 // (legacy / crashed-Create / pre-v0.8 state restore).
4871 if let Some(ctx) = ctx {
4872 // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
4873 // is versioning-Enabled. The single-PUT path does this in
4874 // `put_object` ~L1968; multipart was the missing branch.
4875 // We mint here (post-Complete, before any re-PUT) so the
4876 // same vid threads into both the shadow-key rewrite and
4877 // the VersionEntry the manager records.
4878 let pending_version: Option<crate::versioning::PutOutcome> = self
4879 .versioning
4880 .as_ref()
4881 .map(|mgr| mgr.state(&bucket))
4882 .map(|state| match state {
4883 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
4884 version_id: crate::versioning::VersioningManager::new_version_id(),
4885 versioned_response: true,
4886 },
4887 crate::versioning::VersioningState::Suspended
4888 | crate::versioning::VersioningState::Unversioned => {
4889 crate::versioning::PutOutcome {
4890 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
4891 versioned_response: false,
4892 }
4893 }
4894 });
4895 // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
4896 // and re-PUT it to the backend so the on-disk bytes are
4897 // SSE-encrypted. The single-PUT path does this body-by-
4898 // body inside `put_object` (L1907-L1942); for multipart,
4899 // encrypt-per-part would require a multi-segment decrypt
4900 // path on GET — we instead do a single encrypt over the
4901 // assembled framed body so the existing GET decrypt
4902 // branch (`is_sse_encrypted` → `decrypt(body, source)` →
4903 // FrameIter) handles it unchanged.
4904 //
4905 // The cost is one extra round-trip per Complete for SSE-
4906 // enabled multipart (already-paid for the sidecar build).
4907 // For single-instance gateways pointing at a co-located
4908 // backend this is negligible; cross-region operators
4909 // would benefit from per-part encrypt + multi-segment
4910 // decrypt as a follow-up.
4911 let needs_re_put = matches!(
4912 ctx.sse,
4913 crate::multipart_state::MultipartSseMode::SseS4
4914 | crate::multipart_state::MultipartSseMode::SseC { .. }
4915 | crate::multipart_state::MultipartSseMode::SseKms { .. }
4916 ) || pending_version
4917 .as_ref()
4918 .map(|pv| pv.versioned_response)
4919 .unwrap_or(false);
4920 // v0.8.11 CRIT-2 fix: seed the replication body with the
4921 // pre-encrypt assembled bytes, but overwrite it with the
4922 // post-encrypt `new_body` once the re-PUT branch lands.
4923 // The previous "snapshot in advance" pattern shipped the
4924 // *plaintext* framed body to the destination bucket even
4925 // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
4926 // the destination would then fail to decrypt (or, worse,
4927 // succeed in handing out plaintext that the source had
4928 // promised was encrypted at rest). When `needs_re_put`
4929 // is false (no SSE, no versioning), the backend still
4930 // holds the original plaintext-framed bytes, and the
4931 // seed value is what the destination should receive.
4932 let mut replication_body = assembled_body.clone();
4933 let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
4934 if needs_re_put && let Some(body) = assembled_body {
4935 // v0.8.1 #58: same Zeroizing pattern as put_object's
4936 // single-PUT KMS branch — DEK plaintext lives in
4937 // `Zeroizing<[u8; 32]>` for the lifetime of this
4938 // Complete handler, then is wiped on drop.
4939 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
4940 if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
4941 {
4942 let kms = self.kms.as_ref().ok_or_else(|| {
4943 S3Error::with_message(
4944 S3ErrorCode::InvalidRequest,
4945 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4946 )
4947 })?;
4948 let (dek, wrapped) =
4949 kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
4950 if dek.len() != 32 {
4951 return Err(S3Error::with_message(
4952 S3ErrorCode::InternalError,
4953 format!(
4954 "KMS backend returned a DEK of {} bytes (expected 32)",
4955 dek.len()
4956 ),
4957 ));
4958 }
4959 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
4960 zeroize::Zeroizing::new([0u8; 32]);
4961 dek_arr.copy_from_slice(&dek);
4962 // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
4963 Some((dek_arr, wrapped))
4964 } else {
4965 None
4966 };
4967 // Build the new metadata map: re-fetch via HEAD so
4968 // the multipart / codec markers the backend stamped
4969 // on Create flow through unchanged, then layer the
4970 // SSE markers on top.
4971 let head_req = S3Request {
4972 input: HeadObjectInput {
4973 bucket: bucket.clone(),
4974 key: key.clone(),
4975 ..Default::default()
4976 },
4977 method: http::Method::HEAD,
4978 uri: safe_object_uri(&bucket, &key)?,
4979 headers: http::HeaderMap::new(),
4980 extensions: http::Extensions::new(),
4981 credentials: None,
4982 region: None,
4983 service: None,
4984 trailing_headers: None,
4985 };
4986 let mut new_metadata: std::collections::HashMap<String, String> =
4987 match self.backend.head_object(head_req).await {
4988 Ok(h) => h.output.metadata.unwrap_or_default(),
4989 Err(_) => std::collections::HashMap::new(),
4990 };
4991 let new_body = match &ctx.sse {
4992 crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
4993 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
4994 new_metadata.insert("s4-sse-type".into(), "AES256".into());
4995 new_metadata.insert(
4996 "s4-sse-c-key-md5".into(),
4997 base64::engine::general_purpose::STANDARD.encode(key_md5),
4998 );
4999 // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
5000 // auto-deref through one explicit binding so
5001 // `SseSource::CustomerKey` gets the `&[u8; 32]`
5002 // it expects (mirrors the SSE-KMS DEK shape
5003 // a few lines down).
5004 let key_ref: &[u8; 32] = key;
5005 crate::sse::encrypt_with_source(
5006 &body,
5007 crate::sse::SseSource::CustomerKey {
5008 key: key_ref,
5009 key_md5,
5010 },
5011 )
5012 }
5013 crate::multipart_state::MultipartSseMode::SseKms { .. } => {
5014 let (dek, wrapped) = kms_wrap
5015 .as_ref()
5016 .expect("SseKms branch implies kms_wrap is Some");
5017 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5018 new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
5019 new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
5020 // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
5021 // to `&[u8; 32]` (same shape as the put_object
5022 // single-PUT branch).
5023 let dek_ref: &[u8; 32] = dek;
5024 crate::sse::encrypt_with_source(
5025 &body,
5026 crate::sse::SseSource::Kms {
5027 dek: dek_ref,
5028 wrapped,
5029 },
5030 )
5031 }
5032 crate::multipart_state::MultipartSseMode::SseS4 => {
5033 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
5034 S3Error::with_message(
5035 S3ErrorCode::InternalError,
5036 "SSE-S4 captured at Create but keyring missing at Complete",
5037 )
5038 })?;
5039 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5040 // SSE-S4 deliberately omits `s4-sse-type` so
5041 // HEAD doesn't falsely advertise AWS-style
5042 // SSE-S3 (matches the put_object L1929-L1939
5043 // comment).
5044 // v0.8 #52: same chunk_size dispatch as the
5045 // single-PUT branch — multipart Complete
5046 // re-encrypts the assembled body, so honoring
5047 // the chunked path here is required to keep
5048 // GET streaming on multipart-uploaded objects.
5049 if self.sse_chunk_size > 0 {
5050 crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
5051 .map_err(|e| {
5052 S3Error::with_message(
5053 S3ErrorCode::InternalError,
5054 format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
5055 )
5056 })?
5057 } else {
5058 crate::sse::encrypt_v2(&body, keyring)
5059 }
5060 }
5061 crate::multipart_state::MultipartSseMode::None => body.clone(),
5062 };
5063 // v0.8 #54 BUG-6 fix: write the re-PUT under the
5064 // shadow key so the version chain doesn't overwrite
5065 // the previous version on a versioned bucket. The
5066 // original (unshadowed) key was assembled by the
5067 // backend on Complete; we delete it after the shadow
5068 // PUT lands.
5069 let put_target_key = if let Some(pv) = pending_version.as_ref() {
5070 if pv.versioned_response {
5071 versioned_shadow_key(&key, &pv.version_id)
5072 } else {
5073 key.clone()
5074 }
5075 } else {
5076 key.clone()
5077 };
5078 let new_body_len = new_body.len() as i64;
5079 let put_req = S3Request {
5080 input: PutObjectInput {
5081 bucket: bucket.clone(),
5082 key: put_target_key.clone(),
5083 body: Some(bytes_to_blob(new_body.clone())),
5084 metadata: Some(new_metadata.clone()),
5085 content_length: Some(new_body_len),
5086 ..Default::default()
5087 },
5088 method: http::Method::PUT,
5089 uri: safe_object_uri(&bucket, &put_target_key)?,
5090 headers: http::HeaderMap::new(),
5091 extensions: http::Extensions::new(),
5092 credentials: None,
5093 region: None,
5094 service: None,
5095 trailing_headers: None,
5096 };
5097 self.backend.put_object(put_req).await?;
5098 // v0.8.11 CRIT-2 fix: refresh the replication snapshot
5099 // with the bytes that were actually persisted to the
5100 // backend (post-SSE-encrypt for SSE modes; identical to
5101 // `body` for `MultipartSseMode::None` + versioning-only
5102 // re-PUT). The destination then sees the same on-disk
5103 // shape the source does, and a destination GET decrypts
5104 // correctly when SSE is on.
5105 replication_body = Some(new_body.clone());
5106 // If we rewrote the storage key (versioning shadow),
5107 // we must drop the original (unshadowed) Complete-
5108 // assembled bytes so subsequent listings don't see a
5109 // duplicate.
5110 if put_target_key != key {
5111 let del_req = S3Request {
5112 input: DeleteObjectInput {
5113 bucket: bucket.clone(),
5114 key: key.clone(),
5115 ..Default::default()
5116 },
5117 method: http::Method::DELETE,
5118 uri: safe_object_uri(&bucket, &key)?,
5119 headers: http::HeaderMap::new(),
5120 extensions: http::Extensions::new(),
5121 credentials: None,
5122 region: None,
5123 service: None,
5124 trailing_headers: None,
5125 };
5126 let _ = self.backend.delete_object(del_req).await;
5127 }
5128 applied_metadata = Some(new_metadata);
5129 }
5130 // v0.8 #54 BUG-6 commit: register the new version with
5131 // the VersioningManager so list_object_versions /
5132 // GET ?versionId= see it.
5133 if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
5134 let etag = resp
5135 .output
5136 .e_tag
5137 .clone()
5138 .map(ETag::into_value)
5139 .unwrap_or_default();
5140 let now = chrono::Utc::now();
5141 mgr.commit_put_with_version(
5142 &bucket,
5143 &key,
5144 crate::versioning::VersionEntry {
5145 version_id: pv.version_id.clone(),
5146 etag,
5147 size: replication_body
5148 .as_ref()
5149 .map(|b| b.len() as u64)
5150 .unwrap_or(0),
5151 is_delete_marker: false,
5152 created_at: now,
5153 },
5154 );
5155 if pv.versioned_response {
5156 resp.output.version_id = Some(pv.version_id.clone());
5157 }
5158 }
5159 // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
5160 // recipe + auto-apply the bucket default. Mirrors the
5161 // put_object L2057-L2074 block.
5162 if let Some(mgr) = self.object_lock.as_ref() {
5163 if ctx.object_lock_mode.is_some()
5164 || ctx.object_lock_retain_until.is_some()
5165 || ctx.object_lock_legal_hold
5166 {
5167 let mut state = mgr.get(&bucket, &key).unwrap_or_default();
5168 if let Some(m) = ctx.object_lock_mode {
5169 state.mode = Some(m);
5170 }
5171 if let Some(u) = ctx.object_lock_retain_until {
5172 state.retain_until = Some(u);
5173 }
5174 if ctx.object_lock_legal_hold {
5175 state.legal_hold_on = true;
5176 }
5177 mgr.set(&bucket, &key, state);
5178 }
5179 mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5180 }
5181 // v0.8 #54 BUG-9 fix: persist the captured tags via the
5182 // TagManager so GetObjectTagging returns them.
5183 if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5184 mgr.put_object_tags(&bucket, &key, tags.clone());
5185 }
5186 // SSE-C / SSE-KMS response echo. The
5187 // CompleteMultipartUploadOutput only exposes
5188 // `server_side_encryption` + `ssekms_key_id` (no
5189 // sse_customer_* — those round-tripped on Create / parts).
5190 match &ctx.sse {
5191 crate::multipart_state::MultipartSseMode::SseC { .. } => {
5192 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5193 ServerSideEncryption::AES256,
5194 ));
5195 }
5196 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5197 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5198 ServerSideEncryption::AWS_KMS,
5199 ));
5200 resp.output.ssekms_key_id = Some(key_id.clone());
5201 }
5202 _ => {}
5203 }
5204 // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5205 // like put_object L2165 does. We hand the dispatcher the
5206 // assembled body bytes (post-encrypt where applicable, so
5207 // the destination ends up byte-identical to the source's
5208 // on-disk shape) plus the metadata that was actually
5209 // committed.
5210 let replication_body_bytes = replication_body.unwrap_or_default();
5211 // v0.8.2 #61: thread the multipart-Complete `pending_version`
5212 // through so a versioning-Enabled source's destination
5213 // receives the same shadow-key path (mirror of the
5214 // single-PUT branch above).
5215 self.spawn_replication_if_matched(
5216 &bucket,
5217 &key,
5218 &ctx.tags,
5219 &replication_body_bytes,
5220 &applied_metadata,
5221 true,
5222 pending_version.as_ref(),
5223 );
5224 self.multipart_state.remove(upload_id.as_str());
5225 }
5226 // v0.8.1 #59 janitor: best-effort sweep of stale completion
5227 // locks while we are still on the critical path of a single
5228 // Complete (so steady-state workloads of unique keys don't
5229 // accumulate `DashMap` entries). The sweep only retires
5230 // entries whose `Arc::strong_count == 1`, so any other in-
5231 // flight Complete on a different key keeps its lock alive.
5232 // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5233 // alive across this call; it's reaped on the next Complete or
5234 // the next caller-driven prune.
5235 self.multipart_state.prune_completion_locks();
5236 Ok(resp)
5237 }
5238 async fn abort_multipart_upload(
5239 &self,
5240 req: S3Request<AbortMultipartUploadInput>,
5241 ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5242 // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5243 // — the AWS-spec action verb for this operation. Without the
5244 // gate, anyone who could guess an upload_id could throw away
5245 // someone else's in-flight multipart upload.
5246 let abort_bucket = req.input.bucket.clone();
5247 let abort_key = req.input.key.clone();
5248 self.enforce_policy(
5249 &req,
5250 "s3:AbortMultipartUpload",
5251 &abort_bucket,
5252 Some(&abort_key),
5253 )?;
5254 // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5255 // set) promptly so an aborted upload doesn't leak the
5256 // customer's key into a long-running gateway's RSS.
5257 //
5258 // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5259 // FIRST, then drop in-process state ONLY on success. The
5260 // previous order ("remove → call backend") meant a transient
5261 // backend abort failure (5xx, network) wiped the SSE-C key
5262 // bytes locally while leaving the parts on the backend, so a
5263 // client retry would have to re-validate the SSE-C key against
5264 // a context the gateway no longer has — and the retried abort
5265 // would still hit the unaborted backend parts. Calling the
5266 // backend first lets the failure propagate to the client with
5267 // state intact for a clean retry; only on success do we wipe
5268 // the local state.
5269 let upload_id = req.input.upload_id.as_str().to_owned();
5270 let resp = self.backend.abort_multipart_upload(req).await?;
5271 self.multipart_state.remove(&upload_id);
5272 Ok(resp)
5273 }
5274 async fn list_multipart_uploads(
5275 &self,
5276 req: S3Request<ListMultipartUploadsInput>,
5277 ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5278 self.backend.list_multipart_uploads(req).await
5279 }
5280 async fn list_parts(
5281 &self,
5282 req: S3Request<ListPartsInput>,
5283 ) -> S3Result<S3Response<ListPartsOutput>> {
5284 self.backend.list_parts(req).await
5285 }
5286
// =========================================================================
// Phase 2 — pure passthrough delegations. S4 has no compression hook for
// these operations, so their behaviour is identical to the backend's (= AWS S3).
//
// Known limitations:
// - copy_object / upload_part_copy: when the source object is S4-compressed,
//   the backend simply copies the bytes, so the metadata (s4-codec etc.) is
//   copied along with them (AWS S3 default = MetadataDirective COPY). GET can
//   then decompress correctly via the manifest. If the metadata is
//   overwritten with MetadataDirective REPLACE, the compression metadata is
//   lost and the object breaks — callers must guard against this operationally.
// - list_object_versions: on a versioning-enabled bucket every version keeps
//   its S4 metadata, so old versions can still be GET correctly through S4.
// =========================================================================
5300
5301 // ---- Object ACL / tagging / attributes ----
5302 async fn get_object_acl(
5303 &self,
5304 req: S3Request<GetObjectAclInput>,
5305 ) -> S3Result<S3Response<GetObjectAclOutput>> {
5306 self.backend.get_object_acl(req).await
5307 }
5308 async fn put_object_acl(
5309 &self,
5310 req: S3Request<PutObjectAclInput>,
5311 ) -> S3Result<S3Response<PutObjectAclOutput>> {
5312 self.backend.put_object_acl(req).await
5313 }
5314 // v0.6 #39: object tagging — when a `TagManager` is attached the
5315 // configuration / per-(bucket, key) state lives in the manager and
5316 // these handlers serve directly from it; when no manager is
5317 // attached they fall back to the backend (legacy passthrough so
5318 // v0.5 deployments are unaffected).
5319 async fn get_object_tagging(
5320 &self,
5321 req: S3Request<GetObjectTaggingInput>,
5322 ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5323 let Some(mgr) = self.tagging.as_ref() else {
5324 return self.backend.get_object_tagging(req).await;
5325 };
5326 let tags = mgr
5327 .get_object_tags(&req.input.bucket, &req.input.key)
5328 .unwrap_or_default();
5329 Ok(S3Response::new(GetObjectTaggingOutput {
5330 tag_set: tagset_to_aws(&tags),
5331 ..Default::default()
5332 }))
5333 }
5334 async fn put_object_tagging(
5335 &self,
5336 req: S3Request<PutObjectTaggingInput>,
5337 ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
5338 let Some(mgr) = self.tagging.as_ref() else {
5339 return self.backend.put_object_tagging(req).await;
5340 };
5341 let bucket = req.input.bucket.clone();
5342 let key = req.input.key.clone();
5343 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
5344 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5345 // v0.6 #39: gate via IAM policy with both the request tags
5346 // (`s3:RequestObjectTag/<key>`) and any existing tags on the
5347 // target object (`s3:ExistingObjectTag/<key>`).
5348 let existing = mgr.get_object_tags(&bucket, &key);
5349 self.enforce_policy_with_extra(
5350 &req,
5351 "s3:PutObjectTagging",
5352 &bucket,
5353 Some(&key),
5354 Some(&parsed),
5355 existing.as_ref(),
5356 )?;
5357 mgr.put_object_tags(&bucket, &key, parsed);
5358 Ok(S3Response::new(PutObjectTaggingOutput::default()))
5359 }
5360 async fn delete_object_tagging(
5361 &self,
5362 req: S3Request<DeleteObjectTaggingInput>,
5363 ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
5364 let Some(mgr) = self.tagging.as_ref() else {
5365 return self.backend.delete_object_tagging(req).await;
5366 };
5367 let bucket = req.input.bucket.clone();
5368 let key = req.input.key.clone();
5369 let existing = mgr.get_object_tags(&bucket, &key);
5370 self.enforce_policy_with_extra(
5371 &req,
5372 "s3:DeleteObjectTagging",
5373 &bucket,
5374 Some(&key),
5375 None,
5376 existing.as_ref(),
5377 )?;
5378 mgr.delete_object_tags(&bucket, &key);
5379 Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
5380 }
5381 async fn get_object_attributes(
5382 &self,
5383 req: S3Request<GetObjectAttributesInput>,
5384 ) -> S3Result<S3Response<GetObjectAttributesOutput>> {
5385 self.backend.get_object_attributes(req).await
5386 }
5387 async fn restore_object(
5388 &self,
5389 req: S3Request<RestoreObjectInput>,
5390 ) -> S3Result<S3Response<RestoreObjectOutput>> {
5391 self.backend.restore_object(req).await
5392 }
5393 async fn upload_part_copy(
5394 &self,
5395 req: S3Request<UploadPartCopyInput>,
5396 ) -> S3Result<S3Response<UploadPartCopyOutput>> {
5397 // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
5398 // destination PUT + source GET.
5399 let dst_bucket = req.input.bucket.clone();
5400 let dst_key = req.input.key.clone();
5401 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
5402 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
5403 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
5404 }
5405 self.enforce_rate_limit(&req, &dst_bucket)?;
5406 // v0.2 #6: byte-range aware copy when the source is S4-framed.
5407 //
5408 // For a framed source (multipart upload OR single-PUT framed-v2),
5409 // a naive byte-range passthrough would copy compressed bytes that
5410 // don't align with S4 frame boundaries — silently corrupting the
5411 // result. Instead we GET the source through S4 (which handles
5412 // decompression + Range), re-compress + re-frame as a new part,
5413 // and forward as upload_part. For non-framed sources (S4-untouched
5414 // raw objects), passthrough is correct and we keep the original
5415 // (cheaper) code path.
5416 // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
5417 // copy-source header. Without this, a versioned source bucket
5418 // copy that pins a specific old version would silently fall
5419 // back to "latest", assembling wrong bytes into the destination
5420 // multipart object (silent data corruption).
5421 let CopySource::Bucket {
5422 bucket: src_bucket,
5423 key: src_key,
5424 version_id: src_version_id,
5425 } = &req.input.copy_source
5426 else {
5427 return self.backend.upload_part_copy(req).await;
5428 };
5429 let src_bucket = src_bucket.to_string();
5430 let src_key = src_key.to_string();
5431 let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);
5432
5433 // Probe metadata to decide whether the source needs S4-aware copy.
5434 let head_input = HeadObjectInput {
5435 bucket: src_bucket.clone(),
5436 key: src_key.clone(),
5437 version_id: src_version_id.clone(),
5438 ..Default::default()
5439 };
5440 let head_req = S3Request {
5441 input: head_input,
5442 method: http::Method::HEAD,
5443 uri: req.uri.clone(),
5444 headers: req.headers.clone(),
5445 extensions: http::Extensions::new(),
5446 credentials: req.credentials.clone(),
5447 region: req.region.clone(),
5448 service: req.service.clone(),
5449 trailing_headers: None,
5450 };
5451 let needs_s4_copy = match self.backend.head_object(head_req).await {
5452 Ok(h) => {
5453 is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
5454 }
5455 Err(_) => false,
5456 };
5457 if !needs_s4_copy {
5458 return self.backend.upload_part_copy(req).await;
5459 }
5460
5461 // Resolve the optional source byte range to pass to GET.
5462 let source_range = req
5463 .input
5464 .copy_source_range
5465 .as_ref()
5466 .map(|r| parse_copy_source_range(r))
5467 .transpose()
5468 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
5469
5470 // GET source via S4 (handles decompression + sidecar partial fetch
5471 // when range is present). The result is the requested user-visible
5472 // byte range, fully decompressed. version_id is propagated so
5473 // pinned-version copies fetch the exact version requested.
5474 let mut get_input = GetObjectInput {
5475 bucket: src_bucket.clone(),
5476 key: src_key.clone(),
5477 version_id: src_version_id.clone(),
5478 ..Default::default()
5479 };
5480 get_input.range = source_range;
5481 let get_req = S3Request {
5482 input: get_input,
5483 method: http::Method::GET,
5484 uri: req.uri.clone(),
5485 headers: req.headers.clone(),
5486 extensions: http::Extensions::new(),
5487 credentials: req.credentials.clone(),
5488 region: req.region.clone(),
5489 service: req.service.clone(),
5490 trailing_headers: None,
5491 };
5492 let get_resp = self.get_object(get_req).await?;
5493 let blob = get_resp.output.body.ok_or_else(|| {
5494 S3Error::with_message(
5495 S3ErrorCode::InternalError,
5496 "upload_part_copy: empty body from source GET",
5497 )
5498 })?;
5499 let bytes = collect_blob(blob, self.max_body_bytes)
5500 .await
5501 .map_err(internal("collect upload_part_copy source body"))?;
5502
5503 // Compress + frame as a fresh part (mirrors upload_part path).
5504 let sample_len = bytes.len().min(SAMPLE_BYTES);
5505 // v0.8 #56: same size-hint promotion as the upload_part path.
5506 let codec_kind = self
5507 .dispatcher
5508 .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
5509 .await;
5510 let original_size = bytes.len() as u64;
5511 // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
5512 let (compress_res, tel) = self
5513 .registry
5514 .compress_with_telemetry(bytes, codec_kind)
5515 .await;
5516 stamp_gpu_compress_telemetry(&tel);
5517 let (compressed, manifest) =
5518 compress_res.map_err(internal("registry compress upload_part_copy"))?;
5519 let header = FrameHeader {
5520 codec: codec_kind,
5521 original_size,
5522 compressed_size: compressed.len() as u64,
5523 crc32c: manifest.crc32c,
5524 };
5525 let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
5526 write_frame(&mut framed, header, &compressed);
5527 let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
5528 if !likely_final {
5529 pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
5530 }
5531 let framed_bytes = framed.freeze();
5532 let framed_len = framed_bytes.len() as i64;
5533
5534 // Forward as upload_part to the destination multipart upload.
5535 let part_input = UploadPartInput {
5536 bucket: req.input.bucket.clone(),
5537 key: req.input.key.clone(),
5538 part_number: req.input.part_number,
5539 upload_id: req.input.upload_id.clone(),
5540 body: Some(bytes_to_blob(framed_bytes)),
5541 content_length: Some(framed_len),
5542 ..Default::default()
5543 };
5544 let part_req = S3Request {
5545 input: part_input,
5546 method: http::Method::PUT,
5547 uri: req.uri.clone(),
5548 headers: req.headers.clone(),
5549 extensions: http::Extensions::new(),
5550 credentials: req.credentials.clone(),
5551 region: req.region.clone(),
5552 service: req.service.clone(),
5553 trailing_headers: None,
5554 };
5555 let upload_resp = self.backend.upload_part(part_req).await?;
5556
5557 let copy_output = UploadPartCopyOutput {
5558 copy_part_result: Some(CopyPartResult {
5559 e_tag: upload_resp.output.e_tag.clone(),
5560 ..Default::default()
5561 }),
5562 ..Default::default()
5563 };
5564 Ok(S3Response::new(copy_output))
5565 }
5566
5567 // ---- Object lock / retention / legal hold (v0.5 #30) ----
5568 //
5569 // When an `ObjectLockManager` is attached the configuration / per-object
5570 // state lives in the manager and these handlers serve directly from it;
5571 // when no manager is attached they fall back to the backend (legacy
5572 // passthrough so v0.4 deployments are unaffected).
5573 async fn get_object_lock_configuration(
5574 &self,
5575 req: S3Request<GetObjectLockConfigurationInput>,
5576 ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
5577 self.enforce_policy(
5578 &req,
5579 "s3:GetBucketObjectLockConfiguration",
5580 &req.input.bucket,
5581 None,
5582 )?;
5583 if let Some(mgr) = self.object_lock.as_ref() {
5584 let cfg = mgr
5585 .bucket_default(&req.input.bucket)
5586 .map(|d| ObjectLockConfiguration {
5587 object_lock_enabled: Some(ObjectLockEnabled::from_static(
5588 ObjectLockEnabled::ENABLED,
5589 )),
5590 rule: Some(ObjectLockRule {
5591 default_retention: Some(DefaultRetention {
5592 days: Some(d.retention_days as i32),
5593 mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
5594 crate::object_lock::LockMode::Governance => {
5595 ObjectLockRetentionMode::GOVERNANCE
5596 }
5597 crate::object_lock::LockMode::Compliance => {
5598 ObjectLockRetentionMode::COMPLIANCE
5599 }
5600 })),
5601 years: None,
5602 }),
5603 }),
5604 });
5605 let output = GetObjectLockConfigurationOutput {
5606 object_lock_configuration: cfg,
5607 };
5608 return Ok(S3Response::new(output));
5609 }
5610 self.backend.get_object_lock_configuration(req).await
5611 }
5612 async fn put_object_lock_configuration(
5613 &self,
5614 req: S3Request<PutObjectLockConfigurationInput>,
5615 ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
5616 self.enforce_policy(
5617 &req,
5618 "s3:PutBucketObjectLockConfiguration",
5619 &req.input.bucket,
5620 None,
5621 )?;
5622 if let Some(mgr) = self.object_lock.as_ref() {
5623 let bucket = req.input.bucket.clone();
5624 if let Some(cfg) = req.input.object_lock_configuration.as_ref()
5625 && let Some(rule) = cfg.rule.as_ref()
5626 && let Some(d) = rule.default_retention.as_ref()
5627 {
5628 let mode = d
5629 .mode
5630 .as_ref()
5631 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
5632 .ok_or_else(|| {
5633 S3Error::with_message(
5634 S3ErrorCode::InvalidRequest,
5635 "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
5636 )
5637 })?;
5638 // S3 spec: exactly one of Days / Years (we accept Days
5639 // outright and convert Years → Days for storage; Years
5640 // is just a UX shorthand on the wire).
5641 let days: u32 = match (d.days, d.years) {
5642 (Some(d), None) if d > 0 => d as u32,
5643 (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
5644 _ => {
5645 return Err(S3Error::with_message(
5646 S3ErrorCode::InvalidRequest,
5647 "Object Lock default retention requires exactly one of Days or Years (positive integer)",
5648 ));
5649 }
5650 };
5651 mgr.set_bucket_default(
5652 &bucket,
5653 crate::object_lock::BucketObjectLockDefault {
5654 mode,
5655 retention_days: days,
5656 },
5657 );
5658 }
5659 return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
5660 }
5661 self.backend.put_object_lock_configuration(req).await
5662 }
5663 async fn get_object_legal_hold(
5664 &self,
5665 req: S3Request<GetObjectLegalHoldInput>,
5666 ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
5667 let key = req.input.key.clone();
5668 self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
5669 if let Some(mgr) = self.object_lock.as_ref() {
5670 let on = mgr
5671 .get(&req.input.bucket, &req.input.key)
5672 .map(|s| s.legal_hold_on)
5673 .unwrap_or(false);
5674 let status = ObjectLockLegalHoldStatus::from_static(if on {
5675 ObjectLockLegalHoldStatus::ON
5676 } else {
5677 ObjectLockLegalHoldStatus::OFF
5678 });
5679 let output = GetObjectLegalHoldOutput {
5680 legal_hold: Some(ObjectLockLegalHold {
5681 status: Some(status),
5682 }),
5683 };
5684 return Ok(S3Response::new(output));
5685 }
5686 self.backend.get_object_legal_hold(req).await
5687 }
5688 async fn put_object_legal_hold(
5689 &self,
5690 req: S3Request<PutObjectLegalHoldInput>,
5691 ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
5692 let key = req.input.key.clone();
5693 self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
5694 if let Some(mgr) = self.object_lock.as_ref() {
5695 let on = req
5696 .input
5697 .legal_hold
5698 .as_ref()
5699 .and_then(|h| h.status.as_ref())
5700 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
5701 .unwrap_or(false);
5702 mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
5703 return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
5704 }
5705 self.backend.put_object_legal_hold(req).await
5706 }
5707 async fn get_object_retention(
5708 &self,
5709 req: S3Request<GetObjectRetentionInput>,
5710 ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
5711 let key = req.input.key.clone();
5712 self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
5713 if let Some(mgr) = self.object_lock.as_ref() {
5714 let retention = mgr
5715 .get(&req.input.bucket, &req.input.key)
5716 .filter(|s| s.mode.is_some() || s.retain_until.is_some())
5717 .map(|s| {
5718 let mode = s.mode.map(|m| {
5719 ObjectLockRetentionMode::from_static(match m {
5720 crate::object_lock::LockMode::Governance => {
5721 ObjectLockRetentionMode::GOVERNANCE
5722 }
5723 crate::object_lock::LockMode::Compliance => {
5724 ObjectLockRetentionMode::COMPLIANCE
5725 }
5726 })
5727 });
5728 let until = s.retain_until.map(chrono_utc_to_timestamp);
5729 ObjectLockRetention {
5730 mode,
5731 retain_until_date: until,
5732 }
5733 });
5734 let output = GetObjectRetentionOutput { retention };
5735 return Ok(S3Response::new(output));
5736 }
5737 self.backend.get_object_retention(req).await
5738 }
/// Applies a new retention mode and/or retain-until date to an object.
///
/// Flow:
/// 1. Enforce `s3:PutObjectRetention`.
/// 2. Without an `ObjectLockManager`, forward to the backend.
/// 3. Otherwise load the existing per-object state (default when absent),
///    reject changes that violate the immutability rules below, then merge
///    the supplied fields into the state and store it.
///
/// Only fields present in the request overwrite the stored state: a mode
/// that `LockMode::from_aws_str` does not recognise, or a date for which
/// `timestamp_to_chrono_utc` yields `None`, leaves the stored value
/// untouched rather than producing an error.
///
/// # Errors
/// - `InvalidRequest` when the `Retention` element is missing.
/// - `AccessDenied` when an active Compliance lock would be downgraded to
///   Governance or shortened, or when an active Governance lock would be
///   shortened without an authorised bypass.
async fn put_object_retention(
    &self,
    req: S3Request<PutObjectRetentionInput>,
) -> S3Result<S3Response<PutObjectRetentionOutput>> {
    let key = req.input.key.clone();
    self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
    if let Some(mgr) = self.object_lock.as_ref() {
        let bucket = req.input.bucket.clone();
        let key = req.input.key.clone();
        // v0.8.12 HIGH-7 fix: the bypass header gates Governance
        // shortening only when the caller has the matching IAM
        // action explicitly allowed; otherwise it's silently
        // dropped to `false` and the "shortening Governance
        // requires bypass" branch below rejects.
        let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
        let bypass = if bypass_header {
            // A failed policy check here does not fail the request by
            // itself; it only withholds the bypass privilege.
            self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
                .is_ok()
        } else {
            false
        };
        let retention = req.input.retention.as_ref().ok_or_else(|| {
            S3Error::with_message(
                S3ErrorCode::InvalidRequest,
                "PutObjectRetention requires a Retention element",
            )
        })?;
        // `None` here means "mode absent or not recognised".
        let new_mode = retention
            .mode
            .as_ref()
            .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
        // `None` here means "date absent or not convertible".
        let new_until = retention
            .retain_until_date
            .as_ref()
            .map(timestamp_to_chrono_utc)
            .unwrap_or(None);
        let now = chrono::Utc::now();
        let existing = mgr.get(&bucket, &key).unwrap_or_default();
        // S3 immutability rules:
        // - Compliance is one-way: once set, mode cannot move to
        //   Governance, and retain-until cannot be shortened.
        // - Governance can be lengthened freely; shortened only
        //   with bypass=true.
        if let Some(existing_mode) = existing.mode
            && existing_mode == crate::object_lock::LockMode::Compliance
            && existing.is_locked(now)
        {
            if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
                return Err(S3Error::with_message(
                    S3ErrorCode::AccessDenied,
                    "Cannot downgrade Compliance retention to Governance while lock is active",
                ));
            }
            if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
                && next < prev
            {
                return Err(S3Error::with_message(
                    S3ErrorCode::AccessDenied,
                    "Cannot shorten Compliance retention while lock is active",
                ));
            }
        }
        // Governance: only a shortening of an active lock without an
        // authorised bypass is refused.
        if let Some(existing_mode) = existing.mode
            && existing_mode == crate::object_lock::LockMode::Governance
            && existing.is_locked(now)
            && !bypass
            && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
            && next < prev
        {
            return Err(S3Error::with_message(
                S3ErrorCode::AccessDenied,
                "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
            ));
        }
        // Merge: keep existing fields the request did not supply.
        let mut state = existing;
        if new_mode.is_some() {
            state.mode = new_mode;
        }
        if new_until.is_some() {
            state.retain_until = new_until;
        }
        mgr.set(&bucket, &key, state);
        return Ok(S3Response::new(PutObjectRetentionOutput::default()));
    }
    self.backend.put_object_retention(req).await
}
5825
5826 // ---- Versioning ----
5827 // list_object_versions is implemented above in the compression-hook
5828 // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
5829 // VersioningManager is attached (v0.5 #34), serves chains directly
5830 // from the in-memory index.
5831 async fn get_bucket_versioning(
5832 &self,
5833 req: S3Request<GetBucketVersioningInput>,
5834 ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
5835 // v0.5 #34: when a VersioningManager is attached, the bucket's
5836 // versioning state lives in the manager (= S4-server's
5837 // authoritative source). Pass-through hits the backend only
5838 // when no manager is configured (legacy v0.4 behaviour).
5839 if let Some(mgr) = self.versioning.as_ref() {
5840 let output = match mgr.state(&req.input.bucket).as_aws_status() {
5841 Some(s) => GetBucketVersioningOutput {
5842 status: Some(BucketVersioningStatus::from(s.to_owned())),
5843 ..Default::default()
5844 },
5845 None => GetBucketVersioningOutput::default(),
5846 };
5847 return Ok(S3Response::new(output));
5848 }
5849 self.backend.get_bucket_versioning(req).await
5850 }
/// Changes a bucket's versioning state, optionally gated by MFA.
///
/// Two independent, optional managers take part:
/// - `mfa_delete`: when the request body carries an `MfaDelete` element,
///   the `x-amz-mfa` header must verify against the bucket's stored secret
///   before anything else happens; on success the MFA-Delete flag is stored.
/// - `versioning`: when attached, the requested `Status` is recorded in the
///   manager and the handler returns without calling the backend.
///
/// With no `VersioningManager` the request is forwarded to the backend.
///
/// # Errors
/// Returns the S3 error mapped from `MfaError::Missing` (no header) or
/// `MfaError::InvalidCode` (any other verification failure) when the MFA
/// gate rejects the request.
async fn put_bucket_versioning(
    &self,
    req: S3Request<PutBucketVersioningInput>,
) -> S3Result<S3Response<PutBucketVersioningOutput>> {
    // v0.6 #42: MFA gating on the `PutBucketVersioning` request
    // itself. S3 spec: when the request body carries an
    // `MfaDelete` element (either `Enabled` or `Disabled`), the
    // request must include a valid `x-amz-mfa` token — both for
    // the *first* enable (so the operator can't quietly side-step
    // the gate by never enabling it) and for any subsequent
    // change (so a leaked credential alone can't disable MFA
    // Delete to bypass it on subsequent DELETEs). Requests that
    // omit the `MfaDelete` element entirely (i.e. they flip only
    // `Status`) skip this gate, matching AWS.
    if let Some(mgr) = self.mfa_delete.as_ref()
        && let Some(target_enabled) = req
            .input
            .versioning_configuration
            .mfa_delete
            .as_ref()
            .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
    {
        let bucket = req.input.bucket.clone();
        let header = req.input.mfa.as_deref();
        let secret = mgr.lookup_secret(&bucket);
        // Verification needs BOTH a header and a stored secret; the
        // serial must match and the TOTP code must verify.
        let verified = match (header, secret.as_ref()) {
            (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
                Ok((serial, code)) => {
                    serial == s.serial
                        && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
                }
                Err(_) => false,
            },
            _ => false,
        };
        if !verified {
            crate::metrics::record_mfa_delete_denial(&bucket);
            // A header that is present but unusable (parse failure,
            // wrong serial/code, or no stored secret) reports InvalidCode.
            let err = if header.is_none() {
                crate::mfa::MfaError::Missing
            } else {
                crate::mfa::MfaError::InvalidCode
            };
            return Err(mfa_error_to_s3(err));
        }
        mgr.set_bucket_state(&bucket, target_enabled);
    }
    // v0.5 #34: when a VersioningManager is attached, the new state is
    // stored in the manager and success is returned immediately — the
    // backend is NOT called on this path.
    // NOTE(review): this comment previously said the request is also
    // forwarded to the backend (with backend rejection treated as a
    // soft-fail); the code below does not forward. Confirm which
    // behaviour is intended.
    if let Some(mgr) = self.versioning.as_ref() {
        // A missing or unrecognised Status maps to Unversioned.
        let new_state = match req
            .input
            .versioning_configuration
            .status
            .as_ref()
            .map(|s| s.as_str())
        {
            Some(s) if s.eq_ignore_ascii_case("Enabled") => {
                crate::versioning::VersioningState::Enabled
            }
            Some(s) if s.eq_ignore_ascii_case("Suspended") => {
                crate::versioning::VersioningState::Suspended
            }
            _ => crate::versioning::VersioningState::Unversioned,
        };
        mgr.set_state(&req.input.bucket, new_state);
        return Ok(S3Response::new(PutBucketVersioningOutput::default()));
    }
    self.backend.put_bucket_versioning(req).await
}
5923
5924 // ---- Bucket location ----
/// Pure delegation: hands the request to `self.backend.get_bucket_location`
/// and returns its result as-is.
async fn get_bucket_location(
    &self,
    req: S3Request<GetBucketLocationInput>,
) -> S3Result<S3Response<GetBucketLocationOutput>> {
    self.backend.get_bucket_location(req).await
}
5931
5932 // ---- Bucket policy ----
/// Pure delegation: hands the request to `self.backend.get_bucket_policy`
/// and returns its result as-is.
async fn get_bucket_policy(
    &self,
    req: S3Request<GetBucketPolicyInput>,
) -> S3Result<S3Response<GetBucketPolicyOutput>> {
    self.backend.get_bucket_policy(req).await
}
/// Pure delegation: hands the request to `self.backend.put_bucket_policy`
/// and returns its result as-is.
async fn put_bucket_policy(
    &self,
    req: S3Request<PutBucketPolicyInput>,
) -> S3Result<S3Response<PutBucketPolicyOutput>> {
    self.backend.put_bucket_policy(req).await
}
/// Pure delegation: hands the request to `self.backend.delete_bucket_policy`
/// and returns its result as-is.
async fn delete_bucket_policy(
    &self,
    req: S3Request<DeleteBucketPolicyInput>,
) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
    self.backend.delete_bucket_policy(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.get_bucket_policy_status` and returns its result as-is.
async fn get_bucket_policy_status(
    &self,
    req: S3Request<GetBucketPolicyStatusInput>,
) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
    self.backend.get_bucket_policy_status(req).await
}
5957
5958 // ---- Bucket ACL ----
/// Pure delegation: hands the request to `self.backend.get_bucket_acl` and
/// returns its result as-is.
async fn get_bucket_acl(
    &self,
    req: S3Request<GetBucketAclInput>,
) -> S3Result<S3Response<GetBucketAclOutput>> {
    self.backend.get_bucket_acl(req).await
}
/// Pure delegation: hands the request to `self.backend.put_bucket_acl` and
/// returns its result as-is.
async fn put_bucket_acl(
    &self,
    req: S3Request<PutBucketAclInput>,
) -> S3Result<S3Response<PutBucketAclOutput>> {
    self.backend.put_bucket_acl(req).await
}
5971
5972 // ---- Bucket CORS (v0.6 #38) ----
5973 async fn get_bucket_cors(
5974 &self,
5975 req: S3Request<GetBucketCorsInput>,
5976 ) -> S3Result<S3Response<GetBucketCorsOutput>> {
5977 if let Some(mgr) = self.cors.as_ref() {
5978 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
5979 S3Error::with_message(
5980 S3ErrorCode::NoSuchCORSConfiguration,
5981 "The CORS configuration does not exist".to_string(),
5982 )
5983 })?;
5984 let rules: Vec<CORSRule> = cfg
5985 .rules
5986 .into_iter()
5987 .map(|r| CORSRule {
5988 allowed_headers: if r.allowed_headers.is_empty() {
5989 None
5990 } else {
5991 Some(r.allowed_headers)
5992 },
5993 allowed_methods: r.allowed_methods,
5994 allowed_origins: r.allowed_origins,
5995 expose_headers: if r.expose_headers.is_empty() {
5996 None
5997 } else {
5998 Some(r.expose_headers)
5999 },
6000 id: r.id,
6001 max_age_seconds: r.max_age_seconds.map(|s| s as i32),
6002 })
6003 .collect();
6004 return Ok(S3Response::new(GetBucketCorsOutput {
6005 cors_rules: Some(rules),
6006 }));
6007 }
6008 self.backend.get_bucket_cors(req).await
6009 }
6010 async fn put_bucket_cors(
6011 &self,
6012 req: S3Request<PutBucketCorsInput>,
6013 ) -> S3Result<S3Response<PutBucketCorsOutput>> {
6014 if let Some(mgr) = self.cors.as_ref() {
6015 let cfg = crate::cors::CorsConfig {
6016 rules: req
6017 .input
6018 .cors_configuration
6019 .cors_rules
6020 .into_iter()
6021 .map(|r| crate::cors::CorsRule {
6022 allowed_origins: r.allowed_origins,
6023 allowed_methods: r.allowed_methods,
6024 allowed_headers: r.allowed_headers.unwrap_or_default(),
6025 expose_headers: r.expose_headers.unwrap_or_default(),
6026 max_age_seconds: r
6027 .max_age_seconds
6028 .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
6029 id: r.id,
6030 })
6031 .collect(),
6032 };
6033 // v0.8.15 M-3: AWS S3 rejects `AllowedMethods` outside
6034 // the canonical {GET,PUT,POST,DELETE,HEAD} set (including
6035 // the `*` wildcard). Validate at PutBucketCors time so
6036 // operators see the misconfiguration in the API response
6037 // instead of having silently-broken preflights at the
6038 // browser later.
6039 if let Err(e) = crate::cors::CorsManager::validate(&cfg) {
6040 return Err(S3Error::with_message(
6041 S3ErrorCode::InvalidArgument,
6042 e.to_string(),
6043 ));
6044 }
6045 mgr.put(&req.input.bucket, cfg);
6046 return Ok(S3Response::new(PutBucketCorsOutput::default()));
6047 }
6048 self.backend.put_bucket_cors(req).await
6049 }
6050 async fn delete_bucket_cors(
6051 &self,
6052 req: S3Request<DeleteBucketCorsInput>,
6053 ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
6054 if let Some(mgr) = self.cors.as_ref() {
6055 mgr.delete(&req.input.bucket);
6056 return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
6057 }
6058 self.backend.delete_bucket_cors(req).await
6059 }
6060
6061 // ---- Bucket lifecycle (v0.6 #37) ----
6062 async fn get_bucket_lifecycle_configuration(
6063 &self,
6064 req: S3Request<GetBucketLifecycleConfigurationInput>,
6065 ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
6066 if let Some(mgr) = self.lifecycle.as_ref() {
6067 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6068 S3Error::with_message(
6069 S3ErrorCode::NoSuchLifecycleConfiguration,
6070 "The lifecycle configuration does not exist".to_string(),
6071 )
6072 })?;
6073 let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
6074 return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
6075 rules: Some(rules),
6076 transition_default_minimum_object_size: None,
6077 }));
6078 }
6079 self.backend.get_bucket_lifecycle_configuration(req).await
6080 }
6081 async fn put_bucket_lifecycle_configuration(
6082 &self,
6083 req: S3Request<PutBucketLifecycleConfigurationInput>,
6084 ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
6085 if let Some(mgr) = self.lifecycle.as_ref() {
6086 let bucket = req.input.bucket.clone();
6087 let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
6088 let cfg = dto_lifecycle_to_internal(&dto_cfg);
6089 mgr.put(&bucket, cfg);
6090 return Ok(S3Response::new(
6091 PutBucketLifecycleConfigurationOutput::default(),
6092 ));
6093 }
6094 self.backend.put_bucket_lifecycle_configuration(req).await
6095 }
6096 async fn delete_bucket_lifecycle(
6097 &self,
6098 req: S3Request<DeleteBucketLifecycleInput>,
6099 ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
6100 if let Some(mgr) = self.lifecycle.as_ref() {
6101 mgr.delete(&req.input.bucket);
6102 return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
6103 }
6104 self.backend.delete_bucket_lifecycle(req).await
6105 }
6106
6107 // ---- Bucket tagging (v0.6 #39) ----
6108 async fn get_bucket_tagging(
6109 &self,
6110 req: S3Request<GetBucketTaggingInput>,
6111 ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
6112 let Some(mgr) = self.tagging.as_ref() else {
6113 return self.backend.get_bucket_tagging(req).await;
6114 };
6115 let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
6116 Ok(S3Response::new(GetBucketTaggingOutput {
6117 tag_set: tagset_to_aws(&tags),
6118 }))
6119 }
6120 async fn put_bucket_tagging(
6121 &self,
6122 req: S3Request<PutBucketTaggingInput>,
6123 ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
6124 let Some(mgr) = self.tagging.as_ref() else {
6125 return self.backend.put_bucket_tagging(req).await;
6126 };
6127 let bucket = req.input.bucket.clone();
6128 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6129 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6130 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6131 mgr.put_bucket_tags(&bucket, parsed);
6132 Ok(S3Response::new(PutBucketTaggingOutput::default()))
6133 }
6134 async fn delete_bucket_tagging(
6135 &self,
6136 req: S3Request<DeleteBucketTaggingInput>,
6137 ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
6138 let Some(mgr) = self.tagging.as_ref() else {
6139 return self.backend.delete_bucket_tagging(req).await;
6140 };
6141 let bucket = req.input.bucket.clone();
6142 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6143 mgr.delete_bucket_tags(&bucket);
6144 Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
6145 }
6146
6147 // ---- Bucket encryption ----
/// Pure delegation: hands the request to
/// `self.backend.get_bucket_encryption` and returns its result as-is.
async fn get_bucket_encryption(
    &self,
    req: S3Request<GetBucketEncryptionInput>,
) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
    self.backend.get_bucket_encryption(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.put_bucket_encryption` and returns its result as-is.
async fn put_bucket_encryption(
    &self,
    req: S3Request<PutBucketEncryptionInput>,
) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
    self.backend.put_bucket_encryption(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.delete_bucket_encryption` and returns its result as-is.
async fn delete_bucket_encryption(
    &self,
    req: S3Request<DeleteBucketEncryptionInput>,
) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
    self.backend.delete_bucket_encryption(req).await
}
6166
6167 // ---- Bucket logging ----
/// Pure delegation: hands the request to `self.backend.get_bucket_logging`
/// and returns its result as-is.
async fn get_bucket_logging(
    &self,
    req: S3Request<GetBucketLoggingInput>,
) -> S3Result<S3Response<GetBucketLoggingOutput>> {
    self.backend.get_bucket_logging(req).await
}
/// Pure delegation: hands the request to `self.backend.put_bucket_logging`
/// and returns its result as-is.
async fn put_bucket_logging(
    &self,
    req: S3Request<PutBucketLoggingInput>,
) -> S3Result<S3Response<PutBucketLoggingOutput>> {
    self.backend.put_bucket_logging(req).await
}
6180
6181 // ---- Bucket notification (v0.6 #35) ----
6182 //
6183 // When a `NotificationManager` is attached, S4 itself owns per-bucket
6184 // notification configurations and the PUT / GET handlers route through
6185 // the manager. The wire DTO's queue / topic configurations map onto
6186 // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6187 // EventBridge configurations are accepted on PUT but silently dropped
6188 // (out of scope for v0.6 #35). When no manager is attached the legacy
6189 // backend-passthrough behaviour applies.
6190 async fn get_bucket_notification_configuration(
6191 &self,
6192 req: S3Request<GetBucketNotificationConfigurationInput>,
6193 ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6194 if let Some(mgr) = self.notifications.as_ref() {
6195 let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6196 let dto = notif_to_dto(&cfg);
6197 return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6198 event_bridge_configuration: dto.event_bridge_configuration,
6199 lambda_function_configurations: dto.lambda_function_configurations,
6200 queue_configurations: dto.queue_configurations,
6201 topic_configurations: dto.topic_configurations,
6202 }));
6203 }
6204 self.backend
6205 .get_bucket_notification_configuration(req)
6206 .await
6207 }
6208 async fn put_bucket_notification_configuration(
6209 &self,
6210 req: S3Request<PutBucketNotificationConfigurationInput>,
6211 ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6212 if let Some(mgr) = self.notifications.as_ref() {
6213 let cfg = notif_from_dto(&req.input.notification_configuration);
6214 mgr.put(&req.input.bucket, cfg);
6215 return Ok(S3Response::new(
6216 PutBucketNotificationConfigurationOutput::default(),
6217 ));
6218 }
6219 self.backend
6220 .put_bucket_notification_configuration(req)
6221 .await
6222 }
6223
6224 // ---- Bucket request payment ----
/// Pure delegation: hands the request to
/// `self.backend.get_bucket_request_payment` and returns its result as-is.
async fn get_bucket_request_payment(
    &self,
    req: S3Request<GetBucketRequestPaymentInput>,
) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
    self.backend.get_bucket_request_payment(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.put_bucket_request_payment` and returns its result as-is.
async fn put_bucket_request_payment(
    &self,
    req: S3Request<PutBucketRequestPaymentInput>,
) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
    self.backend.put_bucket_request_payment(req).await
}
6237
6238 // ---- Bucket website ----
/// Pure delegation: hands the request to `self.backend.get_bucket_website`
/// and returns its result as-is.
async fn get_bucket_website(
    &self,
    req: S3Request<GetBucketWebsiteInput>,
) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
    self.backend.get_bucket_website(req).await
}
/// Pure delegation: hands the request to `self.backend.put_bucket_website`
/// and returns its result as-is.
async fn put_bucket_website(
    &self,
    req: S3Request<PutBucketWebsiteInput>,
) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
    self.backend.put_bucket_website(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.delete_bucket_website` and returns its result as-is.
async fn delete_bucket_website(
    &self,
    req: S3Request<DeleteBucketWebsiteInput>,
) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
    self.backend.delete_bucket_website(req).await
}
6257
6258 // ---- Bucket replication (v0.6 #40) ----
6259 async fn get_bucket_replication(
6260 &self,
6261 req: S3Request<GetBucketReplicationInput>,
6262 ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6263 if let Some(mgr) = self.replication.as_ref() {
6264 return match mgr.get(&req.input.bucket) {
6265 Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6266 replication_configuration: Some(replication_to_dto(&cfg)),
6267 })),
6268 None => Err(S3Error::with_message(
6269 S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6270 format!(
6271 "no replication configuration on bucket {}",
6272 req.input.bucket
6273 ),
6274 )),
6275 };
6276 }
6277 self.backend.get_bucket_replication(req).await
6278 }
6279 async fn put_bucket_replication(
6280 &self,
6281 req: S3Request<PutBucketReplicationInput>,
6282 ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6283 if let Some(mgr) = self.replication.as_ref() {
6284 let cfg = replication_from_dto(&req.input.replication_configuration);
6285 mgr.put(&req.input.bucket, cfg);
6286 return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6287 }
6288 self.backend.put_bucket_replication(req).await
6289 }
6290 async fn delete_bucket_replication(
6291 &self,
6292 req: S3Request<DeleteBucketReplicationInput>,
6293 ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6294 if let Some(mgr) = self.replication.as_ref() {
6295 mgr.delete(&req.input.bucket);
6296 return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6297 }
6298 self.backend.delete_bucket_replication(req).await
6299 }
6300
6301 // ---- Bucket accelerate ----
/// Pure delegation: hands the request to
/// `self.backend.get_bucket_accelerate_configuration` and returns its
/// result as-is.
async fn get_bucket_accelerate_configuration(
    &self,
    req: S3Request<GetBucketAccelerateConfigurationInput>,
) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
    self.backend.get_bucket_accelerate_configuration(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.put_bucket_accelerate_configuration` and returns its
/// result as-is.
async fn put_bucket_accelerate_configuration(
    &self,
    req: S3Request<PutBucketAccelerateConfigurationInput>,
) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
    self.backend.put_bucket_accelerate_configuration(req).await
}
6314
6315 // ---- Bucket ownership controls ----
/// Pure delegation: hands the request to
/// `self.backend.get_bucket_ownership_controls` and returns its result
/// as-is.
async fn get_bucket_ownership_controls(
    &self,
    req: S3Request<GetBucketOwnershipControlsInput>,
) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
    self.backend.get_bucket_ownership_controls(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.put_bucket_ownership_controls` and returns its result
/// as-is.
async fn put_bucket_ownership_controls(
    &self,
    req: S3Request<PutBucketOwnershipControlsInput>,
) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
    self.backend.put_bucket_ownership_controls(req).await
}
/// Pure delegation: hands the request to
/// `self.backend.delete_bucket_ownership_controls` and returns its result
/// as-is.
async fn delete_bucket_ownership_controls(
    &self,
    req: S3Request<DeleteBucketOwnershipControlsInput>,
) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
    self.backend.delete_bucket_ownership_controls(req).await
}
6334
6335 // ---- Public access block ----
6336 async fn get_public_access_block(
6337 &self,
6338 req: S3Request<GetPublicAccessBlockInput>,
6339 ) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
6340 self.backend.get_public_access_block(req).await
6341 }
6342 async fn put_public_access_block(
6343 &self,
6344 req: S3Request<PutPublicAccessBlockInput>,
6345 ) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
6346 self.backend.put_public_access_block(req).await
6347 }
6348 async fn delete_public_access_block(
6349 &self,
6350 req: S3Request<DeletePublicAccessBlockInput>,
6351 ) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
6352 self.backend.delete_public_access_block(req).await
6353 }
6354
6355 // ====================================================================
6356 // v0.6 #41: S3 Select — server-side SQL filter on object body.
6357 //
6358 // Fetch the object via the regular `get_object` path (so SSE-C /
6359 // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
6360 // run a small SQL subset (CSV + JSON Lines, equality / inequality /
6361 // LIKE / AND / OR / NOT) over the in-memory body, and stream the
6362 // matched rows back as AWS event-stream `Records` + `Stats` + `End`
6363 // frames.
6364 //
6365 // Limitations (deliberate, documented):
6366 // - Parquet input is rejected with NotImplemented.
6367 // - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
6368 // parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
6369 // domain-specific `InvalidSqlExpression` code).
6370 // - The body is fully buffered before SQL evaluation (S3 Select
6371 // streaming-during-evaluation is v0.7 scope).
6372 // - GPU-accelerated WHERE evaluation is stubbed out (always None).
6373 async fn select_object_content(
6374 &self,
6375 req: S3Request<SelectObjectContentInput>,
6376 ) -> S3Result<S3Response<SelectObjectContentOutput>> {
6377 use crate::select::{
6378 EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
6379 run_select_jsonlines,
6380 };
6381
6382 let select_bucket = req.input.bucket.clone();
6383 let select_key = req.input.key.clone();
6384 self.enforce_rate_limit(&req, &select_bucket)?;
6385 self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
6386
6387 let request = req.input.request;
6388 let sql = request.expression.clone();
6389 if request.expression_type.as_str() != "SQL" {
6390 return Err(S3Error::with_message(
6391 S3ErrorCode::InvalidExpressionType,
6392 format!(
6393 "ExpressionType must be SQL, got: {}",
6394 request.expression_type.as_str()
6395 ),
6396 ));
6397 }
6398
6399 let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
6400 SelectInputFormat::JsonLines
6401 } else if let Some(csv) = request.input_serialization.csv.as_ref() {
6402 let has_header = csv
6403 .file_header_info
6404 .as_ref()
6405 .map(|h| {
6406 let s = h.as_str();
6407 s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
6408 })
6409 .unwrap_or(false);
6410 let delim = csv
6411 .field_delimiter
6412 .as_deref()
6413 .and_then(|s| s.chars().next())
6414 .unwrap_or(',');
6415 SelectInputFormat::Csv {
6416 has_header,
6417 delimiter: delim,
6418 }
6419 } else if request.input_serialization.parquet.is_some() {
6420 return Err(S3Error::with_message(
6421 S3ErrorCode::NotImplemented,
6422 "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
6423 ));
6424 } else {
6425 return Err(S3Error::with_message(
6426 S3ErrorCode::InvalidRequest,
6427 "InputSerialization requires exactly one of CSV / JSON / Parquet",
6428 ));
6429 };
6430 if let Some(ct) = request.input_serialization.compression_type.as_ref()
6431 && !ct.as_str().eq_ignore_ascii_case("NONE")
6432 {
6433 return Err(S3Error::with_message(
6434 S3ErrorCode::NotImplemented,
6435 format!(
6436 "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
6437 ct.as_str()
6438 ),
6439 ));
6440 }
6441
6442 let output_format = if request.output_serialization.json.is_some() {
6443 SelectOutputFormat::Json
6444 } else if request.output_serialization.csv.is_some() {
6445 SelectOutputFormat::Csv
6446 } else {
6447 return Err(S3Error::with_message(
6448 S3ErrorCode::InvalidRequest,
6449 "OutputSerialization requires exactly one of CSV / JSON",
6450 ));
6451 };
6452
6453 let get_input = GetObjectInput {
6454 bucket: select_bucket.clone(),
6455 key: select_key.clone(),
6456 sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
6457 sse_customer_key: req.input.sse_customer_key.clone(),
6458 sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
6459 ..Default::default()
6460 };
6461 let get_req = S3Request {
6462 input: get_input,
6463 method: http::Method::GET,
6464 uri: format!("/{}/{}", select_bucket, select_key)
6465 .parse()
6466 .map_err(|e| {
6467 S3Error::with_message(
6468 S3ErrorCode::InternalError,
6469 format!("constructing inner GET URI: {e}"),
6470 )
6471 })?,
6472 headers: http::HeaderMap::new(),
6473 extensions: http::Extensions::new(),
6474 credentials: req.credentials.clone(),
6475 region: req.region.clone(),
6476 service: req.service.clone(),
6477 trailing_headers: None,
6478 };
6479 let mut get_resp = self.get_object(get_req).await?;
6480 let blob = get_resp.output.body.take().ok_or_else(|| {
6481 S3Error::with_message(
6482 S3ErrorCode::InternalError,
6483 "Select: object body was empty after GET",
6484 )
6485 })?;
6486 let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
6487 .await
6488 .map_err(internal("collect Select body"))?;
6489 let scanned = body_bytes.len() as u64;
6490
6491 let matched_payload = match input_format {
6492 SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
6493 .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
6494 SelectInputFormat::Csv { .. } => {
6495 run_select_csv(&sql, &body_bytes, input_format, output_format)
6496 .map_err(|e| select_error_to_s3(e, "CSV"))?
6497 }
6498 };
6499
6500 let returned = matched_payload.len() as u64;
6501 let processed = scanned;
6502 let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
6503 if !matched_payload.is_empty() {
6504 events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
6505 payload: Some(bytes::Bytes::from(matched_payload)),
6506 })));
6507 }
6508 events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
6509 details: Some(Stats {
6510 bytes_scanned: Some(scanned as i64),
6511 bytes_processed: Some(processed as i64),
6512 bytes_returned: Some(returned as i64),
6513 }),
6514 })));
6515 events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
6516 // Touch EventStreamWriter so the public API stays linked into the
6517 // build (the actual wire framing is delegated to s3s).
6518 let _writer = EventStreamWriter::new();
6519
6520 let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
6521 let output = SelectObjectContentOutput {
6522 payload: Some(stream),
6523 };
6524 Ok(S3Response::new(output))
6525 }
6526
6527 // ---- Bucket Inventory configuration (v0.6 #36) ----
6528 //
6529 // When an `InventoryManager` is attached, S4-server owns the
6530 // configuration store and these handlers no longer pass through to
6531 // the backend. The mapping between the s3s-typed
6532 // `InventoryConfiguration` and the inventory module's internal
6533 // `InventoryConfig` is intentionally lossy: only the fields S4
6534 // actually uses for periodic CSV emission survive the round trip
6535 // (id, source bucket, destination bucket / prefix, format, included
6536 // versions, schedule frequency). Optional fields, encryption, and
6537 // filter prefixes are accepted on PUT and re-surfaced on GET via
6538 // a best-effort default-shape `InventoryConfiguration` so the
6539 // client sees a roundtrip-clean response.
6540 async fn put_bucket_inventory_configuration(
6541 &self,
6542 req: S3Request<PutBucketInventoryConfigurationInput>,
6543 ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
6544 if let Some(mgr) = self.inventory.as_ref() {
6545 let cfg = inv_from_dto(
6546 &req.input.bucket,
6547 &req.input.id,
6548 &req.input.inventory_configuration,
6549 );
6550 mgr.put(cfg);
6551 return Ok(S3Response::new(
6552 PutBucketInventoryConfigurationOutput::default(),
6553 ));
6554 }
6555 self.backend.put_bucket_inventory_configuration(req).await
6556 }
6557
6558 async fn get_bucket_inventory_configuration(
6559 &self,
6560 req: S3Request<GetBucketInventoryConfigurationInput>,
6561 ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
6562 if let Some(mgr) = self.inventory.as_ref() {
6563 let cfg = mgr.get(&req.input.bucket, &req.input.id);
6564 if let Some(cfg) = cfg {
6565 let out = GetBucketInventoryConfigurationOutput {
6566 inventory_configuration: Some(inv_to_dto(&cfg)),
6567 };
6568 return Ok(S3Response::new(out));
6569 }
6570 // AWS returns `NoSuchConfiguration` (404) when the id has no
6571 // matching inventory configuration on the bucket. The
6572 // generated `S3ErrorCode` enum doesn't expose a typed variant
6573 // for this code, so we round-trip through `from_bytes` which
6574 // wraps unknown codes as `Custom(...)` (= the AWS-canonical
6575 // error-code string survives into the XML response envelope).
6576 let code =
6577 S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
6578 return Err(S3Error::with_message(
6579 code,
6580 format!(
6581 "no inventory configuration with id={} on bucket={}",
6582 req.input.id, req.input.bucket
6583 ),
6584 ));
6585 }
6586 self.backend.get_bucket_inventory_configuration(req).await
6587 }
6588
6589 async fn list_bucket_inventory_configurations(
6590 &self,
6591 req: S3Request<ListBucketInventoryConfigurationsInput>,
6592 ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
6593 if let Some(mgr) = self.inventory.as_ref() {
6594 let list = mgr.list_for_bucket(&req.input.bucket);
6595 let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
6596 let out = ListBucketInventoryConfigurationsOutput {
6597 continuation_token: req.input.continuation_token.clone(),
6598 inventory_configuration_list: if dto_list.is_empty() {
6599 None
6600 } else {
6601 Some(dto_list)
6602 },
6603 is_truncated: Some(false),
6604 next_continuation_token: None,
6605 };
6606 return Ok(S3Response::new(out));
6607 }
6608 self.backend.list_bucket_inventory_configurations(req).await
6609 }
6610
6611 async fn delete_bucket_inventory_configuration(
6612 &self,
6613 req: S3Request<DeleteBucketInventoryConfigurationInput>,
6614 ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
6615 if let Some(mgr) = self.inventory.as_ref() {
6616 mgr.delete(&req.input.bucket, &req.input.id);
6617 return Ok(S3Response::new(
6618 DeleteBucketInventoryConfigurationOutput::default(),
6619 ));
6620 }
6621 self.backend
6622 .delete_bucket_inventory_configuration(req)
6623 .await
6624 }
6625}
6626
6627// ---------------------------------------------------------------------------
6628// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
6629// surface) and our internal `crate::inventory::InventoryConfig`. Only the
6630// fields S4 actually uses for CSV emission survive the round trip; the
6631// missing fields (filter prefix, optional fields, encryption) are dropped on
6632// PUT and re-rendered as the AWS-default shape on GET so the client sees a
6633// well-formed `InventoryConfiguration`.
6634// ---------------------------------------------------------------------------
6635
6636fn inv_from_dto(
6637 bucket: &str,
6638 id: &str,
6639 dto: &InventoryConfiguration,
6640) -> crate::inventory::InventoryConfig {
6641 let frequency_hours = match dto.schedule.frequency.as_str() {
6642 "Weekly" => 24 * 7,
6643 // Daily is the default; anything S4 doesn't recognise (incl.
6644 // empty, which is the s3s-default) maps to Daily so the
6645 // operator's PUT doesn't silently turn into a no-op cadence.
6646 _ => 24,
6647 };
6648 // Parquet/ORC are not supported (issue #36 scope); we still accept
6649 // the PUT so callers don't fail-loud, but we record CSV and rely on
6650 // the operator catching the discrepancy on GET.
6651 let format = crate::inventory::InventoryFormat::Csv;
6652 crate::inventory::InventoryConfig {
6653 id: id.to_owned(),
6654 bucket: bucket.to_owned(),
6655 destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
6656 destination_prefix: dto
6657 .destination
6658 .s3_bucket_destination
6659 .prefix
6660 .clone()
6661 .unwrap_or_default(),
6662 frequency_hours,
6663 format,
6664 included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
6665 dto.included_object_versions.as_str(),
6666 ),
6667 }
6668}
6669
6670fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
6671 InventoryConfiguration {
6672 id: cfg.id.clone(),
6673 is_enabled: true,
6674 included_object_versions: InventoryIncludedObjectVersions::from(
6675 cfg.included_object_versions.as_aws_str().to_owned(),
6676 ),
6677 destination: InventoryDestination {
6678 s3_bucket_destination: InventoryS3BucketDestination {
6679 account_id: None,
6680 bucket: cfg.destination_bucket.clone(),
6681 encryption: None,
6682 format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
6683 prefix: if cfg.destination_prefix.is_empty() {
6684 None
6685 } else {
6686 Some(cfg.destination_prefix.clone())
6687 },
6688 },
6689 },
6690 schedule: InventorySchedule {
6691 // `frequency_hours == 168` -> Weekly; everything else maps to
6692 // Daily for the wire response (the manager keeps the precise
6693 // hour count internally for due-checking).
6694 frequency: InventoryFrequency::from(
6695 if cfg.frequency_hours == 24 * 7 {
6696 "Weekly"
6697 } else {
6698 "Daily"
6699 }
6700 .to_owned(),
6701 ),
6702 },
6703 filter: None,
6704 optional_fields: None,
6705 }
6706}
6707
6708// ---------------------------------------------------------------------------
6709// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
6710// wire surface) and our internal `crate::notifications::NotificationConfig`.
6711//
6712// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
6713// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
6714// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
6715// surfaces topic / queue rules.
6716//
6717// The webhook destination has no AWS-native wire form: operators configure
6718// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
6719// poking `NotificationManager::put` directly from a custom binary. This
6720// keeps the wire surface AWS-compatible while still letting the always-
6721// available `Webhook` destination be reachable.
6722// ---------------------------------------------------------------------------
6723
6724fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
6725 let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
6726 if let Some(topics) = dto.topic_configurations.as_ref() {
6727 for (idx, t) in topics.iter().enumerate() {
6728 let events = events_from_dto(&t.events);
6729 let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
6730 rules.push(crate::notifications::NotificationRule {
6731 id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
6732 events,
6733 destination: crate::notifications::Destination::Sns {
6734 topic_arn: t.topic_arn.clone(),
6735 },
6736 filter_prefix: prefix,
6737 filter_suffix: suffix,
6738 });
6739 }
6740 }
6741 if let Some(queues) = dto.queue_configurations.as_ref() {
6742 for (idx, q) in queues.iter().enumerate() {
6743 let events = events_from_dto(&q.events);
6744 let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
6745 rules.push(crate::notifications::NotificationRule {
6746 id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
6747 events,
6748 destination: crate::notifications::Destination::Sqs {
6749 queue_arn: q.queue_arn.clone(),
6750 },
6751 filter_prefix: prefix,
6752 filter_suffix: suffix,
6753 });
6754 }
6755 }
6756 crate::notifications::NotificationConfig { rules }
6757}
6758
6759fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
6760 let mut topics: Vec<TopicConfiguration> = Vec::new();
6761 let mut queues: Vec<QueueConfiguration> = Vec::new();
6762 for rule in &cfg.rules {
6763 let events: Vec<Event> = rule
6764 .events
6765 .iter()
6766 .map(|e| Event::from(e.as_aws_str().to_owned()))
6767 .collect();
6768 let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
6769 match &rule.destination {
6770 crate::notifications::Destination::Sns { topic_arn } => {
6771 topics.push(TopicConfiguration {
6772 events,
6773 filter,
6774 id: Some(rule.id.clone()),
6775 topic_arn: topic_arn.clone(),
6776 });
6777 }
6778 crate::notifications::Destination::Sqs { queue_arn } => {
6779 queues.push(QueueConfiguration {
6780 events,
6781 filter,
6782 id: Some(rule.id.clone()),
6783 queue_arn: queue_arn.clone(),
6784 });
6785 }
6786 // Webhook destinations have no AWS wire equivalent — they
6787 // round-trip through the JSON snapshot only. Skip them on the
6788 // GET surface (an SDK consumer wouldn't know what to do with
6789 // them anyway).
6790 crate::notifications::Destination::Webhook { .. } => {}
6791 }
6792 }
6793 NotificationConfiguration {
6794 event_bridge_configuration: None,
6795 lambda_function_configurations: None,
6796 queue_configurations: if queues.is_empty() {
6797 None
6798 } else {
6799 Some(queues)
6800 },
6801 topic_configurations: if topics.is_empty() {
6802 None
6803 } else {
6804 Some(topics)
6805 },
6806 }
6807}
6808
6809fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
6810 events
6811 .iter()
6812 .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
6813 .collect()
6814}
6815
6816fn filter_from_dto(
6817 f: Option<&NotificationConfigurationFilter>,
6818) -> (Option<String>, Option<String>) {
6819 let Some(f) = f else {
6820 return (None, None);
6821 };
6822 let Some(key) = f.key.as_ref() else {
6823 return (None, None);
6824 };
6825 let Some(rules) = key.filter_rules.as_ref() else {
6826 return (None, None);
6827 };
6828 let mut prefix = None;
6829 let mut suffix = None;
6830 for r in rules {
6831 let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
6832 let value = r.value.clone();
6833 match name.as_deref() {
6834 Some("prefix") => prefix = value,
6835 Some("suffix") => suffix = value,
6836 _ => {}
6837 }
6838 }
6839 (prefix, suffix)
6840}
6841
6842fn filter_to_dto(
6843 prefix: Option<&str>,
6844 suffix: Option<&str>,
6845) -> Option<NotificationConfigurationFilter> {
6846 if prefix.is_none() && suffix.is_none() {
6847 return None;
6848 }
6849 let mut rules: Vec<FilterRule> = Vec::new();
6850 if let Some(p) = prefix {
6851 rules.push(FilterRule {
6852 name: Some(FilterRuleName::from("prefix".to_owned())),
6853 value: Some(p.to_owned()),
6854 });
6855 }
6856 if let Some(s) = suffix {
6857 rules.push(FilterRule {
6858 name: Some(FilterRuleName::from("suffix".to_owned())),
6859 value: Some(s.to_owned()),
6860 });
6861 }
6862 Some(NotificationConfigurationFilter {
6863 key: Some(S3KeyFilter {
6864 filter_rules: Some(rules),
6865 }),
6866 })
6867}
6868
6869// ---------------------------------------------------------------------------
6870// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
6871// wire surface) and our internal `crate::replication::ReplicationConfig`.
6872// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
6873// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
6874// the matcher needs. Sub-blocks v0.6 #40 does not implement
6875// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
6876// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
6877// who set them on PUT see them silently dropped, mirroring "feature not
6878// supported in this release" semantics.
6879// ---------------------------------------------------------------------------
6880
6881fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
6882 let rules = dto
6883 .rules
6884 .iter()
6885 .enumerate()
6886 .map(|(idx, r)| {
6887 let id =
6888 r.id.as_ref()
6889 .map(|s| s.as_str().to_owned())
6890 .unwrap_or_else(|| format!("rule-{idx}"));
6891 let priority = r.priority.unwrap_or(0).max(0) as u32;
6892 let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
6893 let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
6894 let destination_bucket = r.destination.bucket.clone();
6895 let destination_storage_class = r
6896 .destination
6897 .storage_class
6898 .as_ref()
6899 .map(|s| s.as_str().to_owned());
6900 crate::replication::ReplicationRule {
6901 id,
6902 priority,
6903 status_enabled,
6904 filter,
6905 destination_bucket,
6906 destination_storage_class,
6907 }
6908 })
6909 .collect();
6910 crate::replication::ReplicationConfig {
6911 role: dto.role.clone(),
6912 rules,
6913 }
6914}
6915
6916fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
6917 let rules = cfg
6918 .rules
6919 .iter()
6920 .map(|r| {
6921 let status = if r.status_enabled {
6922 ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
6923 } else {
6924 ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
6925 };
6926 let destination = Destination {
6927 access_control_translation: None,
6928 account: None,
6929 bucket: r.destination_bucket.clone(),
6930 encryption_configuration: None,
6931 metrics: None,
6932 replication_time: None,
6933 storage_class: r
6934 .destination_storage_class
6935 .as_ref()
6936 .map(|s| StorageClass::from(s.clone())),
6937 };
6938 let filter = Some(replication_filter_to_dto(&r.filter));
6939 ReplicationRule {
6940 delete_marker_replication: None,
6941 destination,
6942 existing_object_replication: None,
6943 filter,
6944 id: Some(r.id.clone()),
6945 prefix: None,
6946 priority: Some(r.priority as i32),
6947 source_selection_criteria: None,
6948 status,
6949 }
6950 })
6951 .collect();
6952 ReplicationConfiguration {
6953 role: cfg.role.clone(),
6954 rules,
6955 }
6956}
6957
6958fn replication_filter_from_dto(
6959 f: Option<&ReplicationRuleFilter>,
6960 rule_level_prefix: Option<&str>,
6961) -> crate::replication::ReplicationFilter {
6962 let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
6963 let mut tags: Vec<(String, String)> = Vec::new();
6964 if let Some(f) = f {
6965 if let Some(p) = f.prefix.as_ref()
6966 && prefix.is_none()
6967 {
6968 prefix = Some(p.clone());
6969 }
6970 if let Some(t) = f.tag.as_ref()
6971 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
6972 {
6973 tags.push((k.clone(), v.clone()));
6974 }
6975 if let Some(and) = f.and.as_ref() {
6976 if let Some(p) = and.prefix.as_ref()
6977 && prefix.is_none()
6978 {
6979 prefix = Some(p.clone());
6980 }
6981 if let Some(ts) = and.tags.as_ref() {
6982 for t in ts {
6983 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
6984 tags.push((k.clone(), v.clone()));
6985 }
6986 }
6987 }
6988 }
6989 }
6990 crate::replication::ReplicationFilter { prefix, tags }
6991}
6992
6993fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
6994 if f.tags.is_empty() {
6995 ReplicationRuleFilter {
6996 and: None,
6997 prefix: f.prefix.clone(),
6998 tag: None,
6999 }
7000 } else if f.tags.len() == 1 && f.prefix.is_none() {
7001 let (k, v) = &f.tags[0];
7002 ReplicationRuleFilter {
7003 and: None,
7004 prefix: None,
7005 tag: Some(Tag {
7006 key: Some(k.clone()),
7007 value: Some(v.clone()),
7008 }),
7009 }
7010 } else {
7011 let tags: Vec<Tag> = f
7012 .tags
7013 .iter()
7014 .map(|(k, v)| Tag {
7015 key: Some(k.clone()),
7016 value: Some(v.clone()),
7017 })
7018 .collect();
7019 ReplicationRuleFilter {
7020 and: Some(ReplicationRuleAndOperator {
7021 prefix: f.prefix.clone(),
7022 tags: Some(tags),
7023 }),
7024 prefix: None,
7025 tag: None,
7026 }
7027 }
7028}
7029
7030// ---------------------------------------------------------------------------
// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
// The internal representation flattens AWS's "Filter | And" disjunction
// into a single `LifecycleFilter` struct of optional fields plus a tag
// vector. Fields S4's evaluator does not consume
// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
// `transition_default_minimum_object_size`, `newer_noncurrent_versions` on
// the noncurrent expiration, and date-based transitions) are dropped on PUT
// and re-rendered as their AWS-default shape on GET so the client always
// sees a well-formed configuration.
// ---------------------------------------------------------------------------
7042
7043fn dto_lifecycle_to_internal(
7044 dto: &BucketLifecycleConfiguration,
7045) -> crate::lifecycle::LifecycleConfig {
7046 crate::lifecycle::LifecycleConfig {
7047 rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
7048 }
7049}
7050
7051fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
7052 let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
7053 let filter = rule
7054 .filter
7055 .as_ref()
7056 .map(dto_filter_to_internal)
7057 .unwrap_or_default();
7058 let expiration_days = rule
7059 .expiration
7060 .as_ref()
7061 .and_then(|e| e.days)
7062 .and_then(|d| u32::try_from(d).ok());
7063 let expiration_date = rule
7064 .expiration
7065 .as_ref()
7066 .and_then(|e| e.date.as_ref())
7067 .and_then(timestamp_to_chrono_utc);
7068 let transitions: Vec<crate::lifecycle::TransitionRule> = rule
7069 .transitions
7070 .as_ref()
7071 .map(|ts| {
7072 ts.iter()
7073 .filter_map(|t| {
7074 let days = u32::try_from(t.days?).ok()?;
7075 let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
7076 Some(crate::lifecycle::TransitionRule {
7077 days,
7078 storage_class,
7079 })
7080 })
7081 .collect()
7082 })
7083 .unwrap_or_default();
7084 let noncurrent_version_expiration_days = rule
7085 .noncurrent_version_expiration
7086 .as_ref()
7087 .and_then(|n| n.noncurrent_days)
7088 .and_then(|d| u32::try_from(d).ok());
7089 let abort_incomplete_multipart_upload_days = rule
7090 .abort_incomplete_multipart_upload
7091 .as_ref()
7092 .and_then(|a| a.days_after_initiation)
7093 .and_then(|d| u32::try_from(d).ok());
7094 crate::lifecycle::LifecycleRule {
7095 id: rule.id.clone().unwrap_or_default(),
7096 status,
7097 filter,
7098 expiration_days,
7099 expiration_date,
7100 transitions,
7101 noncurrent_version_expiration_days,
7102 abort_incomplete_multipart_upload_days,
7103 }
7104}
7105
7106fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
7107 let mut prefix = filter.prefix.clone();
7108 let mut tags: Vec<(String, String)> = Vec::new();
7109 let mut size_gt: Option<u64> = filter
7110 .object_size_greater_than
7111 .and_then(|n| u64::try_from(n).ok());
7112 let mut size_lt: Option<u64> = filter
7113 .object_size_less_than
7114 .and_then(|n| u64::try_from(n).ok());
7115 if let Some(t) = &filter.tag
7116 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7117 {
7118 tags.push((k.clone(), v.clone()));
7119 }
7120 if let Some(and) = &filter.and {
7121 if prefix.is_none() {
7122 prefix = and.prefix.clone();
7123 }
7124 if size_gt.is_none() {
7125 size_gt = and
7126 .object_size_greater_than
7127 .and_then(|n| u64::try_from(n).ok());
7128 }
7129 if size_lt.is_none() {
7130 size_lt = and
7131 .object_size_less_than
7132 .and_then(|n| u64::try_from(n).ok());
7133 }
7134 if let Some(ts) = &and.tags {
7135 for t in ts {
7136 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7137 tags.push((k.clone(), v.clone()));
7138 }
7139 }
7140 }
7141 }
7142 crate::lifecycle::LifecycleFilter {
7143 prefix,
7144 tags,
7145 object_size_greater_than: size_gt,
7146 object_size_less_than: size_lt,
7147 }
7148}
7149
7150fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
7151 let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
7152 Some(LifecycleExpiration {
7153 date: rule.expiration_date.map(chrono_utc_to_timestamp),
7154 days: rule.expiration_days.map(|d| d as i32),
7155 expired_object_delete_marker: None,
7156 })
7157 } else {
7158 None
7159 };
7160 let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
7161 None
7162 } else {
7163 Some(
7164 rule.transitions
7165 .iter()
7166 .map(|t| Transition {
7167 date: None,
7168 days: Some(t.days as i32),
7169 storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
7170 })
7171 .collect(),
7172 )
7173 };
7174 let noncurrent_version_expiration =
7175 rule.noncurrent_version_expiration_days
7176 .map(|d| NoncurrentVersionExpiration {
7177 newer_noncurrent_versions: None,
7178 noncurrent_days: Some(d as i32),
7179 });
7180 let abort_incomplete_multipart_upload =
7181 rule.abort_incomplete_multipart_upload_days
7182 .map(|d| AbortIncompleteMultipartUpload {
7183 days_after_initiation: Some(d as i32),
7184 });
7185 let filter = if rule.filter.tags.is_empty()
7186 && rule.filter.object_size_greater_than.is_none()
7187 && rule.filter.object_size_less_than.is_none()
7188 {
7189 rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7190 and: None,
7191 object_size_greater_than: None,
7192 object_size_less_than: None,
7193 prefix: Some(p.clone()),
7194 tag: None,
7195 })
7196 } else if rule.filter.tags.len() == 1
7197 && rule.filter.prefix.is_none()
7198 && rule.filter.object_size_greater_than.is_none()
7199 && rule.filter.object_size_less_than.is_none()
7200 {
7201 let (k, v) = rule.filter.tags[0].clone();
7202 Some(LifecycleRuleFilter {
7203 and: None,
7204 object_size_greater_than: None,
7205 object_size_less_than: None,
7206 prefix: None,
7207 tag: Some(Tag {
7208 key: Some(k),
7209 value: Some(v),
7210 }),
7211 })
7212 } else {
7213 let tags = if rule.filter.tags.is_empty() {
7214 None
7215 } else {
7216 Some(
7217 rule.filter
7218 .tags
7219 .iter()
7220 .map(|(k, v)| Tag {
7221 key: Some(k.clone()),
7222 value: Some(v.clone()),
7223 })
7224 .collect(),
7225 )
7226 };
7227 Some(LifecycleRuleFilter {
7228 and: Some(LifecycleRuleAndOperator {
7229 object_size_greater_than: rule
7230 .filter
7231 .object_size_greater_than
7232 .and_then(|n| i64::try_from(n).ok()),
7233 object_size_less_than: rule
7234 .filter
7235 .object_size_less_than
7236 .and_then(|n| i64::try_from(n).ok()),
7237 prefix: rule.filter.prefix.clone(),
7238 tags,
7239 }),
7240 object_size_greater_than: None,
7241 object_size_less_than: None,
7242 prefix: None,
7243 tag: None,
7244 })
7245 };
7246 LifecycleRule {
7247 abort_incomplete_multipart_upload,
7248 expiration,
7249 filter,
7250 id: if rule.id.is_empty() {
7251 None
7252 } else {
7253 Some(rule.id.clone())
7254 },
7255 noncurrent_version_expiration,
7256 noncurrent_version_transitions: None,
7257 prefix: None,
7258 status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7259 transitions,
7260 }
7261}
7262
7263// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7264// `chrono_utc_to_timestamp` are defined earlier in this file for the
7265// tagging/notifications work; the lifecycle DTO converters reuse them.)
7266
7267// ---------------------------------------------------------------------------
7268// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7269//
7270// Kept as a self-contained block at the bottom of the file so it doesn't
7271// touch the existing `S4Service` struct, `new()`, or any of the per-op
7272// handlers above. The hook is wired in by the binary at server-build time
7273// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7274//
7275// Lifecycle:
7276// 1. `SigV4aGate::new(store)` is constructed once at boot from the
7277// operator-supplied credential directory.
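//   2. For each incoming request, `SigV4aGate::pre_route(&req,
//      &requested_region, &canonical_request_bytes)` is invoked BEFORE
//      the request hits the S3 framework. If the request claims SigV4a
//      and verifies, control returns to the framework. Otherwise the
//      gate returns a `SigV4aGateError` whose `http_status()` /
//      `s3_error_code()` describe the rejection: 403
//      `SignatureDoesNotMatch` in the common case, but also 403
//      `InvalidAccessKeyId` / `RequestTimeTooSkewed` and 400
//      `InvalidRequest`.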
7283// 3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7284// ---------------------------------------------------------------------------
7285
7286/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
7287///
7288/// Wraps a [`crate::sigv4a::SigV4aCredentialStore`] and exposes a single
7289/// `pre_route` entry point that returns `Ok(())` for both
7290/// "request is plain SigV4 — pass through" and "request is SigV4a and
7291/// verified", and an `Err(...)` containing a 403-equivalent diagnostic
7292/// otherwise. Cheap to clone (the inner store is `Arc`-backed).
7293///
7294/// v0.8.4 #76 (audit H-6): the gate now enforces an `x-amz-date`
7295/// freshness window (default 15 min, AWS-spec) and a strict credential
7296/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`), shutting the
7297/// captured-request replay vector — previously a stolen valid SigV4a
7298/// signature could be replayed indefinitely (including DELETE).
7299#[derive(Debug, Clone)]
7300pub struct SigV4aGate {
7301 store: crate::sigv4a::SharedSigV4aCredentialStore,
7302 /// v0.8.4 #76: how far the request's `x-amz-date` may drift from
7303 /// the server's clock before being rejected with 403
7304 /// `RequestTimeTooSkewed`. Matches the AWS S3 spec default of
7305 /// 15 min when constructed via [`SigV4aGate::new`]; the operator
7306 /// can override via [`SigV4aGate::with_skew_tolerance`] (CLI flag
7307 /// `--sigv4a-skew-tolerance-seconds`).
7308 skew_tolerance: chrono::Duration,
7309}
7310
7311impl SigV4aGate {
7312 /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7313 pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
7314
7315 #[must_use]
7316 pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
7317 Self {
7318 store,
7319 skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
7320 }
7321 }
7322
7323 /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
7324 /// 15 min). Operators can widen this for high-clock-drift
7325 /// environments or tighten it for compliance regimes that demand
7326 /// stricter freshness.
7327 #[must_use]
7328 pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
7329 self.skew_tolerance = skew;
7330 self
7331 }
7332
7333 /// Read the configured skew tolerance — exposed mostly for test +
7334 /// observability use.
7335 #[must_use]
7336 pub fn skew_tolerance(&self) -> chrono::Duration {
7337 self.skew_tolerance
7338 }
7339
7340 /// Inspect an incoming HTTP request. Behaviour:
7341 ///
7342 /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
7343 /// prefix) → returns `Ok(())`; the framework's existing SigV4
7344 /// path handles the request.
7345 /// - SigV4a + valid signature + region match + fresh x-amz-date
7346 /// → `Ok(())`.
7347 /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
7348 /// - SigV4a + bad signature / region mismatch → `Err` with
7349 /// `SignatureDoesNotMatch`.
7350 /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
7351 /// the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
7352 /// et al.).
7353 ///
7354 /// `canonical_request_bytes` is the SigV4a string-to-sign (or
7355 /// canonical-request bytes; the caller decides) that the framework
7356 /// has already produced for this request. Keeping it as a parameter
7357 /// instead of rebuilding it inside the hook avoids duplicating the
7358 /// canonicalisation logic.
7359 pub fn pre_route<B>(
7360 &self,
7361 req: &http::Request<B>,
7362 requested_region: &str,
7363 canonical_request_bytes: &[u8],
7364 ) -> Result<(), SigV4aGateError> {
7365 self.pre_route_at(
7366 req,
7367 requested_region,
7368 canonical_request_bytes,
7369 chrono::Utc::now(),
7370 )
7371 }
7372
7373 /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
7374 /// tests that need to pin the freshness clock. Production callers
7375 /// use `pre_route` (which calls `chrono::Utc::now()`).
7376 pub fn pre_route_at<B>(
7377 &self,
7378 req: &http::Request<B>,
7379 requested_region: &str,
7380 canonical_request_bytes: &[u8],
7381 now: chrono::DateTime<chrono::Utc>,
7382 ) -> Result<(), SigV4aGateError> {
7383 if !crate::sigv4a::detect(req) {
7384 return Ok(());
7385 }
7386 let auth_hdr = req
7387 .headers()
7388 .get(http::header::AUTHORIZATION)
7389 .and_then(|v| v.to_str().ok())
7390 .ok_or(SigV4aGateError::MissingAuthorization)?;
7391 let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
7392 .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
7393 let region_set = req
7394 .headers()
7395 .get(crate::sigv4a::REGION_SET_HEADER)
7396 .and_then(|v| v.to_str().ok())
7397 .unwrap_or("*");
7398 let key = self
7399 .store
7400 .get(&parsed.access_key_id)
7401 .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
7402 // v0.8.4 #76: snapshot the request headers into a
7403 // lowercase-keyed flat map so `verify_request` can do the
7404 // x-amz-date freshness checks without taking a generic
7405 // `HeaderMap` dep. Cheap because the headers list is tiny.
7406 //
7407 // v0.8.5 #84 (audit H-4): detect duplicate header names while
7408 // we flatten — `HashMap::insert` would silently overwrite the
7409 // first value with the second, mirroring the auth-confusion
7410 // vector the canonical-request builder also defends against.
7411 // Reject upfront so the rest of the gate (freshness check,
7412 // ECDSA verify) never sees a half-truncated header set. We
7413 // detect by checking `contains_key` *before* insertion rather
7414 // than by counting via `headers().get_all`, because the
7415 // upstream `HeaderMap` iteration yields each duplicate entry
7416 // as its own (name, value) pair — the second-seen entry is
7417 // exactly what `contains_key` traps.
7418 let mut header_map: std::collections::HashMap<String, String> =
7419 std::collections::HashMap::with_capacity(req.headers().len());
7420 for (name, value) in req.headers() {
7421 if let Ok(v) = value.to_str() {
7422 let lower = name.as_str().to_ascii_lowercase();
7423 if header_map.contains_key(&lower) {
7424 return Err(SigV4aGateError::Verify(
7425 crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
7426 ));
7427 }
7428 header_map.insert(lower, v.to_string());
7429 }
7430 }
7431 crate::sigv4a::verify_request(
7432 &parsed,
7433 &header_map,
7434 canonical_request_bytes,
7435 key,
7436 region_set,
7437 requested_region,
7438 now,
7439 self.skew_tolerance,
7440 )
7441 .map_err(SigV4aGateError::Verify)?;
7442 Ok(())
7443 }
7444}
7445
7446/// Failure modes from [`SigV4aGate::pre_route`]. All variants map to
7447/// HTTP 403 with one of the two AWS-standard error codes
7448/// (`InvalidAccessKeyId` / `SignatureDoesNotMatch` / `RequestTimeTooSkewed`)
7449/// — see [`SigV4aGateError::s3_error_code`].
7450#[derive(Debug, thiserror::Error)]
7451pub enum SigV4aGateError {
7452 #[error("missing Authorization header")]
7453 MissingAuthorization,
7454 #[error("malformed SigV4a Authorization header")]
7455 MalformedAuthorization,
7456 #[error("unknown SigV4a access-key-id: {0}")]
7457 UnknownAccessKey(String),
7458 #[error("SigV4a verification failed: {0}")]
7459 Verify(#[source] crate::sigv4a::SigV4aError),
7460}
7461
7462impl SigV4aGateError {
7463 /// AWS S3 error code that should accompany the response.
7464 ///
7465 /// v0.8.4 #76 (audit H-6): the freshness check surfaces
7466 /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
7467 /// failures surface as `InvalidRequest` (400); other failures stay
7468 /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
7469 /// surface stays AWS-compatible.
7470 #[must_use]
7471 pub fn s3_error_code(&self) -> &'static str {
7472 match self {
7473 Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
7474 Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
7475 "RequestTimeTooSkewed"
7476 }
7477 Self::Verify(
7478 crate::sigv4a::SigV4aError::MissingXAmzDate
7479 | crate::sigv4a::SigV4aError::InvalidDateFormat
7480 | crate::sigv4a::SigV4aError::DateScopeMismatch
7481 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7482 | crate::sigv4a::SigV4aError::InvalidTerminator
7483 | crate::sigv4a::SigV4aError::WrongService { .. }
7484 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7485 ) => "InvalidRequest",
7486 _ => "SignatureDoesNotMatch",
7487 }
7488 }
7489
7490 /// HTTP status code to accompany the response. v0.8.4 #76: format
7491 /// errors that are clearly client mistakes (missing / malformed
7492 /// `x-amz-date`, malformed credential scope, wrong service) are
7493 /// surfaced as 400 InvalidRequest; the rest stay 403.
7494 #[must_use]
7495 pub fn http_status(&self) -> http::StatusCode {
7496 match self {
7497 Self::Verify(
7498 crate::sigv4a::SigV4aError::MissingXAmzDate
7499 | crate::sigv4a::SigV4aError::InvalidDateFormat
7500 | crate::sigv4a::SigV4aError::DateScopeMismatch
7501 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7502 | crate::sigv4a::SigV4aError::InvalidTerminator
7503 | crate::sigv4a::SigV4aError::WrongService { .. }
7504 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7505 ) => http::StatusCode::BAD_REQUEST,
7506 _ => http::StatusCode::FORBIDDEN,
7507 }
7508 }
7509}
7510
7511#[cfg(test)]
7512mod tests {
7513 use super::*;
7514
#[test]
fn manifest_roundtrip_via_metadata() {
    // Write a manifest into initially-absent metadata, read it back,
    // and compare the two field by field.
    let mut metadata: Option<Metadata> = None;
    let written = ChunkManifest {
        codec: CodecKind::CpuZstd,
        original_size: 1234,
        compressed_size: 567,
        crc32c: 0xdead_beef,
    };
    write_manifest(&mut metadata, &written);

    let read_back = extract_manifest(&metadata).expect("manifest must round-trip");
    assert_eq!(read_back.codec, written.codec);
    assert_eq!(read_back.original_size, written.original_size);
    assert_eq!(read_back.compressed_size, written.compressed_size);
    assert_eq!(read_back.crc32c, written.crc32c);
}
7531
#[test]
fn missing_metadata_yields_none() {
    // With no metadata at all there is no manifest to extract.
    assert!(extract_manifest(&None::<Metadata>).is_none());
}
7537
#[test]
fn partial_metadata_yields_none() {
    // Only the codec entry is present; the remaining manifest entries
    // are missing, so extraction must yield nothing.
    let mut codec_only = Metadata::new();
    codec_only.insert(META_CODEC.into(), "cpu-zstd".into());
    let wrapped = Some(codec_only);
    assert!(extract_manifest(&wrapped).is_none());
}
7545
#[test]
fn parse_copy_source_range_basic() {
    // A closed `bytes=N-M` range parses into `Range::Int` with both ends.
    let parsed = parse_copy_source_range("bytes=10-20").unwrap();
    if let s3s::dto::Range::Int { first, last } = parsed {
        assert_eq!(first, 10);
        assert_eq!(last, Some(20));
    } else {
        panic!("expected Int range");
    }
}
7557
#[test]
fn parse_copy_source_range_rejects_inverted() {
    // An end offset below the start offset must be reported as such.
    let message = parse_copy_source_range("bytes=20-10").unwrap_err();
    let mentions_inversion = message.contains("last < first");
    assert!(mentions_inversion);
}
7563
#[test]
fn parse_copy_source_range_rejects_missing_prefix() {
    // Without the `bytes=` unit prefix the value is not a valid range.
    let message = parse_copy_source_range("10-20").unwrap_err();
    let mentions_prefix = message.contains("must start with 'bytes='");
    assert!(mentions_prefix);
}
7569
#[test]
fn parse_copy_source_range_rejects_open_ended() {
    // The S3 upload_part_copy spec requires the closed N-M form for
    // this header; neither the open-ended nor the suffix form is
    // allowed.
    for rejected_form in ["bytes=10-", "bytes=-10"] {
        assert!(parse_copy_source_range(rejected_form).is_err());
    }
}
7577
7578 // v0.7 #49: safe_object_uri must round-trip every legal S3 key
7579 // (which includes spaces, slashes, control chars, raw UTF-8) into
7580 // a parseable `http::Uri` instead of panicking like the previous
7581 // `format!(...).parse().unwrap()` call sites did.
7582
#[test]
fn safe_object_uri_basic_ascii() {
    // Plain ASCII bucket and key come through as `/bucket/key`.
    let built = safe_object_uri("bucket", "key").expect("ascii must be safe");
    let path = built.path();
    assert_eq!(path, "/bucket/key");
}
7588
#[test]
fn safe_object_uri_encodes_spaces() {
    let built = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
    let path = built.path();
    // RFC 3986 path-segment encoding represents ' ' as %20.
    assert!(
        path.contains("%20"),
        "expected percent-encoded space, got {}",
        path
    );
    assert!(path.starts_with("/bucket/"));
}
7600
#[test]
fn safe_object_uri_preserves_slashes() {
    // '/' is a legal logical separator inside S3 keys, so the helper
    // must leave it unescaped; escaping it would change the hierarchy
    // the synthetic URI appears to have.
    let built = safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
    let path = built.path();
    assert_eq!(path, "/bucket/key/with/slashes");
}
7609
#[test]
fn safe_object_uri_handles_newline_without_panic() {
    // A newline is a control character in a URI. Both `Ok` (encoded as
    // %0A) and `Err` (parse rejects it) are acceptable; the only
    // requirement is that the helper does not panic.
    let _outcome = safe_object_uri("bucket", "key\n");
}
7617
#[test]
fn safe_object_uri_handles_null_byte_without_panic() {
    // The result is ignored; the test only requires that an embedded
    // NUL does not panic.
    let _outcome = safe_object_uri("bucket", "key\0bad");
}
7622
#[test]
fn safe_object_uri_handles_unicode_without_panic() {
    // Results are discarded: the only requirement is "no panic".
    let tricky_keys = [
        "rtl\u{202E}override", // right-to-left override
        "\u{FEFF}bom-key",     // leading byte-order mark
        "日本語キー",          // plain Japanese
    ];
    for key in tricky_keys {
        let _ = safe_object_uri("bucket", key);
    }
}
7630
#[test]
fn safe_object_uri_no_panic_for_every_byte() {
    // Walk every byte value 0x00..=0xFF as a one-byte key; none may
    // panic. Bytes 0x80..=0xFF are not valid UTF-8 on their own, so
    // each byte goes through `String::from_utf8_lossy` to hand the
    // helper a real `&str` whatever the raw byte was.
    (u8::MIN..=u8::MAX)
        .map(|byte| String::from_utf8_lossy(&[byte]).into_owned())
        .for_each(|key| {
            let _ = safe_object_uri("bucket", &key);
        });
}
7642
7643 /// v0.8.1 #58: smoke test for the DEK-handling shape used by the
7644 /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
7645 /// Mirrors the call pattern (generate_dek → length check → copy
7646 /// into stack `[u8; 32]` → reborrow as `&[u8; 32]` for `SseSource`)
7647 /// without spinning up a full `S4Service`.
7648 ///
7649 /// The real assertion this guards against is a regression where
7650 /// the `Zeroizing` wrapper is accidentally dropped before the
7651 /// stack copy lands (e.g. someone refactors to use
7652 /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
7653 /// or where `&**dek` is rewritten in a way that doesn't compile.
7654 #[tokio::test]
7655 async fn kms_dek_lifetime_within_function_scope() {
7656 use crate::kms::{KmsBackend, LocalKms};
7657 use std::collections::HashMap;
7658 use std::path::PathBuf;
7659 use zeroize::Zeroizing;
7660
7661 let mut keks = HashMap::new();
7662 keks.insert("scope".to_string(), [33u8; 32]);
7663 let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);
7664
7665 // Mirror the put_object KMS branch shape exactly.
7666 let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
7667 assert_eq!(dek.len(), 32);
7668 let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
7669 dek_arr.copy_from_slice(&dek);
7670
7671 // The reborrow used at the SseSource construction site —
7672 // mirrors the call-site pattern where `let dek_ref: &[u8; 32]`
7673 // auto-derefs from a `Zeroizing<[u8; 32]>` reference.
7674 let dek_ref: &[u8; 32] = &dek_arr;
7675 // Sanity: the reborrow points at the same bytes.
7676 assert_eq!(dek_ref, &*dek_arr);
7677 // Wrapped key id flows through unchanged.
7678 assert_eq!(wrapped.key_id, "scope");
7679
7680 // At end of scope, both `dek` (Zeroizing<Vec<u8>>) and
7681 // `dek_arr` (Zeroizing<[u8; 32]>) are dropped, wiping the
7682 // backing memory. Cannot directly assert the wipe (would be
7683 // UB to read freed memory), so this test instead enforces
7684 // that the call shape compiles and executes; the wipe itself
7685 // is exercised by the `zeroize` crate's own test suite.
7686 }
7687
7688 /// v0.8.5 #86 (audit M-2): the replication dispatcher must
7689 /// `acquire_owned()` a permit from `replication_semaphore` before
7690 /// kicking off the destination PUT, so a saturated semaphore
7691 /// back-pressures the in-flight queue depth instead of letting it
7692 /// grow without bound. We exercise the field directly (initial
7693 /// permit count, override via `with_replication_max_concurrent`,
7694 /// permit drop on `Drop`) — the full `spawn_replication_if_matched`
7695 /// integration is exercised by the existing replication tests in
7696 /// `tests/feature_e2e.rs` once a `ReplicationManager` is attached.
7697 #[tokio::test]
7698 async fn replication_semaphore_caps_concurrent_dispatchers() {
7699 // Build a minimal `S4Service` directly — no handler path is
7700 // exercised, only the constructor + setter + accessor shape.
7701 let registry = Arc::new(
7702 CodecRegistry::new(CodecKind::Passthrough)
7703 .with(Arc::new(s4_codec::passthrough::Passthrough)),
7704 );
7705 let dispatcher = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
7706 CodecKind::Passthrough,
7707 ));
7708 let s4 = S4Service::new(NoopBackend, registry, dispatcher);
7709
7710 // Default cap matches the documented constant.
7711 assert_eq!(
7712 s4.replication_semaphore().available_permits(),
7713 S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
7714 "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
7715 );
7716
7717 // Override via the builder — replaces the underlying `Semaphore`.
7718 let s4 = s4.with_replication_max_concurrent(2);
7719 assert_eq!(
7720 s4.replication_semaphore().available_permits(),
7721 2,
7722 "with_replication_max_concurrent(2) must expose exactly 2 permits"
7723 );
7724
7725 // Acquiring permits must reduce `available_permits()` and
7726 // dropping them must restore the count — this is the contract
7727 // `spawn_replication_if_matched` relies on for back-pressure.
7728 let sem = Arc::clone(s4.replication_semaphore());
7729 let p1 = sem.clone().acquire_owned().await.expect("permit 1");
7730 let p2 = sem.clone().acquire_owned().await.expect("permit 2");
7731 assert_eq!(
7732 sem.available_permits(),
7733 0,
7734 "two acquired permits must zero `available_permits()`"
7735 );
7736 // A third `try_acquire_owned` must fail — the cap is enforced
7737 // synchronously, no extra spawn slips through.
7738 assert!(
7739 sem.clone().try_acquire_owned().is_err(),
7740 "third acquire must back-pressure: cap was 2"
7741 );
7742 drop(p1);
7743 drop(p2);
7744 assert_eq!(
7745 sem.available_permits(),
7746 2,
7747 "dropping permits must restore cap"
7748 );
7749
7750 // Lower-bound clamp: a 0 cap would deadlock all dispatchers,
7751 // so the setter clamps it to 1 instead of accepting it
7752 // (callers are warned in the CLI doc).
7753 let s4 = s4.with_replication_max_concurrent(0);
7754 assert_eq!(
7755 s4.replication_semaphore().available_permits(),
7756 1,
7757 "cap=0 must be clamped to 1 to avoid total deadlock"
7758 );
7759 }
7760
7761 /// v0.8.5 #86 (audit M-1): the access-log flusher must return a
7762 /// `JoinHandle<()>` that the caller can `abort()` on shutdown
7763 /// without leaving a dangling task. The pre-#86 call site dropped
7764 /// the handle at end-of-block (silently detaching it); the fix is
7765 /// hoisting it into a process-lived `Vec` so the graceful-shutdown
7766 /// branch in `main.rs` can wait for clean exit. This test exercises
7767 /// the `JoinHandle.abort()` shape directly so a future refactor that
7768 /// stops returning the handle (or returns a non-abortable wrapper)
7769 /// trips this regression guard.
7770 #[tokio::test]
7771 async fn flusher_handle_can_be_aborted_cleanly() {
7772 // Stand up a minimal `AccessLog` pointing at a tmp dir so the
7773 // flusher's `create_dir_all` succeeds. The dir is cleaned up
7774 // by the OS / test harness; we don't assert on the contents.
7775 let tmp = std::env::temp_dir().join(format!(
7776 "s4-86-flusher-{}-{}",
7777 std::process::id(),
7778 std::time::SystemTime::now()
7779 .duration_since(std::time::UNIX_EPOCH)
7780 .map(|d| d.as_nanos())
7781 .unwrap_or(0)
7782 ));
7783 let dest = crate::access_log::AccessLogDest { dir: tmp.clone() };
7784 let log = crate::access_log::AccessLog::new(dest);
7785 let handle = log.spawn_flusher(None);
7786 assert!(
7787 !handle.is_finished(),
7788 "freshly-spawned flusher must not yet be finished"
7789 );
7790 handle.abort();
7791 // `await`-ing an aborted handle returns `Err(JoinError)` whose
7792 // `is_cancelled()` is true.
7793 let join_result = handle.await;
7794 assert!(
7795 join_result.is_err(),
7796 "aborted flusher must surface JoinError, got Ok"
7797 );
7798 assert!(
7799 join_result.unwrap_err().is_cancelled(),
7800 "JoinError must report .is_cancelled() = true after abort()"
7801 );
7802 let _ = std::fs::remove_dir_all(&tmp);
7803 }
7804
7805 /// Stub backend used solely by the v0.8.5 #86 unit tests above —
7806 /// the `S4Service` constructor needs `B: S3` but the tests only
7807 /// exercise builder / accessor shape, never a handler call. Every
7808 /// `S3` method falls through to the trait's default
7809 /// `NotImplemented` (which `s3s` provides automatically).
7810 struct NoopBackend;
7811
7812 #[async_trait::async_trait]
7813 impl S3 for NoopBackend {}
7814
7815 /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the
7816 /// dispatcher spawn site must intercept a panicking inner future,
7817 /// log at ERROR, and bump the per-kind counter — instead of letting
7818 /// the panic propagate as a `JoinError` that no operator dashboard
7819 /// scrapes. We exercise the wrapper directly (rather than driving a
7820 /// full `spawn_replication_if_matched` end-to-end, which would
7821 /// require a full `S4Service` + backend) because the wrapper shape
7822 /// is the load-bearing piece — any inner-future swap would still
7823 /// route through the same `AssertUnwindSafe(...).catch_unwind()`
7824 /// closure we want to lock in here.
7825 #[tokio::test]
7826 async fn dispatcher_panic_caught_and_metric_bumped() {
7827 use futures::FutureExt as _;
7828
7829 let handle = crate::metrics::test_metrics_handle();
7830 let kind = "replication";
7831
7832 // Mirror the production wrapper shape verbatim — if the
7833 // production code ever stops using `AssertUnwindSafe.catch_unwind`
7834 // this test shouldn't keep passing on a hand-rolled copy that
7835 // diverged.
7836 let panicking = async {
7837 panic!("simulated dispatcher panic");
7838 };
7839 let result = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
7840 assert!(
7841 result.is_err(),
7842 "catch_unwind must surface the panic instead of swallowing it"
7843 );
7844 // Bump the production counter via the same helper the wrapper
7845 // calls so the rendered output gates on the production code
7846 // path, not a parallel bookkeeping copy.
7847 crate::metrics::record_dispatcher_panic(kind);
7848
7849 let rendered = handle.render();
7850 assert!(
7851 rendered.contains("s4_dispatcher_panics_total"),
7852 "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
7853 );
7854 assert!(
7855 rendered.contains("kind=\"replication\""),
7856 "expected kind=\"replication\" label in metrics output, got: {rendered}"
7857 );
7858 }
7859}