s4_server/service.rs
1//! `s3s::S3` 実装 — `s3s_aws::Proxy` への delegation を default にしつつ、
2//! `put_object` / `get_object` 経路で `s4_codec::CodecRegistry` を呼ぶ。
3//!
4//! ## カバー範囲 (Phase 1 月 2)
5//!
6//! - 圧縮 hook あり: `put_object`, `get_object`
7//! - 純 delegation (圧縮なし): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
8//! `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
9//! `list_objects_v2`, `create_multipart_upload`, `upload_part`,
10//! `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
11//! `list_parts`
12//! - 未対応 (デフォルトで NotImplemented): その他 80+ ops (Tagging / ACL / Lifecycle 等は Phase 2)
13//!
14//! ## アーキテクチャ
15//!
16//! - `S4Service<B>` は backend (B: S3) と `Arc<CodecRegistry>` と `Arc<dyn CodecDispatcher>`
17//! を保持する。`CodecRegistry` 経由で複数 codec を抱えられるので、ひとつの S4 インスタンスが
18//! 複数 codec で書かれた object を透過的に GET できる
19//! - PUT: dispatcher が body の先頭 sample から codec を選び、registry で compress、
20//! manifest を S3 metadata に書いて backend に forward
21//! - GET: backend から取得 → metadata から manifest を復元 → registry.decompress で
22//! manifest 指定の codec で解凍 → 元の bytes を return
23//!
24//! ## 既知の制限事項
25//!
26//! - **Multipart Upload は per-part 圧縮が未実装**: 現状は upload_part を素通し。
27//! Phase 1 月 2 後半で per-part compress + complete_multipart_upload で manifest 集約。
28//! - **PUT body は memory に collect**: max_body_bytes 上限あり (default 5 GiB = S3 単発 PUT 上限)。
29//! Streaming-aware 圧縮は Phase 2。
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39 FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40 write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47 bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50 Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51 pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52 supports_streaming_decompress,
53};
54
/// Maximum number of bytes taken from the head of a PUT body when it is
/// sampled.
const SAMPLE_BYTES: usize = 4 * 1024;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66 if let Some(secs) = tel.gpu_seconds {
67 crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68 }
69 if tel.oom {
70 crate::metrics::record_gpu_oom(tel.codec);
71 }
72}
73
74/// v0.7 #49: percent-encoding set covering everything that is **not** an
75/// `unreserved` character per RFC 3986 §2.3, **plus** we additionally
76/// encode the path-reserved sub-delims that `http::Uri` rejects in a
77/// path segment (`?`, `#`, `%`, control bytes, space, etc.). We
78/// deliberately keep `/` un-encoded because S3 keys legally use `/` as
79/// a logical separator and the rest of the synthetic URI relies on the
80/// path layout `/{bucket}/{key}` round-tripping byte-for-byte.
81const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
82 .add(b' ')
83 .add(b'"')
84 .add(b'#')
85 .add(b'<')
86 .add(b'>')
87 .add(b'?')
88 .add(b'`')
89 .add(b'{')
90 .add(b'}')
91 .add(b'|')
92 .add(b'\\')
93 .add(b'^')
94 .add(b'[')
95 .add(b']')
96 .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110 use percent_encoding::utf8_percent_encode;
111 let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112 let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113 let raw = format!("/{bucket_enc}/{key_enc}");
114 raw.parse::<http::Uri>().map_err(|e| {
115 // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116 // can't be represented in a request URI. The generated
117 // `S3ErrorCode` enum doesn't expose a typed variant for it,
118 // so we round-trip through `from_bytes` which preserves the
119 // canonical wire string while falling back to InvalidArgument
120 // if even that lookup fails (cannot happen at runtime — kept
121 // as a belt-and-suspenders branch so this helper never
122 // panics).
123 let code =
124 S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125 S3Error::with_message(
126 code,
127 format!("object key cannot be encoded as a request URI: {e}"),
128 )
129 })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150 body: &[u8],
151 content_md5_b64: Option<&str>,
152 checksum_crc32_b64: Option<&str>,
153 checksum_crc32c_b64: Option<&str>,
154 checksum_sha1_b64: Option<&str>,
155 checksum_sha256_b64: Option<&str>,
156 checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158 use base64::Engine as _;
159 use md5::Md5;
160 use sha2::Sha256;
161 // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162 // trait methods into scope. Bind anonymously so this `use` is
163 // never flagged as unused while still serving its real purpose.
164 use md5::Digest as _;
165 let b64 = base64::engine::general_purpose::STANDARD;
166 let bad = |what: &str| {
167 let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168 S3Error::with_message(
169 code,
170 format!("client-supplied {what} did not match the received body"),
171 )
172 };
173 if let Some(claimed) = content_md5_b64 {
174 let want = b64.decode(claimed).map_err(|_| {
175 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176 })?;
177 if want.len() != 16 {
178 return Err(S3Error::with_message(
179 S3ErrorCode::InvalidDigest,
180 "Content-MD5 must decode to 16 bytes",
181 ));
182 }
183 let mut h = Md5::new();
184 h.update(body);
185 let got = h.finalize();
186 // `subtle::ConstantTimeEq` would be ideal but the existing
187 // `constant_time_eq` helper in sse.rs is private; use a
188 // straightforward byte compare. The attacker doesn't get to
189 // choose the body retroactively, so a timing oracle here
190 // doesn't help them. `&got[..]` derefs the GenericArray
191 // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192 // generic-array 1.x; CI runs `-D warnings`).
193 if got[..] != *want.as_slice() {
194 return Err(bad("Content-MD5"));
195 }
196 }
197 if let Some(claimed) = checksum_crc32c_b64 {
198 let want = b64.decode(claimed).map_err(|_| {
199 S3Error::with_message(
200 S3ErrorCode::InvalidDigest,
201 "malformed x-amz-checksum-crc32c",
202 )
203 })?;
204 if want.len() != 4 {
205 return Err(S3Error::with_message(
206 S3ErrorCode::InvalidDigest,
207 "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208 ));
209 }
210 let got = crc32c::crc32c(body).to_be_bytes();
211 if got != want.as_slice() {
212 return Err(bad("x-amz-checksum-crc32c"));
213 }
214 }
215 if let Some(claimed) = checksum_sha256_b64 {
216 let want = b64.decode(claimed).map_err(|_| {
217 S3Error::with_message(
218 S3ErrorCode::InvalidDigest,
219 "malformed x-amz-checksum-sha256",
220 )
221 })?;
222 if want.len() != 32 {
223 return Err(S3Error::with_message(
224 S3ErrorCode::InvalidDigest,
225 "x-amz-checksum-sha256 must decode to 32 bytes",
226 ));
227 }
228 let mut h = Sha256::new();
229 h.update(body);
230 let got = h.finalize();
231 if got[..] != *want.as_slice() {
232 return Err(bad("x-amz-checksum-sha256"));
233 }
234 }
235 // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236 // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237 // big-endian value, base64-encoded.
238 if let Some(claimed) = checksum_crc32_b64 {
239 let want = b64.decode(claimed).map_err(|_| {
240 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241 })?;
242 if want.len() != 4 {
243 return Err(S3Error::with_message(
244 S3ErrorCode::InvalidDigest,
245 "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246 ));
247 }
248 let mut h = crc32fast::Hasher::new();
249 h.update(body);
250 let got = h.finalize().to_be_bytes();
251 if got != want.as_slice() {
252 return Err(bad("x-amz-checksum-crc32"));
253 }
254 }
255 // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256 if let Some(claimed) = checksum_sha1_b64 {
257 use sha1::Sha1;
258 let want = b64.decode(claimed).map_err(|_| {
259 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260 })?;
261 if want.len() != 20 {
262 return Err(S3Error::with_message(
263 S3ErrorCode::InvalidDigest,
264 "x-amz-checksum-sha1 must decode to 20 bytes",
265 ));
266 }
267 let mut h = Sha1::new();
268 h.update(body);
269 let got = h.finalize();
270 if got[..] != *want.as_slice() {
271 return Err(bad("x-amz-checksum-sha1"));
272 }
273 }
274 // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275 // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276 // 0xffffffffffffffff, refin / refout true. The reflected
277 // polynomial + 256-entry lookup table are computed lazily on
278 // first call (small enough to inline rather than pull in a
279 // dedicated crc64 crate).
280 if let Some(claimed) = checksum_crc64nvme_b64 {
281 let want = b64.decode(claimed).map_err(|_| {
282 S3Error::with_message(
283 S3ErrorCode::InvalidDigest,
284 "malformed x-amz-checksum-crc64nvme",
285 )
286 })?;
287 if want.len() != 8 {
288 return Err(S3Error::with_message(
289 S3ErrorCode::InvalidDigest,
290 "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291 ));
292 }
293 let got = crc64_nvme(body).to_be_bytes();
294 if got != want.as_slice() {
295 return Err(bad("x-amz-checksum-crc64nvme"));
296 }
297 }
298 Ok(())
299}
300
/// v0.8.12 #128 (MED-C): CRC-64/NVME, the algorithm behind AWS S3's
/// `x-amz-checksum-crc64nvme`.
///
/// Parameters: poly `0xad93d23594c93659`, init `0xffffffffffffffff`,
/// refin = true, refout = true, xorout `0xffffffffffffffff`.
///
/// The 256-entry lookup table for the reflected polynomial is evaluated at
/// compile time, so calls pay no lazy-initialisation check.
///
/// Returns the checksum of `bytes`; the empty input yields `0`.
fn crc64_nvme(bytes: &[u8]) -> u64 {
    /// Reflected polynomial (bit-reverse of 0xad93d23594c93659).
    const POLY_REFLECTED: u64 = 0x9a6c_9329_ac4b_c9b5;

    /// Build the byte-at-a-time table: entry `i` is the remainder after
    /// feeding the single byte `i` through eight reflected shift/xor steps.
    const fn build_table() -> [u64; 256] {
        let mut table = [0u64; 256];
        let mut byte = 0usize;
        // `while` rather than `for`: iterators are not usable in `const fn`.
        while byte < 256 {
            let mut rem = byte as u64;
            let mut bit = 0;
            while bit < 8 {
                rem = if rem & 1 != 0 {
                    (rem >> 1) ^ POLY_REFLECTED
                } else {
                    rem >> 1
                };
                bit += 1;
            }
            table[byte] = rem;
            byte += 1;
        }
        table
    }

    // `static` (not `const`) so the 2 KiB table exists once in the binary
    // instead of being materialised at each use site.
    static TABLE: [u64; 256] = build_table();

    let crc = bytes.iter().fold(!0u64, |crc, &b| {
        // `crc as u8` deliberately keeps only the low byte of the register.
        let idx = usize::from((crc as u8) ^ b);
        (crc >> 8) ^ TABLE[idx]
    });
    !crc
}
337
/// v0.4 #20: captured at the start of a handler, before the request is
/// consumed by the backend call, so the matching `record_access` at
/// end-of-request can fill in the structured access log entry.
///
/// NOTE(review): the per-field notes below are inferred from the field
/// names only — confirm against the code that builds the preamble.
struct AccessLogPreamble {
    /// Presumably the client address, when one is available.
    remote_ip: Option<String>,
    /// Presumably the identity of the caller, when one is available.
    requester: Option<String>,
    /// The request URI; the only field that is always present.
    request_uri: String,
    /// Presumably the `User-Agent` header value, when sent.
    user_agent: Option<String>,
}
347
/// S3 service front-end that wraps a backend `B` (itself an `S3`
/// implementation) together with a codec registry and a codec dispatcher.
///
/// Every optional subsystem below is stored as `Option<...>` (or a plain
/// flag) and starts out disabled in [`S4Service::new`]; each field's doc
/// describes what it governs once enabled.
pub struct S4Service<B: S3> {
    /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
    /// dispatcher can clone it into a detached `tokio::spawn` task
    /// (Arc::clone is cheap; backend trait methods take `&self` so no
    /// other handler is affected by the indirection).
    backend: Arc<B>,
    /// Codec registry: compresses PUT bodies and decompresses GET bodies
    /// with the codec named in the object's manifest.
    registry: Arc<CodecRegistry>,
    /// Chooses the codec for a PUT from a sample of the head of the body.
    dispatcher: Arc<dyn CodecDispatcher>,
    /// Upper bound on a PUT body that is collected in memory. Initialised
    /// to `DEFAULT_MAX_BODY_BYTES` by `new`.
    max_body_bytes: usize,
    /// Optional shared policy; `None` by default.
    /// NOTE(review): presumably the policy consulted by the IAM policy
    /// evaluator mentioned in other field docs — confirm.
    policy: Option<crate::policy::SharedPolicy>,
    /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
    /// to `true` when the listener is wrapped in TLS (or ACME), so policies
    /// gating "deny if not over TLS" can do their job. Defaults to `false`
    /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
    secure_transport: bool,
    /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
    rate_limits: Option<crate::rate_limit::SharedRateLimits>,
    /// v0.4 #20: optional S3-style access log emitter.
    access_log: Option<crate::access_log::SharedAccessLog>,
    /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
    /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
    /// (with the keyring's active key id) after the compress + framing
    /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
    /// frame parsing. A `with_sse_key(...)` call wraps the supplied
    /// key in a 1-slot keyring so single-key (v0.4) operators get the
    /// same behaviour they had before, just on the v2 frame.
    sse_keyring: Option<crate::sse::SharedSseKeyring>,
    /// v0.5 #34: optional first-class versioning state machine. When
    /// `Some(...)`, S4-server itself owns the per-bucket versioning
    /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
    /// list_object_versions / get_bucket_versioning /
    /// put_bucket_versioning handlers consult the manager instead of
    /// passing through. When `None` (default), the legacy
    /// backend-passthrough behaviour applies so existing v0.4
    /// deployments are unaffected until they explicitly call
    /// `with_versioning(...)`.
    versioning: Option<Arc<crate::versioning::VersioningManager>>,
    /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
    /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
    /// generate a fresh DEK via the backend, encrypt the body with it
    /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
    /// S4E4 unwrap the DEK through the same backend before decrypt.
    /// `kms_default_key_id` is used when the request omits an explicit
    /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
    /// bucket-default behaviour).
    kms: Option<Arc<dyn crate::kms::KmsBackend>>,
    /// Fallback KMS key id used when a request omits an explicit
    /// `x-amz-server-side-encryption-aws-kms-key-id` (see `kms`).
    kms_default_key_id: Option<String>,
    /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
    /// `Some(...)`, `delete_object` and overwrite-style `put_object`
    /// consult the manager and refuse the operation with HTTP 403
    /// `AccessDenied` while the object is locked (Compliance until
    /// expiry, Governance unless the bypass header is set, or any time
    /// a legal hold is on). PUT also auto-applies the bucket-default
    /// retention to brand-new objects when configured. When `None`
    /// (default), the legacy backend-passthrough behaviour applies, so
    /// existing v0.4 deployments are unaffected until they explicitly
    /// call `with_object_lock(...)`.
    object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
    /// v0.6 #38: optional first-class CORS bucket configuration manager.
    /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
    /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
    /// consult the manager instead of passing through to the backend.
    /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
    /// style preflight matching through the same store; the actual HTTP
    /// OPTIONS routing wire-up at the listener level is a follow-up
    /// (s3s framework does not surface OPTIONS as a typed handler).
    cors: Option<Arc<crate::cors::CorsManager>>,
    /// v0.6 #36: optional first-class S3 Inventory manager. When
    /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
    /// configurations and `put_bucket_inventory_configuration` /
    /// `get_bucket_inventory_configuration` /
    /// `list_bucket_inventory_configurations` /
    /// `delete_bucket_inventory_configuration` consult the manager
    /// instead of passing through to the backend. The actual periodic
    /// CSV emission is driven by a tokio task in `main.rs` that calls
    /// `InventoryManager::run_once_for_test` on a fixed cadence; the
    /// service handlers below only deal with config-level CRUD.
    inventory: Option<Arc<crate::inventory::InventoryManager>>,
    /// v0.6 #35: optional first-class S3 bucket-notification manager.
    /// When `Some(...)`, S4-server itself owns per-bucket notification
    /// configurations and `put_bucket_notification_configuration` /
    /// `get_bucket_notification_configuration` consult the manager
    /// instead of passing through to the backend. Successful PUT /
    /// DELETE handlers fire matching destinations on a detached tokio
    /// task (best-effort; see `crate::notifications::dispatch_event`).
    notifications: Option<Arc<crate::notifications::NotificationManager>>,
    /// v0.6 #37: optional first-class S3 Lifecycle configuration
    /// manager. When `Some(...)`, S4-server itself owns per-bucket
    /// lifecycle rules and `put_bucket_lifecycle_configuration` /
    /// `get_bucket_lifecycle_configuration` /
    /// `delete_bucket_lifecycle` consult the manager instead of
    /// passing through to the backend. The actual background scanner
    /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
    /// rule) is a v0.7+ follow-up; the test path
    /// `S4Service::run_lifecycle_once_for_test` exercises the
    /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
    /// the configuration-management half without putting a
    /// half-wired bucket-walk in front of users.
    lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
    /// v0.6 #39: optional first-class object + bucket Tagging manager.
    /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
    /// per-bucket tag state — `PutObjectTagging` /
    /// `GetObjectTagging` / `DeleteObjectTagging` /
    /// `PutBucketTagging` / `GetBucketTagging` /
    /// `DeleteBucketTagging` route through the manager (replacing the
    /// previous backend-passthrough behaviour). `put_object` also
    /// pre-parses the `x-amz-tagging` header / `Tagging` input field
    /// so the IAM policy evaluator can gate on
    /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
    /// On a successful PUT the parsed tags are persisted; on a
    /// successful DELETE the matching tag entry is dropped.
    tagging: Option<Arc<crate::tagging::TagManager>>,
    /// v0.6 #40: optional first-class cross-bucket replication manager.
    /// When `Some(...)`, S4-server itself owns per-bucket replication
    /// rules; `PutBucketReplication` / `GetBucketReplication` /
    /// `DeleteBucketReplication` route through the manager (replacing
    /// the previous backend-passthrough behaviour). On every successful
    /// `put_object` the manager's rule list is consulted; the
    /// highest-priority matching enabled rule wins, the per-key status
    /// is recorded as `Pending`, and the source body and metadata are
    /// handed to a detached tokio task that PUTs to the destination
    /// bucket through the same backend. The replica is stamped with
    /// `x-amz-replication-status: REPLICA` in its metadata; the
    /// source-side status is updated to `Completed` on success or
    /// `Failed` after the 3-attempt retry budget is exhausted (drop
    /// counter bumps in either-side case so dashboards see the loss).
    /// `head_object` / `get_object` echo the recorded status back as
    /// `x-amz-replication-status` so consumers can poll progress.
    /// Limited to single-instance (same `S4Service`) replication; true
    /// cross-region (multi-instance) is a v0.7+ follow-up.
    replication: Option<Arc<crate::replication::ReplicationManager>>,
    /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
    /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
    /// request against a bucket whose MFA-Delete state is `Enabled`
    /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
    /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
    /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
    /// deployments are unaffected until they explicitly call
    /// `with_mfa_delete(...)`.
    mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
    /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
    /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
    /// or be matched against a configured server-managed keyring/KMS).
    /// Set by `--compliance-mode strict` after the boot-time
    /// prerequisite check passes.
    compliance_strict: bool,
    /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
    /// gate. When `Some(...)`, the listener-side middleware (see
    /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
    /// request and short-circuits SigV4a-signed ones — verifying the
    /// signature against the credential store and returning 403
    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
    /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
    /// `None`, the middleware is a no-op so the existing SigV4 path is
    /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
    sigv4a_gate: Option<Arc<SigV4aGate>>,
    /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
    /// SSE / Tagging / Object-Lock context captured at
    /// `CreateMultipartUpload` time through to `UploadPart` /
    /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
    /// store is gateway-internal and idle when no multipart is in
    /// flight. See [`crate::multipart_state`] for rationale.
    multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
    /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
    /// path. `0` (default) → use the legacy buffered S4E2 path
    /// (whole-body AES-GCM tag, GET buffers + verifies before
    /// emitting). Non-zero → use the chunked S4E5 frame so GET can
    /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
    /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
    /// (chunked variants tracked in a follow-up issue).
    sse_chunk_size: usize,
    /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
    /// replication dispatcher in [`Self::spawn_replication_if_matched`].
    /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
    /// rules × slow destination = O(10k) in-flight tokio tasks) could
    /// exhaust process memory before the destination drains. Each
    /// dispatcher spawn `acquire_owned`s one permit and holds it for the
    /// lifetime of the destination PUT + status stamp; once the cap is
    /// reached the dispatcher async-blocks on `acquire_owned()` so the
    /// listener path itself never stalls — only the in-flight replica
    /// queue depth is bounded. Default 1024 (operator-tunable via
    /// `--replication-max-concurrent`).
    replication_semaphore: Arc<tokio::sync::Semaphore>,
    /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
    /// `aws:SourceIp` Condition key only when the operator has
    /// explicitly opted in via `--trust-x-forwarded-for`. Default
    /// (`false`) makes the policy evaluator see `source_ip = None`
    /// for incoming requests, so a public-internet client can no
    /// longer spoof an internal CIDR by setting `X-Forwarded-For`
    /// themselves. Operators behind a trusted reverse proxy that
    /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
    /// listening directly on the public internet leave it off and
    /// gain a clear fail-closed default. A future release plumbs
    /// the TCP peer address through the s3s service trait so we can
    /// validate the forwarded header against a `--trusted-proxies`
    /// CIDR list; until then the boolean opt-in closes the immediate
    /// auth-bypass surface.
    trust_x_forwarded_for: bool,
}
547
548impl<B: S3> S4Service<B> {
549 /// AWS S3 単発 PUT の API 上限 (5 GiB)
550 pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;
551
552 /// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
553 /// replication dispatcher tasks. See the `replication_semaphore`
554 /// field doc for the rationale + override path.
555 pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
556
557 pub fn new(
558 backend: B,
559 registry: Arc<CodecRegistry>,
560 dispatcher: Arc<dyn CodecDispatcher>,
561 ) -> Self {
562 Self {
563 backend: Arc::new(backend),
564 registry,
565 dispatcher,
566 max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
567 policy: None,
568 secure_transport: false,
569 rate_limits: None,
570 access_log: None,
571 sse_keyring: None,
572 versioning: None,
573 kms: None,
574 kms_default_key_id: None,
575 object_lock: None,
576 cors: None,
577 inventory: None,
578 notifications: None,
579 lifecycle: None,
580 tagging: None,
581 replication: None,
582 mfa_delete: None,
583 compliance_strict: false,
584 sigv4a_gate: None,
585 multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
586 // v0.8 #52: chunked SSE-S4 disabled by default — opt
587 // in via `S4Service::with_sse_chunk_size(...)` /
588 // `--sse-chunk-size <BYTES>`. Default keeps the legacy
589 // S4E2 buffered path so existing deployments are
590 // bit-for-bit unchanged.
591 sse_chunk_size: 0,
592 // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
593 // replication tasks. Picked to be (a) ample headroom over a
594 // typical steady-state replication rate (the v0.8.3 #66
595 // status-sweep doc cites 1k keys/hour as a "steady" rate, so
596 // even a 100x burst lands well under 1024), (b) small enough
597 // that the worst-case memory pinned by stalled dispatchers
598 // — body bytes + metadata — stays bounded (1024 × 5 MiB
599 // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
600 // wider cross-region fan-out can override via
601 // `--replication-max-concurrent`.
602 replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
603 Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
604 )),
605 // v0.8.11 CRIT-4: default fail-closed — ignore client-
606 // supplied `X-Forwarded-For` until the operator opts in
607 // through `with_trust_x_forwarded_for(true)`.
608 trust_x_forwarded_for: false,
609 }
610 }
611
612 /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
613 /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
614 /// when the gateway sits behind a trusted reverse proxy that
615 /// strips (or rewrites) any client-supplied value. When left
616 /// off (default), the policy evaluator sees `source_ip = None`
617 /// regardless of what the client sends — closing the
618 /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
619 /// bypass.
620 #[must_use]
621 pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
622 self.trust_x_forwarded_for = on;
623 self
624 }
625
626 /// v0.7 #47: attach the SigV4a verify gate. Once set, the
627 /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
628 /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
629 /// verifying it against the supplied credential store and
630 /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
631 /// are unaffected. When the gate is unset (default), the
632 /// middleware skips entirely so existing SigV4 deployments keep
633 /// working.
634 #[must_use]
635 pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
636 self.sigv4a_gate = Some(gate);
637 self
638 }
639
640 /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
641 /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
642 /// consumes the `S4Service` (the listener-side middleware needs
643 /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
644 /// algorithm tokens with "unknown algorithm" — match has to
645 /// happen at the hyper layer instead).
646 #[must_use]
647 pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
648 self.sigv4a_gate.as_ref()
649 }
650
651 /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
652 /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
653 /// the `S4Service`. The background `sweep_stale` task in `main.rs`
654 /// holds this `Arc` and ticks once an hour to drop abandoned
655 /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
656 #[must_use]
657 pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
658 &self.multipart_state
659 }
660
661 /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
662 /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
663 /// through the manager (instead of forwarding to the backend),
664 /// and `put_object`'s `x-amz-tagging` parse path becomes the
665 /// source of `s3:RequestObjectTag/<key>` for the IAM policy
666 /// evaluator. The manager itself is shared via `Arc`.
667 #[must_use]
668 pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
669 self.tagging = Some(mgr);
670 self
671 }
672
673 /// v0.6 #39: borrow the attached tagging manager (test /
674 /// introspection — the snapshotter in `main.rs`, when wired,
675 /// will keep its own `Arc` clone).
676 #[must_use]
677 pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
678 self.tagging.as_ref()
679 }
680
681 /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
682 /// `put_bucket_inventory_configuration` /
683 /// `get_bucket_inventory_configuration` /
684 /// `list_bucket_inventory_configurations` /
685 /// `delete_bucket_inventory_configuration` route through the
686 /// manager. The actual periodic CSV / manifest emission is
687 /// orchestrated by a tokio task started in `main.rs`; the manager
688 /// itself is shared between the handler and the scheduler via
689 /// `Arc`.
690 #[must_use]
691 pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
692 self.inventory = Some(mgr);
693 self
694 }
695
696 /// v0.6 #36: borrow the attached inventory manager (test /
697 /// introspection — the background scheduler in `main.rs` keeps its
698 /// own `Arc` clone, so this accessor is for the test path that
699 /// invokes `run_once_for_test` directly).
700 #[must_use]
701 pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
702 self.inventory.as_ref()
703 }
704
705 /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
706 /// manager. Once set, `put_bucket_lifecycle_configuration` /
707 /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
708 /// route through the manager (replacing the previous backend-
709 /// passthrough behaviour). The actual periodic scanner that walks
710 /// the source bucket and invokes Expiration / Transition /
711 /// NoncurrentExpiration actions is a v0.7+ follow-up — see
712 /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
713 /// path that exercises the evaluator end-to-end.
714 #[must_use]
715 pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
716 self.lifecycle = Some(mgr);
717 self
718 }
719
720 /// v0.6 #37: borrow the attached lifecycle manager (test /
721 /// introspection — the background scheduler in `main.rs` keeps its
722 /// own `Arc` clone, so this accessor is for the test path that
723 /// invokes the evaluator directly).
724 #[must_use]
725 pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
726 self.lifecycle.as_ref()
727 }
728
729 /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
730 /// against a caller-provided list of `(key, age, size, tags)` tuples
731 /// and returns the `(key, action)` pairs that should fire. The actual
732 /// backend invocation (S3.delete_object / metadata rewrite) is left
733 /// to the caller — the unit + E2E tests use this to verify the
734 /// evaluator without spawning the (deferred) background scanner.
735 /// Returns an empty `Vec` when no lifecycle manager is attached or
736 /// no rule matches.
737 #[must_use]
738 pub fn run_lifecycle_once_for_test(
739 &self,
740 bucket: &str,
741 objects: &[crate::lifecycle::EvaluateBatchEntry],
742 ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
743 let Some(mgr) = self.lifecycle.as_ref() else {
744 return Vec::new();
745 };
746 crate::lifecycle::evaluate_batch(mgr, bucket, objects)
747 }
748
749 /// v0.6 #35: attach the in-memory bucket-notification manager. Once
750 /// set, `put_bucket_notification_configuration` /
751 /// `get_bucket_notification_configuration` route through the manager
752 /// (replacing the previous backend-passthrough behaviour); successful
753 /// `put_object` / `delete_object` calls fire matching destinations
754 /// on a detached tokio task via
755 /// `crate::notifications::dispatch_event` (best-effort, fire-and-
756 /// forget — failures bump the manager's `dropped_total` counter and
757 /// log at warn but do NOT fail the originating S3 request).
758 #[must_use]
759 pub fn with_notifications(
760 mut self,
761 mgr: Arc<crate::notifications::NotificationManager>,
762 ) -> Self {
763 self.notifications = Some(mgr);
764 self
765 }
766
767 /// v0.6 #35: borrow the attached notifications manager (test /
768 /// introspection — used by the metrics layer to read
769 /// `dropped_total`).
770 #[must_use]
771 pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
772 self.notifications.as_ref()
773 }
774
775 /// v0.6 #35: internal helper used by the DELETE handlers to fire a
776 /// matching notification on a detached tokio task. No-op when no
777 /// manager is attached or no rule on the bucket matches the given
778 /// (event, key) tuple.
779 fn fire_delete_notification(
780 &self,
781 bucket: &str,
782 key: &str,
783 event: crate::notifications::EventType,
784 version_id: Option<String>,
785 ) {
786 let Some(mgr) = self.notifications.as_ref() else {
787 return;
788 };
789 let dests = mgr.match_destinations(bucket, &event, key);
790 if dests.is_empty() {
791 return;
792 }
793 tokio::spawn(crate::notifications::dispatch_event(
794 Arc::clone(mgr),
795 bucket.to_owned(),
796 key.to_owned(),
797 event,
798 None,
799 None,
800 version_id,
801 format!("S4-{}", uuid::Uuid::new_v4()),
802 ));
803 }
804
805 /// v0.6 #40: attach the in-memory cross-bucket replication manager.
806 /// Once set, `put_bucket_replication` / `get_bucket_replication` /
807 /// `delete_bucket_replication` route through the manager (replacing
808 /// the previous backend-passthrough behaviour); a successful
809 /// `put_object` whose key matches an enabled rule fires a detached
810 /// tokio task that PUTs the same body + metadata to the rule's
811 /// destination bucket, stamping the replica with
812 /// `x-amz-replication-status: REPLICA`. Failures after the retry
813 /// budget bump the manager's `dropped_total` counter and are
814 /// surfaced in the `s4_replication_dropped_total` Prometheus
815 /// counter; successes bump `s4_replication_replicated_total`.
816 #[must_use]
817 pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
818 self.replication = Some(mgr);
819 self
820 }
821
822 /// v0.6 #40: borrow the attached replication manager (test /
823 /// introspection — used by the metrics layer to read
824 /// `dropped_total`).
825 #[must_use]
826 pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
827 self.replication.as_ref()
828 }
829
830 /// v0.6 #40: internal helper used by the PUT handlers to fire a
831 /// detached cross-bucket replication task. No-op when no manager
832 /// is attached, the source backend PUT failed, or no rule on the
833 /// source bucket matches the (key, tags) tuple. The `body` is the
834 /// post-compression / post-encryption `Bytes` that was sent to
835 /// the source backend (refcount-cloned), and `metadata` is the
836 /// metadata map that already includes the manifest /
837 /// `s4-encrypted` markers — the replica decodes through the same
838 /// path. The destination PUT runs through `Arc<B>::put_object`.
839 ///
840 /// ## v0.8.2 #61: generation token + shadow-key destination
841 ///
842 /// `pending_version` is the source-side `PutOutcome` minted by the
843 /// caller's versioning branch (or `None` for unversioned /
844 /// suspended buckets). When `pending_version.versioned_response`
845 /// is `true`, the dispatcher writes the destination under the same
846 /// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
847 /// destination's version chain receives the new version the same
848 /// way `?versionId=` GET resolves it. Closes audit C-1.
849 ///
850 /// The dispatcher also mints a fresh `generation` token before
851 /// spawning, threaded through to [`crate::replication::
852 /// replicate_object`]. Closes audit C-3 — a stale retry of an
853 /// older PUT can no longer overwrite the destination's newer bytes
854 /// because the CAS guard sees the higher stored generation and
855 /// drops its destination write.
856 ///
857 /// ## Asymmetric versioning policy (out of scope)
858 ///
859 /// We assume source + destination buckets share the same
860 /// versioning policy (both Enabled or both Suspended /
861 /// Unversioned). Cross-bucket policy queries would require a
862 /// backend round-trip per replication, which is not worth it for
863 /// the single-instance scope. Operators who configure asymmetric
864 /// versioning will see destination-side `?versionId=` lookups
865 /// miss — documented as out-of-scope until a future per-rule
866 /// `destination_versioning_policy` knob lands.
867 // 8 args is the post-#61 shape: replication needs the
868 // source bucket+key, the canonical tag set for rule-matching,
869 // the post-codec body+metadata for the destination PUT, the
870 // backend-success gate, and the pending version-id for the
871 // shadow-key destination override. A shape struct would just
872 // split the (single) call site so opt for the inline form.
873 #[allow(clippy::too_many_arguments)]
874 fn spawn_replication_if_matched(
875 &self,
876 source_bucket: &str,
877 source_key: &str,
878 request_tags: &Option<crate::tagging::TagSet>,
879 body: &bytes::Bytes,
880 metadata: &Option<std::collections::HashMap<String, String>>,
881 backend_ok: bool,
882 pending_version: Option<&crate::versioning::PutOutcome>,
883 ) where
884 B: Send + Sync + 'static,
885 {
886 if !backend_ok {
887 return;
888 }
889 let Some(mgr) = self.replication.as_ref() else {
890 return;
891 };
892 // Pull the request's tags into the (k, v) shape the matcher
893 // expects. The tagging manager would have the canonical
894 // post-PUT view but at this point in the pipeline it's
895 // already been written above; for the rule-match decision
896 // the request's tags are sufficient (= the tags this PUT
897 // applies, S3 PutObject is full-replace on tags).
898 let object_tags: Vec<(String, String)> = request_tags
899 .as_ref()
900 .map(|ts| ts.iter().cloned().collect())
901 .unwrap_or_default();
902 let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
903 return;
904 };
905 // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
906 // Pending stamp so the stamp itself carries the right
907 // generation (the CAS in `record_status_if_newer` would
908 // otherwise see a `generation=0` Pending and accept any
909 // stale retry).
910 let generation = mgr.next_generation();
911 // Eagerly mark the source key as Pending so a HEAD between
912 // the source PUT returning and the spawned task completing
913 // surfaces the in-flight state. CAS-guarded so a slower
914 // older PUT can't downgrade a newer Completed back to Pending.
915 let _ = mgr.record_status_if_newer(
916 source_bucket,
917 source_key,
918 generation,
919 crate::replication::ReplicationStatus::Pending,
920 );
921 // v0.8.2 #61: derive the destination storage key. For a
922 // versioning-Enabled source the destination receives the
923 // same shadow-key path so a `?versionId=<vid>` GET on the
924 // destination resolves through the same lookup the source
925 // uses. Suspended / Unversioned sources keep the logical
926 // key (= `None` override = dispatcher uses `source_key`).
927 let destination_key_override = pending_version
928 .filter(|pv| pv.versioned_response)
929 .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
930 // v0.8.3 #68 (audit M-1): capture the source object's Object
931 // Lock state so the dispatcher can decorate the destination
932 // PUT with the matching AWS-wire lock headers. Without this,
933 // a Compliance / Governance / legal-hold protected source
934 // would replicate to a destination where DELETE succeeds
935 // (the WORM posture would only hold on the source).
936 let source_lock_state = self
937 .object_lock
938 .as_ref()
939 .and_then(|mgr| mgr.get(source_bucket, source_key));
940 // v0.8.3 #68: hand the destination-side ObjectLockManager to
941 // the dispatcher closure so we can persist the propagated
942 // lock state on successful destination PUT (the destination
943 // PUT below bypasses S4Service::put_object — we drive the
944 // backend directly — so the explicit_lock_mode commit block
945 // in put_object never fires for replicas. We replay it here
946 // against the destination key.)
947 let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
948 let mgr_cl = Arc::clone(mgr);
949 let backend = Arc::clone(&self.backend);
950 let body_cl = body.clone();
951 let metadata_cl = metadata.clone();
952 let source_bucket_cl = source_bucket.to_owned();
953 let source_key_cl = source_key.to_owned();
954 let source_lock_state_for_closure = source_lock_state.clone();
955 let source_bucket_for_warn = source_bucket.to_owned();
956 // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
957 // depth. Acquire happens INSIDE the spawned task (not on the
958 // listener path) so a saturated semaphore back-pressures the
959 // dispatcher pool without stalling the source PUT response —
960 // the source has already returned 200 to the client by the time
961 // the spawn body runs. A failed `acquire_owned` only happens
962 // when the semaphore is closed (we never close it, so the
963 // logged-and-skipped fallback is unreachable in practice).
964 let semaphore = Arc::clone(&self.replication_semaphore);
965 tokio::spawn(async move {
966 let _permit = match semaphore.acquire_owned().await {
967 Ok(p) => p,
968 Err(e) => {
969 tracing::warn!(
970 bucket = %source_bucket_cl,
971 key = %source_key_cl,
972 "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
973 );
974 return;
975 }
976 };
977 let do_put = move |dest_bucket: String,
978 dest_key: String,
979 dest_body: bytes::Bytes,
980 dest_meta: Option<std::collections::HashMap<String, String>>| {
981 let backend = Arc::clone(&backend);
982 let dest_lock_mgr = dest_lock_mgr.clone();
983 let lock_state = source_lock_state_for_closure.clone();
984 let warn_src = source_bucket_for_warn.clone();
985 async move {
986 let req = S3Request {
987 input: PutObjectInput {
988 bucket: dest_bucket.clone(),
989 key: dest_key.clone(),
990 body: Some(bytes_to_blob(dest_body)),
991 metadata: dest_meta,
992 ..Default::default()
993 },
994 method: http::Method::PUT,
995 uri: "/".parse().unwrap(),
996 headers: http::HeaderMap::new(),
997 extensions: http::Extensions::new(),
998 credentials: None,
999 region: None,
1000 service: None,
1001 trailing_headers: None,
1002 };
1003 let put_result = backend
1004 .put_object(req)
1005 .await
1006 .map(|_| ())
1007 .map_err(|e| format!("destination put_object: {e}"));
1008 // v0.8.3 #68: on successful destination PUT,
1009 // persist the propagated lock state into the
1010 // destination's ObjectLockManager so a subsequent
1011 // DELETE on the destination is refused. Three cases:
1012 // - PUT failed → skip (no replica to protect)
1013 // - lock_state None → nothing to propagate
1014 // - dest manager None (operator misconfig)
1015 // → log warn-once + bump skip metric
1016 if put_result.is_ok()
1017 && let Some(state) = lock_state
1018 {
1019 match dest_lock_mgr {
1020 Some(ref mgr) => {
1021 mgr.set(&dest_bucket, &dest_key, state);
1022 }
1023 None => {
1024 crate::replication::warn_lock_propagation_skipped(
1025 &warn_src,
1026 &dest_bucket,
1027 );
1028 }
1029 }
1030 }
1031 put_result
1032 }
1033 };
1034 // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
1035 // `futures::FutureExt::catch_unwind` so a panic inside
1036 // `replicate_object` (or any of the user-supplied closures
1037 // it drives — `do_put`, the destination backend, the lock
1038 // manager) does NOT bubble out of the detached task as a
1039 // `JoinError` that no operator dashboard scrapes. Caught
1040 // panics bump `s4_dispatcher_panics_total{kind="replication"}`
1041 // + log at ERROR with the panic payload, so silent feature
1042 // degradation (= every replication PUT panicking and
1043 // dropping the replica without any visible signal) becomes
1044 // a first-class metric the operator can alert on.
1045 //
1046 // `AssertUnwindSafe` is required because the inner future
1047 // captures `Arc<...>` clones + a `do_put` closure that are
1048 // not `UnwindSafe` by default; the safety contract here is
1049 // "we don't continue using any of those captures after the
1050 // panic" which trivially holds (we drop them and return).
1051 use futures::FutureExt as _;
1052 let dispatcher_kind = "replication";
1053 let fut = crate::replication::replicate_object(
1054 rule,
1055 source_bucket_cl,
1056 source_key_cl,
1057 body_cl,
1058 metadata_cl,
1059 do_put,
1060 mgr_cl,
1061 generation,
1062 destination_key_override,
1063 source_lock_state,
1064 );
1065 if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
1066 let panic_msg = panic
1067 .downcast_ref::<&'static str>()
1068 .copied()
1069 .map(str::to_owned)
1070 .or_else(|| panic.downcast_ref::<String>().cloned())
1071 .unwrap_or_else(|| "(non-string panic payload)".to_owned());
1072 tracing::error!(
1073 kind = dispatcher_kind,
1074 panic_payload = %panic_msg,
1075 "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
1076 );
1077 crate::metrics::record_dispatcher_panic(dispatcher_kind);
1078 }
1079 });
1080 }
1081
1082 /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1083 /// Once set, every DELETE / DELETE-version / delete-marker /
1084 /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1085 /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1086 /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1087 /// where MFA-Delete is `Disabled` (S3 default).
1088 #[must_use]
1089 pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1090 self.mfa_delete = Some(mgr);
1091 self
1092 }
1093
1094 /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1095 /// introspection — used by the snapshot path in `main.rs` to call
1096 /// `to_json` for restart-recoverable state).
1097 #[must_use]
1098 pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1099 self.mfa_delete.as_ref()
1100 }
1101
1102 /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1103 /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1104 /// route through the manager instead of forwarding to the backend,
1105 /// and [`Self::handle_preflight`] becomes useful for the (future)
1106 /// listener-side OPTIONS interceptor.
1107 #[must_use]
1108 pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1109 self.cors = Some(mgr);
1110 self
1111 }
1112
1113 /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1114 #[must_use]
1115 pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1116 self.cors.as_ref()
1117 }
1118
1119 /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1120 /// configured rules and, if a rule matches, return the headers that
1121 /// the (future) listener-side OPTIONS interceptor must put on the
1122 /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1123 /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1124 /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1125 ///
1126 /// Returns `None` when no manager is attached, no config is
1127 /// registered for the bucket, or no rule matches the (origin,
1128 /// method, headers) triple. The caller is responsible for turning
1129 /// `None` into the appropriate 403 response.
1130 ///
1131 /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1132 /// into the hyper-util listener path) is a follow-up — s3s does not
1133 /// surface OPTIONS as a typed S3 handler, so this method is
1134 /// currently call-able only from inside other handlers and tests.
1135 #[must_use]
1136 pub fn handle_preflight(
1137 &self,
1138 bucket: &str,
1139 origin: &str,
1140 method: &str,
1141 request_headers: &[String],
1142 ) -> Option<std::collections::HashMap<String, String>> {
1143 let mgr = self.cors.as_ref()?;
1144 let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1145 let mut h = std::collections::HashMap::new();
1146 // Echo the matched origin back. If the rule used "*" we still
1147 // echo "*" (S3 spec — the spec does not require us to echo the
1148 // *requesting* origin when the wildcard matched).
1149 let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1150 "*".to_string()
1151 } else {
1152 origin.to_string()
1153 };
1154 h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1155 h.insert(
1156 "Access-Control-Allow-Methods".to_string(),
1157 rule.allowed_methods.join(", "),
1158 );
1159 if !rule.allowed_headers.is_empty() {
1160 // For the Allow-Headers response, echo back the rule's
1161 // pattern list verbatim (S3 echoes the configured list,
1162 // including "*" if present). Browsers honour exact-match
1163 // rules.
1164 h.insert(
1165 "Access-Control-Allow-Headers".to_string(),
1166 rule.allowed_headers.join(", "),
1167 );
1168 }
1169 if let Some(secs) = rule.max_age_seconds {
1170 h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1171 }
1172 if !rule.expose_headers.is_empty() {
1173 h.insert(
1174 "Access-Control-Expose-Headers".to_string(),
1175 rule.expose_headers.join(", "),
1176 );
1177 }
1178 Some(h)
1179 }
1180
1181 /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1182 /// SSE indicator (server-side encryption header or SSE-C customer
1183 /// key); requests without one are rejected with 400 InvalidRequest.
1184 /// Boot-time prerequisite checking lives in the binary
1185 /// (`validate_compliance_mode`) so this flag is purely the runtime
1186 /// switch.
1187 #[must_use]
1188 pub fn with_compliance_strict(mut self, on: bool) -> Self {
1189 self.compliance_strict = on;
1190 self
1191 }
1192
1193 /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1194 /// manager. Once set, `delete_object` and overwrite-path
1195 /// `put_object` refuse operations on locked keys with HTTP 403
1196 /// `AccessDenied`; new PUTs to a bucket with a default retention
1197 /// policy auto-create per-object lock state.
1198 #[must_use]
1199 pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1200 self.object_lock = Some(mgr);
1201 self
1202 }
1203
1204 /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1205 /// the lifecycle scanner uses this to skip currently-locked objects
1206 /// before issuing `delete_object`, since an Object Lock always wins
1207 /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1208 /// shape of [`Self::lifecycle_manager`] /
1209 /// [`Self::tag_manager`] — purely additive accessor, no handler
1210 /// behaviour change.
1211 #[must_use]
1212 pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1213 self.object_lock.as_ref()
1214 }
1215
1216 /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1217 /// when a PUT requests SSE-KMS without naming a specific KMS key
1218 /// (operators set this to mirror AWS S3's bucket-default key).
1219 #[must_use]
1220 pub fn with_kms_backend(
1221 mut self,
1222 kms: Arc<dyn crate::kms::KmsBackend>,
1223 default_key_id: Option<String>,
1224 ) -> Self {
1225 self.kms = Some(kms);
1226 self.kms_default_key_id = default_key_id;
1227 self
1228 }
1229
1230 /// v0.5 #34: attach the first-class versioning state machine. Once
1231 /// set, this `S4Service` owns the per-bucket versioning state +
1232 /// per-(bucket, key) version chain; `put_object` / `get_object` /
1233 /// `delete_object` / `list_object_versions` /
1234 /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1235 /// manager instead of passing through to the backend. The backend
1236 /// is still used as the byte store: Suspended / Unversioned buckets
1237 /// keep using `<key>` directly (legacy), Enabled buckets redirect
1238 /// each version's bytes to a shadow key
1239 /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1240 /// PUTs to the same logical key.
1241 #[must_use]
1242 pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1243 self.versioning = Some(mgr);
1244 self
1245 }
1246
1247 /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1248 /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1249 /// in-memory state to the operator's `--versioning-state-file`
1250 /// without restarting the gateway. Mirrors the shape of
1251 /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1252 /// purely additive accessor, no handler behaviour change.
1253 #[must_use]
1254 pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1255 self.versioning.as_ref()
1256 }
1257
1258 /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1259 /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1260 /// CLI flag in `main.rs`. Operators running heavy cross-region
1261 /// fan-out may need to raise this; operators on memory-constrained
1262 /// hosts may need to lower it. The new value replaces the existing
1263 /// `Semaphore` (so calling this after dispatchers are already in
1264 /// flight is fine — the in-flight tasks hold permits from the old
1265 /// semaphore which is dropped when its last permit is released).
1266 /// A `max` of 0 would deadlock all replicas; the value is silently
1267 /// clamped to 1 instead.
1268 #[must_use]
1269 pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1270 let max = max.max(1);
1271 self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1272 self
1273 }
1274
1275 /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1276 /// concurrency permit pool. Tests inspect `available_permits()`
1277 /// after invoking `spawn_replication_if_matched` to verify the
1278 /// dispatcher actually `acquire_owned`s before kicking off the
1279 /// destination PUT.
1280 #[must_use]
1281 pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1282 &self.replication_semaphore
1283 }
1284
1285 /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1286 /// Internally wraps it in a 1-slot keyring with id=1 active, so
1287 /// new objects ride the v0.5 S4E2 frame while previously-written
1288 /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1289 /// fallback path. Operators wanting true rotation should call
1290 /// [`Self::with_sse_keyring`] instead.
1291 #[must_use]
1292 pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1293 let keyring = crate::sse::SseKeyring::new(1, key);
1294 self.sse_keyring = Some(std::sync::Arc::new(keyring));
1295 self
1296 }
1297
1298 /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1299 /// the active key (S4E2 frame stamped with that key's id); GET
1300 /// dispatches on the body's magic — S4E1 falls back to trying every
1301 /// key in the ring (active first) so v0.4 objects survive a
1302 /// migration; S4E2 looks up the explicit key_id from the header.
1303 #[must_use]
1304 pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1305 self.sse_keyring = Some(keyring);
1306 self
1307 }
1308
1309 /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1310 /// (so the matching GET can stream-decrypt chunk-by-chunk
1311 /// instead of buffering the entire body before tag verify).
1312 /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1313 /// disables the path and reverts to the legacy S4E2 buffered
1314 /// frame.
1315 ///
1316 /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1317 /// the chunked envelopes for those flows are a follow-up issue
1318 /// (the customer-key wire surface needs separate version
1319 /// negotiation).
1320 ///
1321 /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1322 /// not also set — the chunked path runs only on the SSE-S4
1323 /// branch of `put_object`.
1324 #[must_use]
1325 pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1326 self.sse_chunk_size = bytes;
1327 self
1328 }
1329
1330 /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1331 /// PUT / GET / DELETE / List handler emits one entry into the
1332 /// emitter's buffer; a background flusher (started separately, see
1333 /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1334 /// rotated `.log` files into the configured directory.
1335 #[must_use]
1336 pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1337 self.access_log = Some(log);
1338 self
1339 }
1340
1341 /// Capture the per-request access-log preamble before the request is
1342 /// consumed by the backend call. Returns `None` if no access logger
1343 /// is configured (cheap early-out so the handler doesn't pay the
1344 /// header-clone cost when access logging is off).
1345 fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1346 self.access_log.as_ref()?;
1347 Some(AccessLogPreamble {
1348 // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1349 // Recording a client-controllable header in the access log
1350 // would poison forensic queries; leave it `None` until the
1351 // operator declares X-Forwarded-For is set by a trusted
1352 // proxy.
1353 remote_ip: if self.trust_x_forwarded_for {
1354 req.headers
1355 .get("x-forwarded-for")
1356 .and_then(|v| v.to_str().ok())
1357 .and_then(|raw| raw.split(',').next())
1358 .map(|s| s.trim().to_owned())
1359 } else {
1360 None
1361 },
1362 requester: Self::principal_of(req).map(str::to_owned),
1363 request_uri: format!("{} {}", req.method, req.uri.path()),
1364 user_agent: req
1365 .headers
1366 .get("user-agent")
1367 .and_then(|v| v.to_str().ok())
1368 .map(str::to_owned),
1369 })
1370 }
1371
1372 /// Internal — called by handlers at end-of-request with a captured
1373 /// preamble. Best-effort: swallows the await fast (clones Arc +
1374 /// pushes), no error propagation back to the request path.
1375 #[allow(clippy::too_many_arguments)]
1376 async fn record_access(
1377 &self,
1378 preamble: Option<AccessLogPreamble>,
1379 operation: &'static str,
1380 bucket: &str,
1381 key: Option<&str>,
1382 http_status: u16,
1383 bytes_sent: u64,
1384 object_size: u64,
1385 total_time_ms: u64,
1386 error_code: Option<&str>,
1387 ) {
1388 let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1389 return;
1390 };
1391 log.record(crate::access_log::AccessLogEntry {
1392 time: std::time::SystemTime::now(),
1393 bucket: bucket.to_owned(),
1394 remote_ip: p.remote_ip,
1395 requester: p.requester,
1396 operation,
1397 key: key.map(str::to_owned),
1398 request_uri: p.request_uri,
1399 http_status,
1400 error_code: error_code.map(str::to_owned),
1401 bytes_sent,
1402 object_size,
1403 total_time_ms,
1404 user_agent: p.user_agent,
1405 })
1406 .await;
1407 }
1408
1409 /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1410 /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1411 /// throttle-checked before the policy gate; throttled requests return
1412 /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1413 /// `s4_rate_limit_throttled_total{principal,bucket}`.
1414 #[must_use]
1415 pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1416 self.rate_limits = Some(rl);
1417 self
1418 }
1419
1420 /// Helper used by request handlers to apply the rate limit. Returns
1421 /// `Ok(())` when allowed (or no rate limiter is configured), or a
1422 /// `SlowDown` S3Error otherwise.
1423 fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1424 let Some(rl) = self.rate_limits.as_ref() else {
1425 return Ok(());
1426 };
1427 let principal_id = Self::principal_of(req);
1428 if !rl.check(principal_id, bucket) {
1429 crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1430 return Err(S3Error::with_message(
1431 S3ErrorCode::SlowDown,
1432 format!("rate-limited: bucket={bucket}"),
1433 ));
1434 }
1435 Ok(())
1436 }
1437
1438 /// Tell the policy evaluator that the listener is reached over TLS
1439 /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1440 /// resolves to `true`. Defaults to `false`.
1441 #[must_use]
1442 pub fn with_secure_transport(mut self, on: bool) -> Self {
1443 self.secure_transport = on;
1444 self
1445 }
1446
1447 #[must_use]
1448 pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1449 self.max_body_bytes = n;
1450 self
1451 }
1452
1453 /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1454 /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1455 /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1456 /// When `None` (the default), no policy enforcement happens.
1457 #[must_use]
1458 pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1459 self.policy = Some(policy);
1460 self
1461 }
1462
1463 /// Pull the SigV4 access key id off the request's credentials, if any.
1464 /// Used as the `principal_id` for policy evaluation.
1465 fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1466 req.credentials.as_ref().map(|c| c.access_key.as_str())
1467 }
1468
1469 /// v0.3 #13: build the per-request policy context from the incoming
1470 /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1471 /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1472 /// production deployments are behind an LB / reverse proxy that sets
1473 /// this), `aws:CurrentTime` from the system clock, and
1474 /// `aws:SecureTransport` from the per-listener TLS flag.
1475 fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1476 let user_agent = req
1477 .headers
1478 .get("user-agent")
1479 .and_then(|v| v.to_str().ok())
1480 .map(str::to_owned);
1481 // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1482 // header. Trusting it unconditionally lets any public-internet
1483 // request claim it came from a trusted CIDR (e.g.
1484 // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1485 // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1486 // We now only consume the header when the operator has
1487 // declared "this gateway sits behind a trusted reverse proxy
1488 // that scrubs client-supplied values" via
1489 // `with_trust_x_forwarded_for(true)` /
1490 // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1491 // `None`, which fails closed for IP-allowlist Allow rules
1492 // and fails open for IP-blocklist Deny rules — operators
1493 // who need either case behind a public listener must opt in
1494 // or move the gate to the reverse proxy. The leftmost
1495 // comma-separated token is the originator per the
1496 // `X-Forwarded-For: client, proxy1, proxy2` convention.
1497 let source_ip = if self.trust_x_forwarded_for {
1498 req.headers
1499 .get("x-forwarded-for")
1500 .and_then(|v| v.to_str().ok())
1501 .and_then(|raw| raw.split(',').next())
1502 .and_then(|s| s.trim().parse().ok())
1503 } else {
1504 None
1505 };
1506 crate::policy::RequestContext {
1507 source_ip,
1508 user_agent,
1509 request_time: Some(std::time::SystemTime::now()),
1510 secure_transport: self.secure_transport,
1511 existing_object_tags: None,
1512 request_object_tags: None,
1513 extra: Default::default(),
1514 }
1515 }
1516
1517 /// Helper used by request handlers to enforce the optional policy.
1518 /// Returns `Ok(())` when allowed (or no policy is configured), or an
1519 /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1520 /// counter on deny.
1521 fn enforce_policy<I>(
1522 &self,
1523 req: &S3Request<I>,
1524 action: &'static str,
1525 bucket: &str,
1526 key: Option<&str>,
1527 ) -> S3Result<()> {
1528 self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1529 }
1530
1531 /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1532 /// caller plumb tag context (existing-on-object + on-request) into
1533 /// the policy evaluator. Both arguments default to `None`, in
1534 /// which case the resulting `RequestContext` is identical to
1535 /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1536 /// with tags this is a transparent no-op.
1537 fn enforce_policy_with_extra<I>(
1538 &self,
1539 req: &S3Request<I>,
1540 action: &'static str,
1541 bucket: &str,
1542 key: Option<&str>,
1543 request_tags: Option<&crate::tagging::TagSet>,
1544 existing_tags: Option<&crate::tagging::TagSet>,
1545 ) -> S3Result<()> {
1546 let Some(policy) = self.policy.as_ref() else {
1547 return Ok(());
1548 };
1549 let principal_id = Self::principal_of(req);
1550 let mut ctx = self.request_context(req);
1551 if let Some(t) = request_tags {
1552 ctx.request_object_tags = Some(t.clone());
1553 }
1554 if let Some(t) = existing_tags {
1555 ctx.existing_object_tags = Some(t.clone());
1556 }
1557 let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1558 if decision.allow {
1559 Ok(())
1560 } else {
1561 crate::metrics::record_policy_denial(action, bucket);
1562 tracing::info!(
1563 action,
1564 bucket,
1565 key = ?key,
1566 principal = ?principal_id,
1567 source_ip = ?ctx.source_ip,
1568 user_agent = ?ctx.user_agent,
1569 secure_transport = ctx.secure_transport,
1570 matched_sid = ?decision.matched_sid,
1571 effect = ?decision.matched_effect,
1572 "S4 policy denied request"
1573 );
1574 Err(S3Error::with_message(
1575 S3ErrorCode::AccessDenied,
1576 format!("denied by S4 policy: {action} on bucket={bucket}"),
1577 ))
1578 }
1579 }
1580
1581 /// テスト用: backend を取り戻す (test helper、production では使わない).
1582 /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1583 /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1584 /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1585 /// (test 用途専用 helper の caller 契約を維持)。
1586 pub fn into_backend(self) -> B {
1587 Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1588 panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1589 })
1590 }
1591
1592 /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1593 /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1594 async fn partial_range_get(
1595 &self,
1596 req: &S3Request<GetObjectInput>,
1597 plan: s4_codec::index::RangePlan,
1598 client_start: u64,
1599 client_end_exclusive: u64,
1600 total_original: u64,
1601 get_start: Instant,
1602 ) -> S3Result<S3Response<GetObjectOutput>> {
1603 // 必要 byte 範囲だけを backend に partial GET
1604 let backend_range = s3s::dto::Range::Int {
1605 first: plan.byte_start,
1606 last: Some(plan.byte_end_exclusive - 1),
1607 };
1608 let backend_input = GetObjectInput {
1609 bucket: req.input.bucket.clone(),
1610 key: req.input.key.clone(),
1611 range: Some(backend_range),
1612 ..Default::default()
1613 };
1614 let backend_req = S3Request {
1615 input: backend_input,
1616 method: req.method.clone(),
1617 uri: req.uri.clone(),
1618 headers: req.headers.clone(),
1619 extensions: http::Extensions::new(),
1620 credentials: req.credentials.clone(),
1621 region: req.region.clone(),
1622 service: req.service.clone(),
1623 trailing_headers: None,
1624 };
1625 let mut backend_resp = self.backend.get_object(backend_req).await?;
1626 let blob = backend_resp.output.body.take().ok_or_else(|| {
1627 S3Error::with_message(
1628 S3ErrorCode::InternalError,
1629 "backend partial GET returned empty body",
1630 )
1631 })?;
1632 let bytes = collect_blob(blob, self.max_body_bytes)
1633 .await
1634 .map_err(internal("collect partial body"))?;
1635
1636 // frame parse + decompress
1637 let mut combined = BytesMut::new();
1638 for frame in FrameIter::new(bytes) {
1639 let (header, payload) = frame.map_err(|e| {
1640 S3Error::with_message(
1641 S3ErrorCode::InternalError,
1642 format!("partial-range frame parse: {e}"),
1643 )
1644 })?;
1645 let chunk_manifest = ChunkManifest {
1646 codec: header.codec,
1647 original_size: header.original_size,
1648 compressed_size: header.compressed_size,
1649 crc32c: header.crc32c,
1650 };
1651 let decompressed = self
1652 .registry
1653 .decompress(payload, &chunk_manifest)
1654 .await
1655 .map_err(internal("partial-range decompress"))?;
1656 combined.extend_from_slice(&decompressed);
1657 }
1658 let combined = combined.freeze();
1659 let sliced = combined
1660 .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1661
1662 // response 組立て
1663 let returned_size = sliced.len() as u64;
1664 backend_resp.output.content_length = Some(returned_size as i64);
1665 backend_resp.output.content_range = Some(format!(
1666 "bytes {client_start}-{}/{total_original}",
1667 client_end_exclusive - 1
1668 ));
1669 backend_resp.output.checksum_crc32 = None;
1670 backend_resp.output.checksum_crc32c = None;
1671 backend_resp.output.checksum_crc64nvme = None;
1672 backend_resp.output.checksum_sha1 = None;
1673 backend_resp.output.checksum_sha256 = None;
1674 backend_resp.output.e_tag = None;
1675 backend_resp.output.body = Some(bytes_to_blob(sliced));
1676 backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1677
1678 let elapsed = get_start.elapsed();
1679 crate::metrics::record_get(
1680 "partial",
1681 plan.byte_end_exclusive - plan.byte_start,
1682 returned_size,
1683 elapsed.as_secs_f64(),
1684 true,
1685 );
1686 info!(
1687 op = "get_object",
1688 bucket = %req.input.bucket,
1689 key = %req.input.key,
1690 bytes_in = plan.byte_end_exclusive - plan.byte_start,
1691 bytes_out = returned_size,
1692 total_object_size = total_original,
1693 range = true,
1694 path = "sidecar-partial",
1695 latency_ms = elapsed.as_millis() as u64,
1696 "S4 partial Range GET via sidecar index"
1697 );
1698 Ok(backend_resp)
1699 }
1700
1701 /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
1702 /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
1703 /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
1704 async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
1705 let bytes = encode_index(index);
1706 let len = bytes.len() as i64;
1707 let sidecar = sidecar_key(key);
1708 // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
1709 // the (already legally-arbitrary) S3 key produces something we
1710 // cannot encode at all, drop the sidecar PUT (the GET path
1711 // falls back to a full read on a missing sidecar) instead of
1712 // panicking on `parse().unwrap()`.
1713 let uri = match safe_object_uri(bucket, &sidecar) {
1714 Ok(u) => u,
1715 Err(e) => {
1716 tracing::warn!(
1717 bucket,
1718 key,
1719 "S4 write_sidecar skipped (key not URI-encodable): {e}"
1720 );
1721 return;
1722 }
1723 };
1724 let put_input = PutObjectInput {
1725 bucket: bucket.into(),
1726 key: sidecar,
1727 body: Some(bytes_to_blob(bytes)),
1728 content_length: Some(len),
1729 content_type: Some("application/x-s4-index".into()),
1730 ..Default::default()
1731 };
1732 let put_req = S3Request {
1733 input: put_input,
1734 method: http::Method::PUT,
1735 uri,
1736 headers: http::HeaderMap::new(),
1737 extensions: http::Extensions::new(),
1738 credentials: None,
1739 region: None,
1740 service: None,
1741 trailing_headers: None,
1742 };
1743 if let Err(e) = self.backend.put_object(put_req).await {
1744 tracing::warn!(
1745 bucket,
1746 key,
1747 "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
1748 );
1749 }
1750 }
1751
1752 /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
1753 /// describes the current backend object before we trust its frame
1754 /// offsets for a partial Range GET. The sidecar carries the source
1755 /// `etag` and `compressed_size` that were observed at PUT time; we
1756 /// HEAD the backend object and compare.
1757 ///
1758 /// Decision matrix:
1759 /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
1760 /// that wasn't stamped) → return `true` (best-effort, preserves
1761 /// pre-v0.8.4 behaviour for existing on-disk sidecars).
1762 /// - HEAD fails → return `false` (we can't tell either way; full GET
1763 /// path will surface the real backend error to the client).
1764 /// - HEAD ETag matches → `true`.
1765 /// - HEAD ETag differs OR HEAD size differs from
1766 /// `source_compressed_size` → `false` (sidecar stale or attacker-
1767 /// written; fall back to full GET).
1768 async fn sidecar_version_binding_ok(
1769 &self,
1770 bucket: &str,
1771 key: &str,
1772 index: &FrameIndex,
1773 ) -> bool {
1774 let Some(ref expected_etag) = index.source_etag else {
1775 // Legacy sidecar without the v0.8.4 #73 H-2 binding —
1776 // back-compat: trust it (the partial fetch is the same
1777 // best-effort path that v0.8.3 and earlier shipped).
1778 return true;
1779 };
1780 let head_input = HeadObjectInput {
1781 bucket: bucket.into(),
1782 key: key.into(),
1783 ..Default::default()
1784 };
1785 let uri = match safe_object_uri(bucket, key) {
1786 Ok(u) => u,
1787 Err(_) => return false,
1788 };
1789 let head_req = S3Request {
1790 input: head_input,
1791 method: http::Method::HEAD,
1792 uri,
1793 headers: http::HeaderMap::new(),
1794 extensions: http::Extensions::new(),
1795 credentials: None,
1796 region: None,
1797 service: None,
1798 trailing_headers: None,
1799 };
1800 let head = match self.backend.head_object(head_req).await {
1801 Ok(r) => r.output,
1802 Err(e) => {
1803 tracing::debug!(
1804 bucket,
1805 key,
1806 "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
1807 );
1808 return false;
1809 }
1810 };
1811 // ETag is a strong-vs-weak enum; we compare on the unwrapped string
1812 // form (matches what the PUT path stamped — see below).
1813 let live_etag = head.e_tag.as_ref().map(|t| t.value());
1814 if live_etag != Some(expected_etag.as_str()) {
1815 tracing::debug!(
1816 bucket,
1817 key,
1818 "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
1819 expected_etag,
1820 live_etag,
1821 );
1822 return false;
1823 }
1824 if let Some(expected_size) = index.source_compressed_size
1825 && let Some(live_size) = head.content_length
1826 && live_size as u64 != expected_size
1827 {
1828 tracing::debug!(
1829 bucket,
1830 key,
1831 "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
1832 expected_size,
1833 live_size,
1834 );
1835 return false;
1836 }
1837 true
1838 }
1839
1840 /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
1841 async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
1842 let sidecar = sidecar_key(key);
1843 // v0.7 #49: same encode-or-bail treatment as write_sidecar.
1844 let uri = safe_object_uri(bucket, &sidecar).ok()?;
1845 let get_input = GetObjectInput {
1846 bucket: bucket.into(),
1847 key: sidecar,
1848 ..Default::default()
1849 };
1850 let get_req = S3Request {
1851 input: get_input,
1852 method: http::Method::GET,
1853 uri,
1854 headers: http::HeaderMap::new(),
1855 extensions: http::Extensions::new(),
1856 credentials: None,
1857 region: None,
1858 service: None,
1859 trailing_headers: None,
1860 };
1861 let resp = self.backend.get_object(get_req).await.ok()?;
1862 let blob = resp.output.body?;
1863 let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
1864 decode_index(bytes).ok()
1865 }
1866
1867 /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
1868 ///
1869 /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
1870 /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
1871 /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
1872 async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
1873 let mut out = BytesMut::new();
1874 // v0.8.15 H-h: cap the *aggregate* decoded output. Each
1875 // individual frame is already bounded by
1876 // `validate_decompress_manifest` (default 5 GiB per frame),
1877 // but a forged multi-frame body can declare many frames
1878 // each near the limit — without an object-level ceiling, a
1879 // single GET could pin tens of GiB of plaintext in
1880 // `BytesMut::extend_from_slice`. Use the gateway's
1881 // `max_body_bytes` (same cap that bounds PUT bodies) so a
1882 // GET can never produce more plaintext than a PUT can ever
1883 // legitimately have stored.
1884 let aggregate_cap = self.max_body_bytes;
1885 let mut produced: usize = 0;
1886 for frame in FrameIter::new(bytes) {
1887 let (header, payload) = frame.map_err(|e| {
1888 S3Error::with_message(
1889 S3ErrorCode::InternalError,
1890 format!("multipart frame parse: {e}"),
1891 )
1892 })?;
1893 let chunk_manifest = ChunkManifest {
1894 codec: header.codec,
1895 original_size: header.original_size,
1896 compressed_size: header.compressed_size,
1897 crc32c: header.crc32c,
1898 };
1899 // v0.8.15 H-h: pre-flight check on the declared
1900 // `original_size` so a forged manifest claiming a frame
1901 // that would push us past the cap is rejected before we
1902 // start decoding. Defence-in-depth alongside the
1903 // post-decode `produced` check below.
1904 if (produced as u64).saturating_add(header.original_size) > aggregate_cap as u64 {
1905 return Err(S3Error::with_message(
1906 S3ErrorCode::InternalError,
1907 format!(
1908 "multipart aggregate output exceeds cap: would reach \
1909 {produced_total} bytes after this frame, cap is {aggregate_cap}",
1910 produced_total = (produced as u64).saturating_add(header.original_size),
1911 ),
1912 ));
1913 }
1914 let decompressed = self
1915 .registry
1916 .decompress(payload, &chunk_manifest)
1917 .await
1918 .map_err(internal("multipart frame decompress"))?;
1919 produced = produced.saturating_add(decompressed.len());
1920 if produced > aggregate_cap {
1921 return Err(S3Error::with_message(
1922 S3ErrorCode::InternalError,
1923 format!(
1924 "multipart aggregate output exceeded cap: {produced} bytes \
1925 emitted, cap is {aggregate_cap}"
1926 ),
1927 ));
1928 }
1929 out.extend_from_slice(&decompressed);
1930 }
1931 Ok(out.freeze())
1932 }
1933}
1934
1935/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
1936/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
1937/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
1938/// reject the other variants for parity with AWS.
1939fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
1940 let rest = s
1941 .strip_prefix("bytes=")
1942 .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
1943 let (a, b) = rest
1944 .split_once('-')
1945 .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
1946 let first: u64 = a
1947 .parse()
1948 .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
1949 let last: u64 = b
1950 .parse()
1951 .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
1952 if last < first {
1953 return Err(format!("CopySourceRange last < first: {s:?}"));
1954 }
1955 Ok(s3s::dto::Range::Int {
1956 first,
1957 last: Some(last),
1958 })
1959}
1960
/// v0.5 #34: build the backend storage key for a (logical key, version-id)
/// pair on a bucket with versioning Enabled.
///
/// The result is `<key>.__s4ver__/<version_id>`. The `__s4ver__/` infix was
/// chosen because:
/// - it is not a substring of natural `.s4index` / `.s4ver` keys, so listing
///   filters cannot collide with them;
/// - the directory-style separator keeps "browse by prefix" in the S3
///   console usable (versions roll up under one virtual folder per object);
/// - it stays human-readable in debug logs and `aws s3 ls`.
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST filter
/// out keys containing `.__s4ver__/` so customers never see these internal
/// shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    [key, ".__s4ver__/", version_id].concat()
}
1977
/// Report whether `key` contains the marker written by
/// [`versioned_shadow_key`]. A cheap substring scan, shared by the
/// list_objects filter and the GET passthrough check.
fn is_versioning_shadow_key(key: &str) -> bool {
    const SHADOW_MARKER: &str = ".__s4ver__/";
    key.contains(SHADOW_MARKER)
}
1983
/// v0.6 #42: wall-clock seconds since the UNIX epoch.
///
/// Passed to `mfa::check_mfa` so the TOTP verifier sees the same "now" as
/// the client's authenticator app. If the clock reads earlier than 1970
/// (impossible in practice) the result is `0`, so the verifier rejects the
/// code instead of this function panicking.
fn current_unix_secs() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
1995
1996/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
1997///
1998/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
1999/// (S3 spec for MFA Delete: every gating failure surfaces as
2000/// `AccessDenied`, not a separate `MFA*` code).
2001/// - `Malformed` → `400 InvalidRequest` (the request itself is
2002/// syntactically broken, not a permission issue).
2003fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
2004 match e {
2005 crate::mfa::MfaError::Missing => S3Error::with_message(
2006 S3ErrorCode::AccessDenied,
2007 "MFA token required for this operation",
2008 ),
2009 crate::mfa::MfaError::Malformed => {
2010 S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
2011 }
2012 crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
2013 S3ErrorCode::AccessDenied,
2014 "MFA serial does not match configured device",
2015 ),
2016 crate::mfa::MfaError::InvalidCode => {
2017 S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
2018 }
2019 }
2020}
2021
2022fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
2023 metadata
2024 .as_ref()
2025 .and_then(|m| m.get(META_MULTIPART))
2026 .map(|v| v == "true")
2027 .unwrap_or(false)
2028}
2029
/// Metadata key holding the name of the codec recorded in the manifest.
const META_CODEC: &str = "s4-codec";
/// Metadata key holding the manifest's original (uncompressed) size.
const META_ORIGINAL_SIZE: &str = "s4-original-size";
/// Metadata key holding the manifest's compressed size.
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
/// Metadata key holding the manifest's CRC32C value.
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object written by a multipart upload in the per-part frame
/// format. On GET this flag is what starts the frame parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks a single-PUT object that is nevertheless stored in the
/// S4F2 framed format. Older v0.1 single-PUT objects are raw compressed
/// bytes and lack this flag. On GET the flag routes the object to the framed
/// path (the same `FrameIter` parse used for multipart objects).
const META_FRAMED: &str = "s4-framed";
2041
2042fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2043 metadata
2044 .as_ref()
2045 .and_then(|m| m.get(META_FRAMED))
2046 .map(|v| v == "true")
2047 .unwrap_or(false)
2048}
2049
2050/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2051fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2052 metadata
2053 .as_ref()
2054 .and_then(|m| m.get("s4-encrypted"))
2055 .map(|v| v == "aes-256-gcm")
2056 .unwrap_or(false)
2057}
2058
2059/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2060/// contract is "all three or none" — partial sets are a 400.
2061///
2062/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2063/// no encryption), `Ok(Some(material))` on validated client key, and
2064/// `Err` for malformed or partial inputs.
2065fn extract_sse_c_material(
2066 algorithm: &Option<String>,
2067 key: &Option<String>,
2068 md5: &Option<String>,
2069) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2070 match (algorithm, key, md5) {
2071 (None, None, None) => Ok(None),
2072 (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2073 .map(Some)
2074 .map_err(sse_c_error_to_s3),
2075 _ => Err(S3Error::with_message(
2076 S3ErrorCode::InvalidRequest,
2077 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2078 )),
2079 }
2080}
2081
2082/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2083/// Returns the key-id to wrap under, falling back to the gateway default.
2084fn extract_kms_key_id(
2085 sse: &Option<ServerSideEncryption>,
2086 sse_kms_key_id: &Option<String>,
2087 gateway_default: Option<&str>,
2088) -> Option<String> {
2089 let asks_for_kms = sse
2090 .as_ref()
2091 .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2092 .unwrap_or(false);
2093 if !asks_for_kms {
2094 return None;
2095 }
2096 sse_kms_key_id
2097 .clone()
2098 .or_else(|| gateway_default.map(str::to_owned))
2099}
2100
2101/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2102/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2103/// transient KMS outage (503). Other variants are 500 InternalError.
2104fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2105 use crate::kms::KmsError as K;
2106 match e {
2107 K::KeyNotFound { key_id } => S3Error::with_message(
2108 S3ErrorCode::InvalidArgument,
2109 format!("KMS key not found: {key_id}"),
2110 ),
2111 K::BackendUnavailable { message } => S3Error::with_message(
2112 S3ErrorCode::ServiceUnavailable,
2113 format!("KMS backend unavailable: {message}"),
2114 ),
2115 other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2116 }
2117}
2118
2119/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2120/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2121/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2122fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2123 use crate::sse::SseError as E;
2124 match e {
2125 E::WrongCustomerKey => S3Error::with_message(
2126 S3ErrorCode::AccessDenied,
2127 "SSE-C key does not match the key used at PUT time",
2128 ),
2129 E::InvalidCustomerKey { reason } => {
2130 S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2131 }
2132 E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2133 S3ErrorCode::InvalidArgument,
2134 format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2135 ),
2136 E::CustomerKeyRequired => S3Error::with_message(
2137 S3ErrorCode::InvalidRequest,
2138 "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2139 ),
2140 E::CustomerKeyUnexpected => S3Error::with_message(
2141 S3ErrorCode::InvalidRequest,
2142 "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2143 ),
2144 other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2145 }
2146}
2147
2148fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2149 let m = metadata.as_ref()?;
2150 let codec = m
2151 .get(META_CODEC)
2152 .and_then(|s| s.parse::<CodecKind>().ok())?;
2153 let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2154 let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2155 let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2156 Some(ChunkManifest {
2157 codec,
2158 original_size,
2159 compressed_size,
2160 crc32c,
2161 })
2162}
2163
2164fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2165 let meta = metadata.get_or_insert_with(Default::default);
2166 meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2167 meta.insert(
2168 META_ORIGINAL_SIZE.into(),
2169 manifest.original_size.to_string(),
2170 );
2171 meta.insert(
2172 META_COMPRESSED_SIZE.into(),
2173 manifest.compressed_size.to_string(),
2174 );
2175 meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2176}
2177
2178fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2179 move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2180}
2181
2182/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2183/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2184/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2185/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2186/// message that includes the original error context.
2187fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2188 use crate::select::SelectError;
2189 match e {
2190 SelectError::Parse(msg) => S3Error::with_message(
2191 S3ErrorCode::InvalidRequest,
2192 format!("SQL parse error: {msg}"),
2193 ),
2194 SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2195 S3ErrorCode::InvalidRequest,
2196 format!("unsupported SQL feature: {msg}"),
2197 ),
2198 SelectError::RowEval(msg) => S3Error::with_message(
2199 S3ErrorCode::InvalidRequest,
2200 format!("SQL row evaluation error: {msg}"),
2201 ),
2202 SelectError::InputFormat(msg) => S3Error::with_message(
2203 S3ErrorCode::InvalidRequest,
2204 format!("{fmt} input format error: {msg}"),
2205 ),
2206 }
2207}
2208
2209/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2210/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2211/// (including missing) is treated as `false`.
2212fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2213 headers
2214 .get("x-amz-bypass-governance-retention")
2215 .and_then(|v| v.to_str().ok())
2216 .map(|s| s.eq_ignore_ascii_case("true"))
2217 .unwrap_or(false)
2218}
2219
2220/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2221/// as an RFC3339 string and re-parsing through `chrono`. The string format
2222/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2223/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2224/// or the value is outside `chrono`'s supported range.
2225fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2226 let mut buf = Vec::new();
2227 ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2228 .ok()?;
2229 let s = std::str::from_utf8(&buf).ok()?;
2230 chrono::DateTime::parse_from_rfc3339(s)
2231 .ok()
2232 .map(|dt| dt.with_timezone(&chrono::Utc))
2233}
2234
2235/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2236/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2237fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2238 // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2239 // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2240 // unexpected happens — we never produce malformed strings, so this
2241 // branch is unreachable in practice.
2242 let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2243 Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2244}
2245
2246/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2247/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2248/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2249/// the field optional but always populates it on response.
2250fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2251 set.iter()
2252 .map(|(k, v)| Tag {
2253 key: Some(k.clone()),
2254 value: Some(v.clone()),
2255 })
2256 .collect()
2257}
2258
2259/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2260/// keys / values become empty strings (mirrors AWS, which rejects
2261/// `<Key/>` with InvalidTag at the parser layer; downstream
2262/// `TagSet::validate` then enforces our size limits).
2263fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2264 let pairs = tags
2265 .iter()
2266 .map(|t| {
2267 (
2268 t.key.clone().unwrap_or_default(),
2269 t.value.clone().unwrap_or_default(),
2270 )
2271 })
2272 .collect();
2273 crate::tagging::TagSet::from_pairs(pairs)
2274}
2275
2276/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2277/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2278/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2279pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2280 if total == 0 {
2281 return Err("cannot range-get zero-length object".into());
2282 }
2283 match range {
2284 s3s::dto::Range::Int { first, last } => {
2285 let start = *first;
2286 let end_inclusive = match last {
2287 Some(l) => (*l).min(total - 1),
2288 None => total - 1,
2289 };
2290 if start > end_inclusive || start >= total {
2291 return Err(format!(
2292 "range bytes={start}-{:?} out of object size {total}",
2293 last
2294 ));
2295 }
2296 Ok((start, end_inclusive + 1))
2297 }
2298 s3s::dto::Range::Suffix { length } => {
2299 let len = (*length).min(total);
2300 Ok((total - len, total))
2301 }
2302 }
2303}
2304
2305#[async_trait::async_trait]
2306impl<B: S3> S3 for S4Service<B> {
2307 // === 圧縮を挟む path (PUT) ===
2308 #[tracing::instrument(
2309 name = "s4.put_object",
2310 skip(self, req),
2311 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2312 )]
2313 async fn put_object(
2314 &self,
2315 mut req: S3Request<PutObjectInput>,
2316 ) -> S3Result<S3Response<PutObjectOutput>> {
2317 let put_start = Instant::now();
2318 let put_bucket = req.input.bucket.clone();
2319 let put_key = req.input.key.clone();
2320 // v0.8.15 M-1: reject user PUTs targeting reserved sidecar
2321 // names (`<key>.s4index`). Without this gate, a user
2322 // uploading `report.s4index` would have their object silently
2323 // hidden from `ListObjectsV2` (the list filter strips the
2324 // `.s4index` suffix) and risk being deleted by the sidecar-
2325 // cleanup path on a sibling DeleteObject. Fail fast with the
2326 // AWS-canonical `InvalidObjectName` code.
2327 if s4_codec::index::is_reserved_sidecar_key(&put_key) {
2328 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
2329 .unwrap_or(S3ErrorCode::InvalidArgument);
2330 return Err(S3Error::with_message(
2331 code,
2332 format!(
2333 "object key {put_key:?} is reserved (suffix `{}` is used for S4 internal \
2334 sidecars); pick a different key",
2335 s4_codec::index::SIDECAR_SUFFIX,
2336 ),
2337 ));
2338 }
2339 let access_preamble = self.access_log_preamble(&req);
2340 self.enforce_rate_limit(&req, &put_bucket)?;
2341 // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2342 // the IAM policy gate sees the request's tags via
2343 // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2344 // resolved from the Tagging manager (when wired) so
2345 // `s3:ExistingObjectTag/<key>` works on overwrite.
2346 let request_tags: Option<crate::tagging::TagSet> = req
2347 .input
2348 .tagging
2349 .as_deref()
2350 .map(crate::tagging::parse_tagging_header)
2351 .transpose()
2352 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2353 let existing_tags: Option<crate::tagging::TagSet> = self
2354 .tagging
2355 .as_ref()
2356 .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2357 self.enforce_policy_with_extra(
2358 &req,
2359 "s3:PutObject",
2360 &put_bucket,
2361 Some(&put_key),
2362 request_tags.as_ref(),
2363 existing_tags.as_ref(),
2364 )?;
2365 // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2366 // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2367 // bucket PUTs are exempt because they materialise a fresh
2368 // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2369 // locked version's bytes are untouched. The check mirrors the
2370 // delete path (Compliance never bypassable, Governance via the
2371 // bypass header, legal hold never).
2372 if let Some(mgr) = self.object_lock.as_ref()
2373 && let Some(state) = mgr.get(&put_bucket, &put_key)
2374 {
2375 let bucket_versioned_enabled = self
2376 .versioning
2377 .as_ref()
2378 .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2379 .unwrap_or(false);
2380 if !bucket_versioned_enabled {
2381 let bypass = parse_bypass_governance_header(&req.headers);
2382 let now = chrono::Utc::now();
2383 if !state.can_delete(now, bypass) {
2384 crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2385 return Err(S3Error::with_message(
2386 S3ErrorCode::AccessDenied,
2387 "Access Denied because object protected by object lock",
2388 ));
2389 }
2390 }
2391 }
2392 // v0.5 #30: per-PUT explicit retention / legal hold (S3
2393 // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2394 // `x-amz-object-lock-legal-hold`). Captured before the body
2395 // moves into the backend; persisted into the manager only on
2396 // backend success below.
2397 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2398 .input
2399 .object_lock_mode
2400 .as_ref()
2401 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2402 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2403 .input
2404 .object_lock_retain_until_date
2405 .as_ref()
2406 .and_then(timestamp_to_chrono_utc);
2407 let explicit_legal_hold_on: Option<bool> = req
2408 .input
2409 .object_lock_legal_hold_status
2410 .as_ref()
2411 .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2412 if let Some(blob) = req.input.body.take() {
2413 // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2414 // compress fast path、そうでなければ従来の collect-then-compress。
2415 let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2416 .await
2417 .map_err(internal("peek put sample"))?;
2418 let sample_len = sample.len().min(SAMPLE_BYTES);
2419 // v0.8 #56: pass the request's Content-Length (when present) so
2420 // the sampling dispatcher can promote large objects to a GPU
2421 // codec. Chunked transfers (no Content-Length) keep CPU.
2422 let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2423 let kind = self
2424 .dispatcher
2425 .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2426 .await;
2427
2428 // Passthrough buys nothing from S4F2 wrapping (no compression =
2429 // no per-chunk frame to skip past) and the +28-byte header
2430 // overhead breaks size-sensitive callers that expect a true
2431 // pass-through. So passthrough always uses the legacy raw-blob
2432 // path; only compressing codecs go through the framed path.
2433 //
2434 // v0.8.14 follow-up to #127 MED-B: the previous attempt
2435 // forced the buffered path whenever the client supplied
2436 // any whole-body checksum so `verify_client_body_checksums`
2437 // could run. Modern AWS SDKs auto-add an
2438 // `x-amz-checksum-crc32` trailer by default, which made
2439 // every SDK PUT lose the streaming-framed path and
2440 // therefore lose its sidecar — silent data path
2441 // regression caught by
2442 // `range_get_falls_back_to_full_when_sidecar_etag_stale`
2443 // and `upload_part_copy_propagates_source_version_id`
2444 // on the MinIO E2E job. The streaming PUT path now
2445 // passes through unchanged; client-supplied checksums on
2446 // streaming PUTs are NOT verified (same fail-open as
2447 // pre-v0.8.12). The buffered PUT branch and UploadPart
2448 // do verify, which covers the buffered upload case the
2449 // HIGH-12 audit was scoped to. True streaming verify
2450 // (tee-into-hasher on the chained input) remains the
2451 // tracked follow-up.
2452 let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2453 let (compressed, manifest, is_framed) = if use_framed {
2454 // streaming fast path: input は memory に collect しない
2455 let chained = chain_sample_with_rest(sample, rest_stream);
2456 debug!(
2457 bucket = ?req.input.bucket,
2458 key = ?req.input.key,
2459 codec = kind.as_str(),
2460 path = "streaming-framed",
2461 "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2462 );
2463 // v0.4 #16: pick the chunk size based on the request's
2464 // Content-Length when known, falling back to the 4 MiB
2465 // default for chunked transfers.
2466 let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2467 // v0.8.4 #73 M2: pass the request's Content-Length so
2468 // streaming_compress_to_frames can fail-fast on a mid-PUT
2469 // truncation (client disconnect after sending half the
2470 // body). `None` is the chunked-Transfer-Encoding case
2471 // where the upstream genuinely doesn't know the size and
2472 // the backend's framing layer is the only truncation
2473 // signal we have.
2474 let expected_input_size =
2475 req.input.content_length.and_then(|n| u64::try_from(n).ok());
2476 let (body, manifest) = streaming_compress_to_frames(
2477 chained,
2478 Arc::clone(&self.registry),
2479 kind,
2480 chunk_size,
2481 expected_input_size,
2482 )
2483 .await
2484 .map_err(|e| match e {
2485 s4_codec::CodecError::TruncatedStream { expected, got } => {
2486 // 400 IncompleteBody: client advertised N bytes
2487 // but disconnected after `got`. Mirrors AWS S3's
2488 // canonical error code for the same shape so SDK
2489 // retries kick in instead of treating the PUT as
2490 // a successful upload of a half-body.
2491 S3Error::with_message(
2492 S3ErrorCode::IncompleteBody,
2493 format!("PUT body truncated: expected {expected} bytes, got {got}"),
2494 )
2495 }
2496 // v0.8.15 M-4: 400
2497 // `RequestBodyLengthMismatch` for over-length
2498 // bodies. AWS S3 returns this when the declared
2499 // `Content-Length` is smaller than the wire body;
2500 // S4 used to silently accept the surplus bytes.
2501 // `IncompleteBody` is the closest typed variant
2502 // in the s3s enum — we widen the message so the
2503 // SDK / curl side sees the shape unambiguously.
2504 s4_codec::CodecError::OverlengthStream { expected, got } => {
2505 let code = S3ErrorCode::from_bytes(b"RequestBodyLengthMismatch")
2506 .unwrap_or(S3ErrorCode::IncompleteBody);
2507 S3Error::with_message(
2508 code,
2509 format!(
2510 "PUT body length mismatch: Content-Length declared {expected} \
2511 bytes, body carried at least {got}"
2512 ),
2513 )
2514 }
2515 other => internal("streaming framed compress")(other),
2516 })?;
2517 (body, manifest, true)
2518 } else {
2519 // GPU codec 等で streaming-aware でないものは bytes-buffered path
2520 // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
2521 let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
2522 .await
2523 .map_err(internal("collect put body (buffered path)"))?;
                // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
                // checksum algorithms against the received body on
                // the buffered path. Since the v0.8.14 follow-up the
                // streaming-framed branch above no longer redirects
                // here when a checksum header is present, so this check
                // covers buffered PUTs only; client-supplied checksums
                // on streaming PUTs are not verified.
2530 verify_client_body_checksums(
2531 &bytes,
2532 req.input.content_md5.as_deref(),
2533 req.input.checksum_crc32.as_deref(),
2534 req.input.checksum_crc32c.as_deref(),
2535 req.input.checksum_sha1.as_deref(),
2536 req.input.checksum_sha256.as_deref(),
2537 req.input.checksum_crc64nvme.as_deref(),
2538 )?;
2539 debug!(
2540 bucket = ?req.input.bucket,
2541 key = ?req.input.key,
2542 bytes = bytes.len(),
2543 codec = kind.as_str(),
2544 path = "buffered",
2545 "S4 put_object: compressing (buffered, raw blob)"
2546 );
2547 // v0.8 #55: telemetry-returning compress so we can stamp
2548 // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
2549 // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
2550 // CPU codecs come back with `gpu_seconds = None` and the
2551 // stamp helper short-circuits — no extra cost on CPU path.
2552 let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
2553 stamp_gpu_compress_telemetry(&tel);
2554 let (body, m) = compress_res.map_err(internal("registry compress"))?;
2555 (body, m, false)
2556 };
2557
2558 write_manifest(&mut req.input.metadata, &manifest);
2559 if is_framed {
2560 // v0.2 #4: framed body であることを GET 側に伝える meta flag。
2561 req.input
2562 .metadata
2563 .get_or_insert_with(Default::default)
2564 .insert(META_FRAMED.into(), "true".into());
2565 }
2566 // 重要: content_length を圧縮後サイズで更新する。
2567 // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
2568 // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
2569 req.input.content_length = Some(compressed.len() as i64);
2570 // body を書き換えたので、客側が送ってきた original body 用の
2571 // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
2572 // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
2573 // ChunkManifest.crc32c で担保している。
2574 req.input.checksum_algorithm = None;
2575 req.input.checksum_crc32 = None;
2576 req.input.checksum_crc32c = None;
2577 req.input.checksum_crc64nvme = None;
2578 req.input.checksum_sha1 = None;
2579 req.input.checksum_sha256 = None;
2580 req.input.content_md5 = None;
2581 let original_size = manifest.original_size;
2582 let compressed_size = manifest.compressed_size;
2583 let codec_label = manifest.codec.as_str();
2584 // (sidecar_index is built below, after the SSE-mode
2585 // extraction, so v0.8.12 HIGH-10 can short-circuit the
2586 // build when the on-disk bytes are about to be encrypted.)
2587 // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
2588 // Precedence:
2589 // - SSE-C headers present → per-request customer key (S4E3)
2590 // - server-managed keyring configured → active key (S4E2)
2591 // - neither → no encryption (raw compressed body)
2592 // The `s4-encrypted: aes-256-gcm` metadata flag is set in
2593 // both encrypted modes; the on-disk frame magic distinguishes
2594 // S4E1 / S4E2 / S4E3 so GET picks the right decrypt path.
2595 // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
2596 // so the encryption headers are NOT forwarded to the
2597 // backend. S4 owns the encrypt-then-store contract; if we
2598 // leave the headers in place, real S3-compat backends
2599 // (MinIO / AWS) try to apply their own SSE on top and
2600 // either reject (MinIO requires HTTPS for SSE-C) or fail
2601 // (MinIO has no KMS configured). MemoryBackend ignored
2602 // these so mock tests passed.
2603 let sse_c_alg = req.input.sse_customer_algorithm.take();
2604 let sse_c_key = req.input.sse_customer_key.take();
2605 let sse_c_md5 = req.input.sse_customer_key_md5.take();
2606 let sse_header = req.input.server_side_encryption.take();
2607 let sse_kms_key = req.input.ssekms_key_id.take();
2608 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
2609 // v0.5 #28: SSE-KMS request? Resolves to None unless the
2610 // request asks for `aws:kms` AND a key id is available
2611 // (explicit header or gateway default). When set, we'll
2612 // generate a per-object DEK below.
2613 let kms_key_id = extract_kms_key_id(
2614 &sse_header,
2615 &sse_kms_key,
2616 self.kms_default_key_id.as_deref(),
2617 );
2618 // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
2619 // pre-encrypt `compressed` body, but the bytes the
2620 // backend stores when any SSE mode is active are
2621 // *post-encrypt* (different length, different layout).
2622 // A Range GET on an SSE-encrypted object would slice the
2623 // ciphertext at the stale offsets, hand the wrong bytes
2624 // to the frame parser, and 500. Suppress the sidecar
2625 // entirely when SSE is going to be applied below;
2626 // encrypted-object Range GET falls back to the buffered
2627 // path (decrypt full body → frame parse → slice), trading
2628 // partial-fetch performance for correctness. An
2629 // encryption-aware sidecar format is a follow-up issue.
2630 let will_encrypt =
2631 sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
2632 let sidecar_index = if is_framed && !will_encrypt {
2633 s4_codec::index::build_index_from_body(&compressed).ok()
2634 } else {
2635 None
2636 };
2637 // v0.5 #32: in compliance-strict mode, every PUT must
2638 // declare SSE — either client-supplied (SSE-C), KMS, or by
2639 // virtue of a server-side keyring being configured (which
2640 // applies SSE-S4 to every PUT automatically). Requests that
2641 // would otherwise land as plain compressed bytes are
2642 // rejected with 400 InvalidRequest.
2643 if self.compliance_strict
2644 && sse_c_material.is_none()
2645 && kms_key_id.is_none()
2646 && self.sse_keyring.is_none()
2647 && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
2648 {
2649 return Err(S3Error::with_message(
2650 S3ErrorCode::InvalidRequest,
2651 "compliance-mode strict: PUT must include x-amz-server-side-encryption \
2652 (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
2653 ));
2654 }
            // SSE-C and SSE-KMS are mutually exclusive on a single PUT
            // (AWS S3 returns 400 InvalidArgument), so a request that
            // carries both is rejected rather than silently picking one.
2657 if sse_c_material.is_some() && kms_key_id.is_some() {
2658 return Err(S3Error::with_message(
2659 S3ErrorCode::InvalidArgument,
2660 "SSE-C and SSE-KMS cannot be used together on the same PUT",
2661 ));
2662 }
2663 // KMS path needs to call generate_dek().await before the
2664 // body_to_send branch; capture the result here.
2665 //
2666 // v0.8.1 #58: the plaintext DEK lives in three places
2667 // during one PUT:
2668 //
2669 // 1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
2670 // — wiped when the binding `dek` falls out of scope at
2671 // the end of this `if`-arm.
2672 // 2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
2673 // — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
2674 // the outer `kms_wrap` `Option` is dropped at the end
2675 // of `put_object`.
2676 // 3. AES-GCM internal key state inside the `aes-gcm`
2677 // crate during `encrypt_with_source` — out of scope
2678 // for this fix; tracked separately in v0.8.2.
2679 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
2680 if let Some(ref key_id) = kms_key_id {
2681 let kms = self.kms.as_ref().ok_or_else(|| {
2682 S3Error::with_message(
2683 S3ErrorCode::InvalidRequest,
2684 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
2685 )
2686 })?;
2687 // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
2688 // works unchanged via `Deref<Target=Vec<u8>>`.
2689 let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
2690 if dek.len() != 32 {
2691 return Err(S3Error::with_message(
2692 S3ErrorCode::InternalError,
2693 format!(
2694 "KMS backend returned a DEK of {} bytes (expected 32)",
2695 dek.len()
2696 ),
2697 ));
2698 }
2699 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
2700 zeroize::Zeroizing::new([0u8; 32]);
2701 dek_arr.copy_from_slice(&dek);
2702 // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
2703 // end of this scope, wiping the heap allocation.
2704 Some((dek_arr, wrapped))
2705 } else {
2706 None
2707 };
2708 // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
2709 // alongside `s4-encrypted` so HEAD (which doesn't fetch the
2710 // body) can echo the correct `x-amz-server-side-encryption`
2711 // value. Without this, HEAD on an SSE-KMS object would not
2712 // echo `aws:kms` because the frame magic is only available
2713 // on the body (which HEAD doesn't read).
2714 let body_to_send = if let Some(ref m) = sse_c_material {
2715 let meta = req.input.metadata.get_or_insert_with(Default::default);
2716 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2717 meta.insert("s4-sse-type".into(), "AES256".into());
2718 meta.insert(
2719 "s4-sse-c-key-md5".into(),
2720 base64::engine::general_purpose::STANDARD.encode(m.key_md5),
2721 );
2722 crate::sse::encrypt_with_source(
2723 &compressed,
2724 crate::sse::SseSource::CustomerKey {
2725 key: &m.key,
2726 key_md5: &m.key_md5,
2727 },
2728 )
2729 } else if let Some((ref dek, ref wrapped)) = kms_wrap {
2730 let meta = req.input.metadata.get_or_insert_with(Default::default);
2731 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2732 meta.insert("s4-sse-type".into(), "aws:kms".into());
2733 meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
2734 // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
2735 // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
2736 // `&T` here via `Deref<Target=T>`, so the binding picks
2737 // up the inner array reference without copying. The array
2738 // stays in the `Zeroizing` wrapper that owns it and gets
2739 // wiped when `kms_wrap` drops at the end of `put_object`.
2740 let dek_ref: &[u8; 32] = dek;
2741 crate::sse::encrypt_with_source(
2742 &compressed,
2743 crate::sse::SseSource::Kms {
2744 dek: dek_ref,
2745 wrapped,
2746 },
2747 )
2748 } else if let Some(keyring) = self.sse_keyring.as_ref() {
2749 // SSE-S4 is server-driven transparent encryption; the
2750 // client didn't ask for SSE. We stamp `s4-encrypted`
2751 // (internal flag the GET path needs) but deliberately
2752 // do NOT stamp `s4-sse-type` — that lights up the HEAD
2753 // echo of `x-amz-server-side-encryption: AES256`,
2754 // which would falsely advertise AWS-style SSE-S3
2755 // semantics the operator didn't request.
2756 let meta = req.input.metadata.get_or_insert_with(Default::default);
2757 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2758 // v0.8 #52: when `--sse-chunk-size > 0` is configured,
2759 // emit the chunked S4E5 frame so the matching GET can
2760 // stream-decrypt instead of buffering 5 GiB before
2761 // emitting a byte. Falls back to the buffered S4E2
2762 // frame at chunk_size=0 (default) so existing
2763 // deployments are bit-for-bit unchanged.
2764 if self.sse_chunk_size > 0 {
2765 crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
2766 .map_err(|e| {
2767 S3Error::with_message(
2768 S3ErrorCode::InternalError,
2769 format!("SSE-S4 chunked encrypt failed: {e}"),
2770 )
2771 })?
2772 } else {
2773 crate::sse::encrypt_v2(&compressed, keyring)
2774 }
2775 } else {
2776 compressed.clone()
2777 };
2778 // v0.6 #40: capture the about-to-be-sent body + metadata so
2779 // the replication dispatcher (run after the source PUT
2780 // succeeds) can hand the same backend bytes to the
2781 // destination bucket. `Bytes` clone is cheap (refcounted).
2782 let replication_body = body_to_send.clone();
2783 let replication_metadata = req.input.metadata.clone();
2784 // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
2785 // makes the body longer than the post-compression bytes
2786 // (header + nonce + tag overhead). The earlier
2787 // content_length stamp at compressed.len() is now stale, so
2788 // re-stamp from the actual bytes about to be sent or the
2789 // backend (real S3 / MinIO) rejects with
2790 // `StreamLengthMismatch`. MemoryBackend never validated
2791 // this, which is why mock-only tests passed.
2792 req.input.content_length = Some(body_to_send.len() as i64);
2793 req.input.body = Some(bytes_to_blob(body_to_send));
2794 // v0.5 #34: pre-allocate a version-id when the bucket is
2795 // Enabled, then redirect the backend storage key to the
2796 // shadow path so older versions survive newer PUTs.
2797 // Suspended / Unversioned buckets keep using the plain
2798 // `<key>` (S3 spec: Suspended overwrites the same backend
2799 // object). Pre-allocation (instead of recording after PUT)
2800 // ensures the shadow key + the response's
2801 // `x-amz-version-id` use the same vid.
2802 let pending_version: Option<crate::versioning::PutOutcome> = self
2803 .versioning
2804 .as_ref()
2805 .map(|mgr| mgr.state(&put_bucket))
2806 .map(|state| match state {
2807 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
2808 version_id: crate::versioning::VersioningManager::new_version_id(),
2809 versioned_response: true,
2810 },
2811 crate::versioning::VersioningState::Suspended
2812 | crate::versioning::VersioningState::Unversioned => {
2813 crate::versioning::PutOutcome {
2814 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
2815 versioned_response: false,
2816 }
2817 }
2818 });
2819 if let Some(ref pv) = pending_version
2820 && pv.versioned_response
2821 {
2822 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
2823 }
2824 // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
2825 // the move into `req.input` is consumed by the backend call.
2826 // The sidecar's `source_compressed_size` is checked against
2827 // the live HEAD `Content-Length` on Range GET to detect a
2828 // backend-side mutation.
2829 let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2830 let mut backend_resp = self.backend.put_object(req).await;
2831 if let Some(mut idx) = sidecar_index
2832 && let Ok(ref resp) = backend_resp
2833 && idx.entries.len() > 1
2834 {
2835 // 1 chunk しかない (small object) なら sidecar は意味がない (=
2836 // partial fetch しても full body と同じ範囲) ので省略。
2837 // Sidecar は user-visible key で書く (latest version の
2838 // partial fetch path 用)。Old versions の Range GET は今 task
2839 // の scope 外 (full read fallback でも意味的には正しい)。
2840 //
2841 // v0.8.4 #73 H-2: stamp the version-binding fields the
2842 // GET path needs to detect a stale / attacker-written
2843 // sidecar. ETag comes from the backend's PUT response —
2844 // when missing (some backends don't return an ETag) we
2845 // synthesize a CRC-derived stable identifier so the
2846 // sidecar still binds to *something*; the GET HEAD will
2847 // see the same backend ETag (None vs None) and treat the
2848 // pair as consistent.
2849 let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
2850 idx.source_etag = source_etag;
2851 idx.source_compressed_size = backend_object_size;
2852 self.write_sidecar(&put_bucket, &put_key, &idx).await;
2853 }
2854 // v0.5 #34: commit the new version into the manager only on
2855 // backend success. Use the pre-allocated vid so the response
2856 // header and the chain entry agree.
2857 if let (Some(mgr), Some(pv), Ok(resp)) = (
2858 self.versioning.as_ref(),
2859 pending_version.as_ref(),
2860 backend_resp.as_mut(),
2861 ) {
2862 let etag = resp
2863 .output
2864 .e_tag
2865 .clone()
2866 .map(ETag::into_value)
2867 .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
2868 let now = chrono::Utc::now();
2869 mgr.commit_put_with_version(
2870 &put_bucket,
2871 &put_key,
2872 crate::versioning::VersionEntry {
2873 version_id: pv.version_id.clone(),
2874 etag,
2875 size: original_size,
2876 is_delete_marker: false,
2877 created_at: now,
2878 },
2879 );
2880 if pv.versioned_response {
2881 resp.output.version_id = Some(pv.version_id.clone());
2882 }
2883 }
2884 // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
2885 // so the client knows the server actually applied the
2886 // requested algorithm and which key fingerprint matched.
2887 if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
2888 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
2889 resp.output.sse_customer_key_md5 =
2890 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
2891 }
2892 // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
2893 // the backend returned (AWS KMS returns the ARN even when
2894 // the request used an alias).
2895 if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
2896 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
2897 ServerSideEncryption::AWS_KMS,
2898 ));
2899 resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
2900 }
2901 // v0.5 #30: persist any per-PUT explicit retention / legal
2902 // hold the client supplied, then auto-apply the bucket
2903 // default (no-op when state is already populated). The
2904 // explicit fields take precedence — the bucket-default
2905 // helper bails out as soon as it sees any retention.
2906 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
2907 if explicit_lock_mode.is_some()
2908 || explicit_retain_until.is_some()
2909 || explicit_legal_hold_on.is_some()
2910 {
2911 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
2912 if let Some(m) = explicit_lock_mode {
2913 state.mode = Some(m);
2914 }
2915 if let Some(u) = explicit_retain_until {
2916 state.retain_until = Some(u);
2917 }
2918 if let Some(lh) = explicit_legal_hold_on {
2919 state.legal_hold_on = lh;
2920 }
2921 mgr.set(&put_bucket, &put_key, state);
2922 }
2923 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
2924 }
2925 let _ = (original_size, compressed_size); // mute unused warnings
2926 let elapsed = put_start.elapsed();
2927 crate::metrics::record_put(
2928 codec_label,
2929 original_size,
2930 compressed_size,
2931 elapsed.as_secs_f64(),
2932 backend_resp.is_ok(),
2933 );
2934 // v0.4 #20: structured access-log entry (best-effort).
2935 self.record_access(
2936 access_preamble,
2937 "REST.PUT.OBJECT",
2938 &put_bucket,
2939 Some(&put_key),
2940 if backend_resp.is_ok() { 200 } else { 500 },
2941 compressed_size,
2942 original_size,
2943 elapsed.as_millis() as u64,
2944 backend_resp.as_ref().err().map(|e| e.code().as_str()),
2945 )
2946 .await;
2947 info!(
2948 op = "put_object",
2949 bucket = %put_bucket,
2950 key = %put_key,
2951 codec = codec_label,
2952 bytes_in = original_size,
2953 bytes_out = compressed_size,
2954 ratio = format!(
2955 "{:.3}",
2956 if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
2957 ),
2958 latency_ms = elapsed.as_millis() as u64,
2959 ok = backend_resp.is_ok(),
2960 "S4 put completed"
2961 );
2962 // v0.6 #35: fire bucket-notification destinations (best-effort,
2963 // detached). Skipped when no manager is attached or when the
2964 // bucket has no rule matching `s3:ObjectCreated:Put` for this
2965 // key.
2966 if backend_resp.is_ok()
2967 && let Some(mgr) = self.notifications.as_ref()
2968 {
2969 let dests = mgr.match_destinations(
2970 &put_bucket,
2971 &crate::notifications::EventType::ObjectCreatedPut,
2972 &put_key,
2973 );
2974 if !dests.is_empty() {
2975 let etag = backend_resp
2976 .as_ref()
2977 .ok()
2978 .and_then(|r| r.output.e_tag.clone())
2979 .map(ETag::into_value);
2980 let version_id = pending_version
2981 .as_ref()
2982 .filter(|pv| pv.versioned_response)
2983 .map(|pv| pv.version_id.clone());
2984 tokio::spawn(crate::notifications::dispatch_event(
2985 Arc::clone(mgr),
2986 put_bucket.clone(),
2987 put_key.clone(),
2988 crate::notifications::EventType::ObjectCreatedPut,
2989 Some(original_size),
2990 etag,
2991 version_id,
2992 format!("S4-{}", uuid::Uuid::new_v4()),
2993 ));
2994 }
2995 }
2996 // v0.6 #39: persist parsed `x-amz-tagging` tags into the
2997 // tagging manager on a successful PUT. AWS PutObject's
2998 // tagging is a full-replace operation (not a merge), so
2999 // any pre-existing entry for `(bucket, key)` is overwritten.
3000 if backend_resp.is_ok()
3001 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3002 {
3003 mgr.put_object_tags(&put_bucket, &put_key, tags);
3004 }
3005 // v0.6 #40: cross-bucket replication fire-point. On
3006 // successful source PUT, consult the replication manager;
3007 // when an enabled rule matches, mark the source key
3008 // `Pending` and spawn a detached task that PUTs the same
3009 // backend bytes + metadata to the rule's destination
3010 // bucket. The dispatcher itself records `Completed` /
3011 // `Failed` and bumps the drop counter on retry-budget
3012 // exhaustion.
3013 self.spawn_replication_if_matched(
3014 &put_bucket,
3015 &put_key,
3016 &request_tags,
3017 &replication_body,
3018 &replication_metadata,
3019 backend_resp.is_ok(),
3020 pending_version.as_ref(),
3021 );
3022 return backend_resp;
3023 }
3024 // Body-less PUT (rare: zero-length object). Mirror the body-full
3025 // versioning hooks so list_object_versions / GET-by-version still see
3026 // empty-body objects in the chain.
3027 let pending_version: Option<crate::versioning::PutOutcome> = self
3028 .versioning
3029 .as_ref()
3030 .map(|mgr| mgr.state(&put_bucket))
3031 .map(|state| match state {
3032 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3033 version_id: crate::versioning::VersioningManager::new_version_id(),
3034 versioned_response: true,
3035 },
3036 _ => crate::versioning::PutOutcome {
3037 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3038 versioned_response: false,
3039 },
3040 });
3041 if let Some(ref pv) = pending_version
3042 && pv.versioned_response
3043 {
3044 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3045 }
3046 let mut backend_resp = self.backend.put_object(req).await;
3047 if let (Some(mgr), Some(pv), Ok(resp)) = (
3048 self.versioning.as_ref(),
3049 pending_version.as_ref(),
3050 backend_resp.as_mut(),
3051 ) {
3052 let etag = resp
3053 .output
3054 .e_tag
3055 .clone()
3056 .map(ETag::into_value)
3057 .unwrap_or_default();
3058 let now = chrono::Utc::now();
3059 mgr.commit_put_with_version(
3060 &put_bucket,
3061 &put_key,
3062 crate::versioning::VersionEntry {
3063 version_id: pv.version_id.clone(),
3064 etag,
3065 size: 0,
3066 is_delete_marker: false,
3067 created_at: now,
3068 },
3069 );
3070 if pv.versioned_response {
3071 resp.output.version_id = Some(pv.version_id.clone());
3072 }
3073 }
3074 // v0.5 #30: same explicit-then-default lock-state commit as the
3075 // body-bearing branch above, so a zero-length PUT also picks up
3076 // bucket-default retention.
3077 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3078 if explicit_lock_mode.is_some()
3079 || explicit_retain_until.is_some()
3080 || explicit_legal_hold_on.is_some()
3081 {
3082 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3083 if let Some(m) = explicit_lock_mode {
3084 state.mode = Some(m);
3085 }
3086 if let Some(u) = explicit_retain_until {
3087 state.retain_until = Some(u);
3088 }
3089 if let Some(lh) = explicit_legal_hold_on {
3090 state.legal_hold_on = lh;
3091 }
3092 mgr.set(&put_bucket, &put_key, state);
3093 }
3094 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3095 }
3096 // v0.6 #35: same notification fire-point as the body-bearing PUT
3097 // branch above (zero-length objects still match `ObjectCreated:Put`
3098 // rules per the AWS event taxonomy).
3099 if backend_resp.is_ok()
3100 && let Some(mgr) = self.notifications.as_ref()
3101 {
3102 let dests = mgr.match_destinations(
3103 &put_bucket,
3104 &crate::notifications::EventType::ObjectCreatedPut,
3105 &put_key,
3106 );
3107 if !dests.is_empty() {
3108 let etag = backend_resp
3109 .as_ref()
3110 .ok()
3111 .and_then(|r| r.output.e_tag.clone())
3112 .map(ETag::into_value);
3113 let version_id = pending_version
3114 .as_ref()
3115 .filter(|pv| pv.versioned_response)
3116 .map(|pv| pv.version_id.clone());
3117 tokio::spawn(crate::notifications::dispatch_event(
3118 Arc::clone(mgr),
3119 put_bucket.clone(),
3120 put_key.clone(),
3121 crate::notifications::EventType::ObjectCreatedPut,
3122 Some(0),
3123 etag,
3124 version_id,
3125 format!("S4-{}", uuid::Uuid::new_v4()),
3126 ));
3127 }
3128 }
3129 // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3130 // (zero-length) PUT branch too — same shape as the body-bearing
3131 // branch above.
3132 if backend_resp.is_ok()
3133 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3134 {
3135 mgr.put_object_tags(&put_bucket, &put_key, tags);
3136 }
3137 // v0.6 #40: cross-bucket replication for the zero-length PUT
3138 // branch — same shape as the body-bearing branch above.
3139 // v0.8.2 #61: pass `pending_version` so a versioned source's
3140 // destination receives the same shadow-key path.
3141 self.spawn_replication_if_matched(
3142 &put_bucket,
3143 &put_key,
3144 &request_tags,
3145 &bytes::Bytes::new(),
3146 &None,
3147 backend_resp.is_ok(),
3148 pending_version.as_ref(),
3149 );
3150 backend_resp
3151 }
3152
3153 // === 圧縮を解く path (GET) ===
3154 #[tracing::instrument(
3155 name = "s4.get_object",
3156 skip(self, req),
3157 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3158 )]
3159 async fn get_object(
3160 &self,
3161 mut req: S3Request<GetObjectInput>,
3162 ) -> S3Result<S3Response<GetObjectOutput>> {
3163 let get_start = Instant::now();
3164 let get_bucket = req.input.bucket.clone();
3165 let get_key = req.input.key.clone();
3166 // v0.8.16 F-13: reserved-name guard now also fires on GET.
3167 // The v0.8.15 #137 fix only blocked PUT / Copy / Create —
3168 // a curious or hostile client could still
3169 // `GetObject(<key>.s4index)` and read the raw sidecar
3170 // (frame layout, source ETag, source compressed size).
3171 // The list filter already hides the entry from listings;
3172 // explicit reject closes the directed-read leak.
3173 if s4_codec::index::is_reserved_sidecar_key(&get_key) {
3174 return Err(S3Error::with_message(
3175 S3ErrorCode::NoSuchKey,
3176 format!("object key {get_key:?} is reserved for S4 internal sidecars"),
3177 ));
3178 }
3179 self.enforce_rate_limit(&req, &get_bucket)?;
3180 self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3181 // Range request の事前検出 (decompress 後 slice する path に使う)。
3182 let range_request = req.input.range.take();
3183 // v0.5 #27: pull SSE-C material from the input headers before
3184 // the request is moved into the backend. A header parse error
3185 // fails fast (no body fetch). The material is consumed below
3186 // when decrypting an S4E3-framed body; the SSE-C headers on
3187 // `req.input` are cleared so the backend doesn't see them.
3188 let sse_c_alg = req.input.sse_customer_algorithm.take();
3189 let sse_c_key = req.input.sse_customer_key.take();
3190 let sse_c_md5 = req.input.sse_customer_key_md5.take();
3191 let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3192
3193 // v0.5 #34: route the GET through the VersioningManager when
3194 // attached AND the bucket is in a versioning-aware state.
3195 // Resolves which version to fetch (explicit `?versionId=` query
3196 // param vs. chain latest), translates a delete-marker into 404
3197 // NoSuchKey, and rewrites the backend storage key to the shadow
3198 // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3199 // versions. `resolved_version_id` is stamped onto the response
3200 // so clients see a coherent `x-amz-version-id` header.
3201 //
3202 // When the bucket is Unversioned (or no manager attached), the
3203 // chain-resolution step is skipped and the request flows
3204 // through the existing single-key path unchanged.
3205 let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3206 Some(mgr)
3207 if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3208 {
3209 let req_vid = req.input.version_id.take();
3210 let entry = match req_vid.as_deref() {
3211 Some(vid) => {
3212 mgr.lookup_version(&get_bucket, &get_key, vid)
3213 .ok_or_else(|| {
3214 S3Error::with_message(
3215 S3ErrorCode::NoSuchVersion,
3216 format!("no such version: {vid}"),
3217 )
3218 })?
3219 }
3220 None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3221 S3Error::with_message(
3222 S3ErrorCode::NoSuchKey,
3223 format!("no such key: {get_key}"),
3224 )
3225 })?,
3226 };
3227 if entry.is_delete_marker {
3228 // S3 spec: GET without versionId on a
3229 // delete-marker latest → 404 NoSuchKey + the
3230 // response carries `x-amz-delete-marker: true`.
3231 // GET with explicit versionId pointing at a delete
3232 // marker → 405 MethodNotAllowed; we surface
3233 // NoSuchKey here for both since s3s collapses them
3234 // into the same not-found error path.
3235 return Err(S3Error::with_message(
3236 S3ErrorCode::NoSuchKey,
3237 format!("delete marker is the current version of {get_key}"),
3238 ));
3239 }
3240 if entry.version_id != crate::versioning::NULL_VERSION_ID {
3241 req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3242 }
3243 Some(entry.version_id)
3244 }
3245 _ => None,
3246 };
3247
3248 // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3249 // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3250 // 必要 frame だけを backend に Range GET し帯域節約する。
3251 //
3252 // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3253 // verify the source object hasn't been overwritten / mutated since
3254 // the sidecar was stamped. The sidecar carries the backend ETag
3255 // captured at PUT time (`source_etag`); a HEAD against the current
3256 // backend object tells us the live ETag. If they disagree we treat
3257 // the sidecar as stale and fall through to the full-GET path —
3258 // returning the wrong frames for a Range request would surface as
3259 // a CRC mismatch deeper in the stack but would also potentially
3260 // disclose unrelated frames if a hostile operator wrote the
3261 // sidecar themselves. Fail-open to "full read" is the safe default.
3262 //
3263 // Legacy v1 sidecars (no `source_etag` populated) keep the old
3264 // best-effort behaviour so existing on-disk indexes don't suddenly
3265 // start missing the partial-fetch path.
3266 if let Some(ref r) = range_request
3267 && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3268 && self
3269 .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3270 .await
3271 {
3272 let total = index.total_original_size();
3273 let (start, end_exclusive) = match resolve_range(r, total) {
3274 Ok(v) => v,
3275 Err(e) => {
3276 return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3277 }
3278 };
3279 if let Some(plan) = index.lookup_range(start, end_exclusive) {
3280 return self
3281 .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3282 .await;
3283 }
3284 }
3285 let mut resp = self.backend.get_object(req).await?;
3286 // v0.5 #34: stamp the resolved version-id so the client sees a
3287 // coherent `x-amz-version-id` header (only for chains owned by
3288 // the manager — Unversioned buckets / no-manager paths never
3289 // set this).
3290 if let Some(ref vid) = resolved_version_id {
3291 resp.output.version_id = Some(vid.clone());
3292 }
3293 let is_multipart = is_multipart_object(&resp.output.metadata);
3294 let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3295 // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3296 // multipart と同じ path に流す。
3297 let needs_frame_parse = is_multipart || is_framed_v2;
3298 let manifest_opt = extract_manifest(&resp.output.metadata);
3299
3300 if !needs_frame_parse && manifest_opt.is_none() {
3301 // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3302 debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3303 return Ok(resp);
3304 }
3305
3306 if let Some(blob) = resp.output.body.take() {
3307 // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3308 // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3309 // before any frame parse / streaming decompress. Encrypted
3310 // bodies are opaque to the codec; this also forces the
3311 // buffered path because AES-GCM needs the full body for tag
3312 // verify. SSE-C uses the per-request customer key, SSE-S4
3313 // falls back to the configured keyring.
3314 let blob = if is_sse_encrypted(&resp.output.metadata) {
3315 let body = collect_blob(blob, self.max_body_bytes)
3316 .await
3317 .map_err(internal("collect SSE-encrypted body"))?;
3318 // v0.5 #28: peek the frame magic to route the right
3319 // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3320 // through the KMS backend (async). S4E1/E2/E3 take
3321 // the sync path (keyring or customer key).
3322 //
3323 // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3324 // SSE-S4 frames take the *streaming* path — we hand
3325 // the response body a per-chunk verify-and-emit
3326 // Stream so the client sees chunk 0 plaintext after
3327 // one chunk-worth of AES-GCM verify (vs. waiting
3328 // for the whole body's tag), and the gateway no
3329 // longer needs to materialize the full plaintext
3330 // in memory before responding. SSE-C is out of
3331 // scope for the chunked path (chunked S4E3 is a
3332 // follow-up), so this branch requires the SSE-S4
3333 // keyring to be wired and `get_sse_c_material` to
3334 // be absent — otherwise we surface a clear
3335 // misconfiguration error instead of silently
3336 // falling through to the buffered chunked path.
3337 // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3338 // only correct when the decrypted body IS the user's
3339 // plaintext as-stored. If the object went through the
3340 // codec (compressed) or carries S4F2 frames, returning
3341 // the decrypt stream directly hands the client
3342 // compressed / framed bytes. Restrict the early-return
3343 // to codec=Passthrough + non-framed objects; everything
3344 // else falls through to the buffered path, which
3345 // decrypt-buffers S4E5/S4E6 via
3346 // `decrypt_chunked_buffered_default` and then runs the
3347 // existing decompress pipeline.
3348 let chunked_streaming_safe = !needs_frame_parse
3349 && manifest_opt
3350 .as_ref()
3351 .map(|m| m.codec == CodecKind::Passthrough)
3352 .unwrap_or(false);
3353 if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3354 && get_sse_c_material.is_none()
3355 && chunked_streaming_safe
3356 {
3357 let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3358 S3Error::with_message(
3359 S3ErrorCode::InvalidRequest,
3360 "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
3361 )
3362 })?;
3363 let body_len = body.len() as u64;
3364 let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
3365 // Stream is `'static` (the keyring borrow is
3366 // consumed up front; the cipher lives inside
3367 // the stream state — see decrypt_chunked_stream
3368 // doc), so we can move it straight into a
3369 // StreamingBlob without lifetime gymnastics.
3370 use futures::StreamExt;
3371 let mapped = stream.map(|r| {
3372 r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
3373 });
3374 use s3s::dto::StreamingBlob;
3375 resp.output.body = Some(StreamingBlob::wrap(mapped));
3376 // Plaintext content_length is unknown until all
3377 // chunks have been verified; null it out so the
3378 // ByteStream wrapper reports `unknown` to the
3379 // HTTP layer (which then emits chunked transfer-
3380 // encoding) rather than lying about the size.
3381 resp.output.content_length = None;
3382 // The backend's checksums + ETag describe the
3383 // encrypted body (S4E5/S4E6 wire format), not
3384 // the plaintext we're about to stream — clear them
3385 // so the AWS SDK doesn't fail the GET with a
3386 // ChecksumMismatch on a successful round-trip.
3387 // Mirrors the streaming-zstd path at L1180-1185.
3388 resp.output.checksum_crc32 = None;
3389 resp.output.checksum_crc32c = None;
3390 resp.output.checksum_crc64nvme = None;
3391 resp.output.checksum_sha1 = None;
3392 resp.output.checksum_sha256 = None;
3393 resp.output.e_tag = None;
3394 let elapsed = get_start.elapsed();
3395 crate::metrics::record_get(
3396 "sse-s4-chunked",
3397 body_len,
3398 body_len,
3399 elapsed.as_secs_f64(),
3400 true,
3401 );
3402 return Ok(resp);
3403 }
3404 let plain = match crate::sse::peek_magic(&body) {
3405 Some("S4E4") => {
3406 let kms = self.kms.as_ref().ok_or_else(|| {
3407 S3Error::with_message(
3408 S3ErrorCode::InvalidRequest,
3409 "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3410 )
3411 })?;
3412 let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
3413 crate::sse::decrypt_with_kms(&body, kms_ref)
3414 .await
3415 .map_err(|e| match e {
3416 crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
3417 other => S3Error::with_message(
3418 S3ErrorCode::InternalError,
3419 format!("SSE-KMS decrypt failed: {other}"),
3420 ),
3421 })?
3422 }
3423 _ => {
3424 if let Some(ref m) = get_sse_c_material {
3425 crate::sse::decrypt(
3426 &body,
3427 crate::sse::SseSource::CustomerKey {
3428 key: &m.key,
3429 key_md5: &m.key_md5,
3430 },
3431 )
3432 .map_err(sse_c_error_to_s3)?
3433 } else {
3434 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
3435 S3Error::with_message(
3436 S3ErrorCode::InvalidRequest,
3437 "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
3438 )
3439 })?;
3440 crate::sse::decrypt(&body, keyring).map_err(|e| {
3441 S3Error::with_message(
3442 S3ErrorCode::InternalError,
3443 format!("SSE-S4 decrypt failed: {e}"),
3444 )
3445 })?
3446 }
3447 }
3448 };
3449 // v0.5 #28: parse out the on-disk wrapped DEK's key id
3450 // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
3451 if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
3452 && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
3453 {
3454 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3455 ServerSideEncryption::AWS_KMS,
3456 ));
3457 resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
3458 }
3459 bytes_to_blob(plain)
3460 } else if let Some(ref m) = get_sse_c_material {
3461 // Client sent SSE-C headers for an unencrypted object —
3462 // mirror AWS S3's 400 InvalidRequest.
3463 let _ = m;
3464 return Err(sse_c_error_to_s3(
3465 crate::sse::SseError::CustomerKeyUnexpected,
3466 ));
3467 } else {
3468 blob
3469 };
3470 // v0.5 #27: SSE-C echo on success — algorithm + key MD5
3471 // tell the client that the supplied key was the one used.
3472 if let Some(ref m) = get_sse_c_material {
3473 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3474 resp.output.sse_customer_key_md5 =
3475 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3476 }
3477 // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
3478 // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
3479 // codec が streaming-aware なら body を chunk-by-chunk で decompress して
3480 // 即座に client に流す。
3481 //
3482 // ただし Range request 時は streaming できない (slice するため total bytes
3483 // が必要) → buffered path に fall through。
3484 if range_request.is_none()
3485 && !needs_frame_parse
3486 && let Some(ref m) = manifest_opt
3487 && supports_streaming_decompress(m.codec)
3488 && m.codec == CodecKind::CpuZstd
3489 {
3490 // v0.8.4 #73 H-1: wrap the decompressor output in a
3491 // rolling-CRC32C verifier so a tampered ciphertext (or a
3492 // backend-side corruption that the zstd decoder happens
3493 // to "successfully" decode into wrong bytes) surfaces as
3494 // a streaming error tail at EOF instead of silently
3495 // delivering corrupt plaintext to the client. The wrap
3496 // is a pure pass-through during the body — no extra
3497 // buffering, TTFB unaffected — and the integrity
3498 // decision lands at the last chunk.
3499 let decompressed_blob = cpu_zstd_decompress_stream(blob);
3500 let verified_reader = Crc32cVerifyingReader::new(
3501 blob_to_async_read(decompressed_blob),
3502 m.crc32c,
3503 m.original_size,
3504 );
3505 let verified_blob = async_read_to_blob(verified_reader);
3506 resp.output.content_length = Some(m.original_size as i64);
3507 resp.output.checksum_crc32 = None;
3508 resp.output.checksum_crc32c = None;
3509 resp.output.checksum_crc64nvme = None;
3510 resp.output.checksum_sha1 = None;
3511 resp.output.checksum_sha256 = None;
3512 resp.output.e_tag = None;
3513 resp.output.body = Some(verified_blob);
3514 let elapsed = get_start.elapsed();
3515 crate::metrics::record_get(
3516 m.codec.as_str(),
3517 m.compressed_size,
3518 m.original_size,
3519 elapsed.as_secs_f64(),
3520 true,
3521 );
3522 info!(
3523 op = "get_object",
3524 bucket = %get_bucket,
3525 key = %get_key,
3526 codec = m.codec.as_str(),
3527 bytes_in = m.compressed_size,
3528 bytes_out = m.original_size,
3529 path = "streaming",
3530 setup_latency_ms = elapsed.as_millis() as u64,
3531 "S4 get started (streaming)"
3532 );
3533 return Ok(resp);
3534 }
3535 // Passthrough: そのまま流す (Range なしの場合のみ streaming)
3536 if range_request.is_none()
3537 && !needs_frame_parse
3538 && let Some(ref m) = manifest_opt
3539 && m.codec == CodecKind::Passthrough
3540 {
3541 resp.output.content_length = Some(m.original_size as i64);
3542 resp.output.checksum_crc32 = None;
3543 resp.output.checksum_crc32c = None;
3544 resp.output.checksum_crc64nvme = None;
3545 resp.output.checksum_sha1 = None;
3546 resp.output.checksum_sha256 = None;
3547 resp.output.e_tag = None;
3548 resp.output.body = Some(blob);
3549 debug!("S4 get_object: passthrough streaming");
3550 return Ok(resp);
3551 }
3552
3553 // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
3554 let bytes = collect_blob(blob, self.max_body_bytes)
3555 .await
3556 .map_err(internal("collect get body"))?;
3557
3558 let decompressed = if needs_frame_parse {
3559 // multipart objects と framed-v2 single-PUT objects は同じ
3560 // S4F2 frame 列なので decompress_multipart で統一処理
3561 self.decompress_multipart(bytes).await?
3562 } else {
3563 let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
3564 self.registry
3565 .decompress(bytes, manifest)
3566 .await
3567 .map_err(internal("registry decompress"))?
3568 };
3569
3570 // Range request があれば slice。なければ full body を返す。
3571 let total_size = decompressed.len() as u64;
3572 let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
3573 let (start, end) = resolve_range(r, total_size)
3574 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
3575 let sliced = decompressed.slice(start as usize..end as usize);
3576 resp.output.content_range = Some(format!(
3577 "bytes {start}-{}/{total_size}",
3578 end.saturating_sub(1)
3579 ));
3580 (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
3581 } else {
3582 (decompressed, None)
3583 };
3584 // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
3585 // 圧縮 size のままだと downstream が body を途中で切ってしまう)
3586 resp.output.content_length = Some(final_bytes.len() as i64);
3587 // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
3588 // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
3589 // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
3590 // (manifest 内 / frame 内) で integrity を保証する設計にする。
3591 resp.output.checksum_crc32 = None;
3592 resp.output.checksum_crc32c = None;
3593 resp.output.checksum_crc64nvme = None;
3594 resp.output.checksum_sha1 = None;
3595 resp.output.checksum_sha256 = None;
3596 resp.output.e_tag = None;
3597 let returned_size = final_bytes.len() as u64;
3598 let codec_label = manifest_opt
3599 .as_ref()
3600 .map(|m| m.codec.as_str())
3601 .unwrap_or("multipart");
3602 resp.output.body = Some(bytes_to_blob(final_bytes));
3603 if let Some(status) = status_override {
3604 resp.status = Some(status);
3605 }
3606 let elapsed = get_start.elapsed();
3607 crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
3608 info!(
3609 op = "get_object",
3610 bucket = %get_bucket,
3611 key = %get_key,
3612 codec = codec_label,
3613 bytes_out = returned_size,
3614 total_object_size = total_size,
3615 range = range_request.is_some(),
3616 path = "buffered",
3617 latency_ms = elapsed.as_millis() as u64,
3618 "S4 get completed (buffered)"
3619 );
3620 }
3621 // v0.6 #40: echo the recorded `x-amz-replication-status` so
3622 // consumers can poll progress (PENDING / COMPLETED / FAILED).
3623 if let Some(mgr) = self.replication.as_ref()
3624 && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
3625 {
3626 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3627 status.as_aws_str().to_owned(),
3628 ));
3629 }
3630 Ok(resp)
3631 }
3632
3633 // === passthrough delegations ===
3634 async fn head_bucket(
3635 &self,
3636 req: S3Request<HeadBucketInput>,
3637 ) -> S3Result<S3Response<HeadBucketOutput>> {
3638 self.backend.head_bucket(req).await
3639 }
3640 async fn list_buckets(
3641 &self,
3642 req: S3Request<ListBucketsInput>,
3643 ) -> S3Result<S3Response<ListBucketsOutput>> {
3644 self.backend.list_buckets(req).await
3645 }
3646 async fn create_bucket(
3647 &self,
3648 req: S3Request<CreateBucketInput>,
3649 ) -> S3Result<S3Response<CreateBucketOutput>> {
3650 self.backend.create_bucket(req).await
3651 }
3652 async fn delete_bucket(
3653 &self,
3654 req: S3Request<DeleteBucketInput>,
3655 ) -> S3Result<S3Response<DeleteBucketOutput>> {
3656 self.backend.delete_bucket(req).await
3657 }
3658 async fn head_object(
3659 &self,
3660 req: S3Request<HeadObjectInput>,
3661 ) -> S3Result<S3Response<HeadObjectOutput>> {
3662 // v0.6 #40: capture bucket/key before req is consumed so the
3663 // replication-status echo can look the entry up.
3664 let head_bucket = req.input.bucket.clone();
3665 let head_key = req.input.key.clone();
3666 // v0.8.16 F-13: same reserved-name guard as `get_object`.
3667 if s4_codec::index::is_reserved_sidecar_key(&head_key) {
3668 return Err(S3Error::with_message(
3669 S3ErrorCode::NoSuchKey,
3670 format!("object key {head_key:?} is reserved for S4 internal sidecars"),
3671 ));
3672 }
3673 let mut resp = self.backend.head_object(req).await?;
3674 if let Some(manifest) = extract_manifest(&resp.output.metadata) {
3675 // 客側には decompress 後の意味のある content_length / checksum を返す。
3676 // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
3677 // (S4 は manifest 内の crc32c で integrity を担保する)。
3678 resp.output.content_length = Some(manifest.original_size as i64);
3679 resp.output.checksum_crc32 = None;
3680 resp.output.checksum_crc32c = None;
3681 resp.output.checksum_crc64nvme = None;
3682 resp.output.checksum_sha1 = None;
3683 resp.output.checksum_sha256 = None;
3684 resp.output.e_tag = None;
3685 }
3686 // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
3687 // / FAILED) so consumers can poll progress without a GET.
3688 if let Some(mgr) = self.replication.as_ref()
3689 && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
3690 {
3691 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3692 status.as_aws_str().to_owned(),
3693 ));
3694 }
3695 // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
3696 // and pipelines see the same posture they got on PUT. The PUT
3697 // path stamps `s4-sse-type` metadata for exactly this — HEAD
3698 // doesn't fetch the body, so it can't peek frame magic.
3699 if let Some(meta) = resp.output.metadata.as_ref()
3700 && let Some(sse_type) = meta.get("s4-sse-type")
3701 {
3702 {
3703 match sse_type.as_str() {
3704 "aws:kms" => {
3705 resp.output.server_side_encryption = Some(
3706 ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
3707 );
3708 if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
3709 resp.output.ssekms_key_id = Some(key_id.clone());
3710 }
3711 }
3712 _ => {
3713 resp.output.server_side_encryption = Some(
3714 ServerSideEncryption::from_static(ServerSideEncryption::AES256),
3715 );
3716 if let Some(md5) = meta.get("s4-sse-c-key-md5") {
3717 resp.output.sse_customer_algorithm =
3718 Some(crate::sse::SSE_C_ALGORITHM.into());
3719 resp.output.sse_customer_key_md5 = Some(md5.clone());
3720 }
3721 }
3722 }
3723 }
3724 }
3725 Ok(resp)
3726 }
3727 async fn delete_object(
3728 &self,
3729 mut req: S3Request<DeleteObjectInput>,
3730 ) -> S3Result<S3Response<DeleteObjectOutput>> {
3731 let bucket = req.input.bucket.clone();
3732 let key = req.input.key.clone();
3733 // v0.8.16 F-13: reserved-name guard on DELETE. Without it a
3734 // hostile client could `DeleteObject(<key>.s4index)` to
3735 // orphan the sidecar, silently disabling Range-GET
3736 // partial-fetch for the corresponding `<key>`. The S4
3737 // internal cleanup path (`write_sidecar` and friends)
3738 // talks to `self.backend.delete_object(...)` directly, NOT
3739 // through this trait method, so the guard doesn't break
3740 // legitimate sidecar cleanup.
3741 if s4_codec::index::is_reserved_sidecar_key(&key) {
3742 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
3743 .unwrap_or(S3ErrorCode::InvalidArgument);
3744 return Err(S3Error::with_message(
3745 code,
3746 format!(
3747 "object key {key:?} is reserved (suffix `{}` is used for S4 internal sidecars)",
3748 s4_codec::index::SIDECAR_SUFFIX,
3749 ),
3750 ));
3751 }
3752 self.enforce_rate_limit(&req, &bucket)?;
3753 self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
3754 // v0.6 #42: MFA Delete enforcement. When the bucket has
3755 // MFA-Delete = Enabled, every DELETE / DELETE-version /
3756 // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
3757 // 6-digit TOTP). Runs *before* the WORM / versioning routers so
3758 // a missing token is denied for free regardless of which delete
3759 // path the request would otherwise take.
3760 if let Some(mgr) = self.mfa_delete.as_ref()
3761 && mgr.is_enabled(&bucket)
3762 {
3763 let header = req.input.mfa.as_deref();
3764 if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
3765 crate::metrics::record_mfa_delete_denial(&bucket);
3766 return Err(mfa_error_to_s3(e));
3767 }
3768 }
3769 // v0.5 #30: refuse the delete while a WORM lock is in effect.
3770 // Compliance can never be bypassed; Governance can be overridden
3771 // via `x-amz-bypass-governance-retention: true`; legal hold
3772 // never. The check happens before the versioning router so a
3773 // locked object can't be soft-deleted (delete-marker push) on an
3774 // Enabled bucket either — S3 spec says lock applies to all
3775 // delete forms.
3776 if let Some(mgr) = self.object_lock.as_ref()
3777 && let Some(state) = mgr.get(&bucket, &key)
3778 {
3779 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
3780 // v0.8.12 HIGH-7 fix: the bypass header alone used to be
3781 // enough to override Governance retention. AWS spec
3782 // requires the caller hold `s3:BypassGovernanceRetention`
3783 // for the target ARN; without that, the header is
3784 // silently ignored (not an error — it lines up with how
3785 // AWS' canonical behaviour treats unprivileged callers).
3786 let bypass_allowed = if bypass_header {
3787 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
3788 .is_ok()
3789 } else {
3790 false
3791 };
3792 let now = chrono::Utc::now();
3793 if !state.can_delete(now, bypass_allowed) {
3794 crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
3795 return Err(S3Error::with_message(
3796 S3ErrorCode::AccessDenied,
3797 "Access Denied because object protected by object lock",
3798 ));
3799 }
3800 }
3801 // v0.5 #34: route DELETE through the VersioningManager when the
3802 // bucket is in a versioning-aware state.
3803 //
3804 // - Enabled bucket, no version_id → push a delete marker into
3805 // the chain. NO backend object is touched (older versions
3806 // stay reachable via specific-version GET).
3807 // - Enabled / Suspended bucket, with version_id → physical
3808 // delete. Backend bytes at the shadow key (or `<key>` for
3809 // `null`) are removed; chain entry is dropped. If the deleted
3810 // entry was a delete marker, no backend bytes exist for it
3811 // (record-only).
3812 // - Suspended bucket, no version_id → push a "null" delete
3813 // marker (S3 spec); backend bytes at `<key>` are physically
3814 // removed (same as legacy).
3815 // - Unversioned bucket → fall through to legacy passthrough.
3816 if let Some(mgr) = self.versioning.as_ref() {
3817 let state = mgr.state(&bucket);
3818 if state != crate::versioning::VersioningState::Unversioned {
3819 let req_vid = req.input.version_id.take();
3820 if let Some(vid) = req_vid {
3821 // Specific-version DELETE: touch backend bytes only
3822 // when the entry was a real version (not a delete
3823 // marker, which has no backend bytes).
3824 let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
3825 let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
3826 key.clone()
3827 } else {
3828 versioned_shadow_key(&key, &vid)
3829 };
3830 let was_real_version = outcome
3831 .as_ref()
3832 .map(|o| !o.is_delete_marker)
3833 .unwrap_or(false);
3834 if was_real_version {
3835 // Best-effort backend cleanup; missing bytes
3836 // are not an error (e.g. shadow key already
3837 // GC'd).
3838 let backend_input = DeleteObjectInput {
3839 bucket: bucket.clone(),
3840 key: backend_target,
3841 ..Default::default()
3842 };
3843 let backend_req = S3Request {
3844 input: backend_input,
3845 method: http::Method::DELETE,
3846 uri: req.uri.clone(),
3847 headers: req.headers.clone(),
3848 extensions: http::Extensions::new(),
3849 credentials: req.credentials.clone(),
3850 region: req.region.clone(),
3851 service: req.service.clone(),
3852 trailing_headers: None,
3853 };
3854 let _ = self.backend.delete_object(backend_req).await;
3855 }
3856 let mut output = DeleteObjectOutput {
3857 version_id: Some(vid.clone()),
3858 ..Default::default()
3859 };
3860 if let Some(o) = outcome.as_ref()
3861 && o.is_delete_marker
3862 {
3863 output.delete_marker = Some(true);
3864 }
3865 // v0.6 #35: specific-version DELETE always counts as
3866 // a hard `ObjectRemoved:Delete` event (the chain
3867 // entry, marker or not, is gone after this call).
3868 self.fire_delete_notification(
3869 &bucket,
3870 &key,
3871 crate::notifications::EventType::ObjectRemovedDelete,
3872 Some(vid.clone()),
3873 );
3874 return Ok(S3Response::new(output));
3875 }
3876 // No version_id: record a delete marker (state-aware).
3877 let outcome = mgr.record_delete(&bucket, &key);
3878 if state == crate::versioning::VersioningState::Suspended {
3879 // Suspended buckets also evict the prior `<key>`
3880 // bytes (the previous null version is gone too).
3881 let backend_input = DeleteObjectInput {
3882 bucket: bucket.clone(),
3883 key: key.clone(),
3884 ..Default::default()
3885 };
3886 let backend_req = S3Request {
3887 input: backend_input,
3888 method: http::Method::DELETE,
3889 uri: req.uri.clone(),
3890 headers: req.headers.clone(),
3891 extensions: http::Extensions::new(),
3892 credentials: req.credentials.clone(),
3893 region: req.region.clone(),
3894 service: req.service.clone(),
3895 trailing_headers: None,
3896 };
3897 let _ = self.backend.delete_object(backend_req).await;
3898 }
3899 let output = DeleteObjectOutput {
3900 delete_marker: Some(true),
3901 version_id: outcome.version_id.clone(),
3902 ..Default::default()
3903 };
3904 // v0.6 #35: versioned bucket DELETE without a version-id
3905 // creates a delete marker — the dedicated AWS event
3906 // taxonomy entry. Suspended-state buckets also push a
3907 // (null) marker, so the same event fires there.
3908 self.fire_delete_notification(
3909 &bucket,
3910 &key,
3911 crate::notifications::EventType::ObjectRemovedDeleteMarker,
3912 outcome.version_id,
3913 );
3914 return Ok(S3Response::new(output));
3915 }
3916 }
3917 // Legacy / Unversioned path: physical delete on the backend +
3918 // best-effort sidecar cleanup (mirrors v0.4 behaviour).
3919 let resp = self.backend.delete_object(req).await?;
3920 // v0.5 #30: drop any per-object lock state once the delete has
3921 // succeeded so the freed key can be re-armed by a future PUT
3922 // under the bucket default. Reaching here implies the lock had
3923 // already passed `can_delete` above, so this is purely cleanup.
3924 if let Some(mgr) = self.object_lock.as_ref() {
3925 mgr.clear(&bucket, &key);
3926 }
3927 // v0.6 #39: drop any object-level tag set on physical delete —
3928 // the freed key starts a fresh tag history if a future PUT
3929 // re-creates it. (Versioned-delete branches above return early
3930 // and do NOT touch tags, mirroring AWS where tag state is
3931 // attached to the logical key, not the version chain.)
3932 if let Some(mgr) = self.tagging.as_ref() {
3933 mgr.delete_object_tags(&bucket, &key);
3934 }
3935 let sidecar = sidecar_key(&key);
3936 // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
3937 // can't be encoded into a request URI — the primary delete
3938 // already succeeded and a stale sidecar is harmless (Range GET
3939 // re-validates the underlying object on next read).
3940 if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
3941 let sidecar_input = DeleteObjectInput {
3942 bucket: bucket.clone(),
3943 key: sidecar,
3944 ..Default::default()
3945 };
3946 let sidecar_req = S3Request {
3947 input: sidecar_input,
3948 method: http::Method::DELETE,
3949 uri,
3950 headers: http::HeaderMap::new(),
3951 extensions: http::Extensions::new(),
3952 credentials: None,
3953 region: None,
3954 service: None,
3955 trailing_headers: None,
3956 };
3957 let _ = self.backend.delete_object(sidecar_req).await;
3958 }
3959 // v0.6 #35: legacy unversioned-bucket hard delete fires the
3960 // canonical `ObjectRemoved:Delete` event.
3961 self.fire_delete_notification(
3962 &bucket,
3963 &key,
3964 crate::notifications::EventType::ObjectRemovedDelete,
3965 None,
3966 );
3967 Ok(resp)
3968 }
3969 async fn delete_objects(
3970 &self,
3971 req: S3Request<DeleteObjectsInput>,
3972 ) -> S3Result<S3Response<DeleteObjectsOutput>> {
3973 // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
3974 // when MFA-Delete is on the bucket, a missing / invalid token
3975 // fails the entire DeleteObjects request, not per-object).
3976 if let Some(mgr) = self.mfa_delete.as_ref()
3977 && mgr.is_enabled(&req.input.bucket)
3978 {
3979 let header = req.input.mfa.as_deref();
3980 if let Err(e) =
3981 crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
3982 {
3983 crate::metrics::record_mfa_delete_denial(&req.input.bucket);
3984 return Err(mfa_error_to_s3(e));
3985 }
3986 }
3987 // v0.8.11 CRIT-3 fix: route every entry through the gated
3988 // per-object `delete_object` path so Object Lock, IAM policy,
3989 // versioning, tagging, sidecar cleanup and notification fan-
3990 // out all fire for batch DELETE. The previous
3991 // `self.backend.delete_objects(req).await` straight-through
3992 // bypassed every gate, so a `legal_hold=on` key listed inside
3993 // a DeleteObjects XML was happily removed.
3994 //
3995 // S3 spec note: DeleteObjects is "best-effort per object" —
3996 // a failure on one key surfaces as an `Errors` entry without
3997 // aborting the rest of the batch. Quiet-mode suppresses the
3998 // `Deleted` list (errors are still reported). We honour both.
3999 let bucket = req.input.bucket.clone();
4000 let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
4001 let mfa_header = req.input.mfa.clone();
4002 let quiet = req.input.delete.quiet.unwrap_or(false);
4003 let mut deleted: Vec<DeletedObject> = Vec::new();
4004 let mut errors: Vec<s3s::dto::Error> = Vec::new();
4005 for ident in req.input.delete.objects.iter() {
4006 let key = ident.key.clone();
4007 let version_id = ident.version_id.clone();
4008 let per_input = DeleteObjectInput {
4009 bucket: bucket.clone(),
4010 key: key.clone(),
4011 version_id: version_id.clone(),
4012 bypass_governance_retention: Some(bypass_governance),
4013 mfa: mfa_header.clone(),
4014 ..Default::default()
4015 };
4016 let per_uri = match safe_object_uri(&bucket, &key) {
4017 Ok(u) => u,
4018 Err(_) => {
4019 errors.push(s3s::dto::Error {
4020 code: Some("InvalidArgument".to_owned()),
4021 key: Some(key),
4022 message: Some("object key is not URI-encodable".to_owned()),
4023 version_id,
4024 });
4025 continue;
4026 }
4027 };
4028 let per_req = S3Request {
4029 input: per_input,
4030 method: http::Method::DELETE,
4031 uri: per_uri,
4032 headers: req.headers.clone(),
4033 extensions: http::Extensions::new(),
4034 credentials: req.credentials.clone(),
4035 region: req.region.clone(),
4036 service: req.service.clone(),
4037 trailing_headers: None,
4038 };
4039 match self.delete_object(per_req).await {
4040 Ok(resp) => {
4041 let out = resp.output;
4042 // DeleteObjectOutput doesn't surface a separate
4043 // `delete_marker_version_id`; the marker's version
4044 // id is whatever `version_id` carries (when the
4045 // versioning manager pushed a delete-marker, that
4046 // field already holds the marker's vid).
4047 let vid = out.version_id.clone().or(version_id);
4048 deleted.push(DeletedObject {
4049 key: Some(key),
4050 version_id: vid.clone(),
4051 delete_marker: out.delete_marker,
4052 delete_marker_version_id: vid,
4053 });
4054 }
4055 Err(e) => {
4056 let code_str = e.code().as_str().to_owned();
4057 let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
4058 errors.push(s3s::dto::Error {
4059 code: Some(code_str),
4060 key: Some(key),
4061 message: Some(msg),
4062 version_id,
4063 });
4064 }
4065 }
4066 }
4067 let output = DeleteObjectsOutput {
4068 deleted: if quiet || deleted.is_empty() {
4069 None
4070 } else {
4071 Some(deleted)
4072 },
4073 errors: if errors.is_empty() {
4074 None
4075 } else {
4076 Some(errors)
4077 },
4078 ..Default::default()
4079 };
4080 Ok(S3Response::new(output))
4081 }
4082 async fn copy_object(
4083 &self,
4084 mut req: S3Request<CopyObjectInput>,
4085 ) -> S3Result<S3Response<CopyObjectOutput>> {
4086 // copy is conceptually "GetObject src + PutObject dst" — enforce both.
4087 let dst_bucket = req.input.bucket.clone();
4088 let dst_key = req.input.key.clone();
4089 // v0.8.15 M-1: same reserved-name guard as `put_object`. A
4090 // copy whose destination would land at `<x>.s4index` carries
4091 // the same listing / cleanup hazards.
4092 if s4_codec::index::is_reserved_sidecar_key(&dst_key) {
4093 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
4094 .unwrap_or(S3ErrorCode::InvalidArgument);
4095 return Err(S3Error::with_message(
4096 code,
4097 format!(
4098 "destination key {dst_key:?} is reserved (suffix `{}` is used for S4 \
4099 internal sidecars)",
4100 s4_codec::index::SIDECAR_SUFFIX,
4101 ),
4102 ));
4103 }
4104 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
4105 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4106 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
4107 }
4108 // S4-aware copy: source object に s4-* metadata がある場合、それを
4109 // destination に確実に preserve する。
4110 //
4111 // - MetadataDirective::COPY (default): backend が source metadata を
4112 // そのまま copy するので S4 metadata も自動で渡る。介入不要
4113 // - MetadataDirective::REPLACE: 客が指定した metadata で source を
4114 // 上書き → s4-* metadata が消えると destination は decompress 不能に
4115 // なる (silent corruption)。S4 が source metadata を HEAD で取得し、
4116 // s4-* fields を input.metadata に強制 merge する
4117 let needs_merge = req
4118 .input
4119 .metadata_directive
4120 .as_ref()
4121 .map(|d| d.as_str() == MetadataDirective::REPLACE)
4122 .unwrap_or(false);
4123 if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4124 // v0.8.16 F-8: strip the client-supplied `s4-*` keys
4125 // *unconditionally* — the v0.8.15 M-2 fix only ran the
4126 // strip inside the `if let Ok(head) = ...` block, so a
4127 // backend HEAD failure (transient 5xx, NoSuchKey on a
4128 // racing delete) left attacker-injected `s4-*` /
4129 // `S4-*` metadata intact on the destination. Now we
4130 // strip first, then re-populate from the source HEAD
4131 // when available — HEAD failure simply means the
4132 // destination loses the codec markers (correct: a
4133 // CopyObject without the source's codec metadata
4134 // produces an unreadable object, but doesn't allow
4135 // injection).
4136 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4137 dest_meta.retain(|k, _| !k.to_ascii_lowercase().starts_with("s4-"));
4138 let head_input = HeadObjectInput {
4139 bucket: bucket.to_string(),
4140 key: key.to_string(),
4141 ..Default::default()
4142 };
4143 let head_req = S3Request {
4144 input: head_input,
4145 method: req.method.clone(),
4146 uri: req.uri.clone(),
4147 headers: req.headers.clone(),
4148 extensions: http::Extensions::new(),
4149 credentials: req.credentials.clone(),
4150 region: req.region.clone(),
4151 service: req.service.clone(),
4152 trailing_headers: None,
4153 };
4154 if let Ok(head) = self.backend.head_object(head_req).await
4155 && let Some(src_meta) = head.output.metadata.as_ref()
4156 {
4157 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4158 for key in [
4159 META_CODEC,
4160 META_ORIGINAL_SIZE,
4161 META_COMPRESSED_SIZE,
4162 META_CRC32C,
4163 META_MULTIPART,
4164 META_FRAMED,
4165 ] {
4166 if let Some(v) = src_meta.get(key) {
4167 dest_meta.insert(key.to_string(), v.clone());
4168 }
4169 }
4170 // SSE markers are equally reserved — propagate any
4171 // source flags so a copy of an encrypted object stays
4172 // marked as encrypted at the destination.
4173 for sse_key in [
4174 "s4-encrypted",
4175 "s4-sse-type",
4176 "s4-sse-c-key-md5",
4177 "s4-sse-kms-key-id",
4178 ] {
4179 if let Some(v) = src_meta.get(sse_key) {
4180 dest_meta.insert(sse_key.to_string(), v.clone());
4181 }
4182 }
4183 debug!(
4184 src_bucket = %bucket,
4185 src_key = %key,
4186 "S4 copy_object: replaced client s4-* metadata with source values across REPLACE directive (v0.8.15 M-2)"
4187 );
4188 }
4189 }
4190 self.backend.copy_object(req).await
4191 }
4192 async fn list_objects(
4193 &self,
4194 req: S3Request<ListObjectsInput>,
4195 ) -> S3Result<S3Response<ListObjectsOutput>> {
4196 self.enforce_rate_limit(&req, &req.input.bucket)?;
4197 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4198 let mut resp = self.backend.list_objects(req).await?;
4199 // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4200 // — v0.5 #34) を顧客から隠す。
4201 if let Some(contents) = resp.output.contents.as_mut() {
4202 contents.retain(|o| {
4203 o.key
4204 .as_ref()
4205 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4206 .unwrap_or(true)
4207 });
4208 }
4209 Ok(resp)
4210 }
4211 async fn list_objects_v2(
4212 &self,
4213 req: S3Request<ListObjectsV2Input>,
4214 ) -> S3Result<S3Response<ListObjectsV2Output>> {
4215 self.enforce_rate_limit(&req, &req.input.bucket)?;
4216 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4217 let mut resp = self.backend.list_objects_v2(req).await?;
4218 if let Some(contents) = resp.output.contents.as_mut() {
4219 let before = contents.len();
4220 contents.retain(|o| {
4221 o.key
4222 .as_ref()
4223 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4224 .unwrap_or(true)
4225 });
4226 // key_count も補正 (S3 spec compliance)
4227 if let Some(kc) = resp.output.key_count.as_mut() {
4228 *kc -= (before - contents.len()) as i32;
4229 }
4230 }
4231 Ok(resp)
4232 }
4233 /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4234 /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4235 /// attached AND the bucket is in a versioning-aware state, build
4236 /// the `Versions` / `DeleteMarkers` arrays directly from the
4237 /// in-memory chain (paginated + ordered the S3 way: key asc,
4238 /// version newest-first inside each key). Otherwise fall back to
4239 /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4240 async fn list_object_versions(
4241 &self,
4242 req: S3Request<ListObjectVersionsInput>,
4243 ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4244 self.enforce_rate_limit(&req, &req.input.bucket)?;
4245 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4246 // v0.5 #34: VersioningManager-owned path.
4247 if let Some(mgr) = self.versioning.as_ref()
4248 && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4249 {
4250 let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4251 let page = mgr.list_versions(
4252 &req.input.bucket,
4253 req.input.prefix.as_deref(),
4254 req.input.key_marker.as_deref(),
4255 req.input.version_id_marker.as_deref(),
4256 max_keys,
4257 );
4258 let versions: Vec<ObjectVersion> = page
4259 .versions
4260 .into_iter()
4261 .map(|e| ObjectVersion {
4262 key: Some(e.key),
4263 version_id: Some(e.version_id),
4264 is_latest: Some(e.is_latest),
4265 e_tag: Some(ETag::Strong(e.etag)),
4266 size: Some(e.size as i64),
4267 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4268 ..Default::default()
4269 })
4270 .collect();
4271 let delete_markers: Vec<DeleteMarkerEntry> = page
4272 .delete_markers
4273 .into_iter()
4274 .map(|e| DeleteMarkerEntry {
4275 key: Some(e.key),
4276 version_id: Some(e.version_id),
4277 is_latest: Some(e.is_latest),
4278 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4279 ..Default::default()
4280 })
4281 .collect();
4282 let output = ListObjectVersionsOutput {
4283 name: Some(req.input.bucket.clone()),
4284 prefix: req.input.prefix.clone(),
4285 key_marker: req.input.key_marker.clone(),
4286 version_id_marker: req.input.version_id_marker.clone(),
4287 max_keys: req.input.max_keys,
4288 versions: if versions.is_empty() {
4289 None
4290 } else {
4291 Some(versions)
4292 },
4293 delete_markers: if delete_markers.is_empty() {
4294 None
4295 } else {
4296 Some(delete_markers)
4297 },
4298 is_truncated: Some(page.is_truncated),
4299 next_key_marker: page.next_key_marker,
4300 next_version_id_marker: page.next_version_id_marker,
4301 ..Default::default()
4302 };
4303 return Ok(S3Response::new(output));
4304 }
4305 // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4306 let mut resp = self.backend.list_object_versions(req).await?;
4307 if let Some(versions) = resp.output.versions.as_mut() {
4308 versions.retain(|v| {
4309 v.key
4310 .as_ref()
4311 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4312 .unwrap_or(true)
4313 });
4314 }
4315 if let Some(markers) = resp.output.delete_markers.as_mut() {
4316 markers.retain(|m| {
4317 m.key
4318 .as_ref()
4319 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4320 .unwrap_or(true)
4321 });
4322 }
4323 Ok(resp)
4324 }
4325
4326 async fn create_multipart_upload(
4327 &self,
4328 mut req: S3Request<CreateMultipartUploadInput>,
4329 ) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
4330 // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
4331 // the destination is conceptually about to host a new object,
4332 // matching what `put_object` enforces L2078. Without this, a
4333 // bucket policy denying `s3:PutObject` was bypassable simply
4334 // by switching the client to the multipart wire path.
4335 let mp_bucket = req.input.bucket.clone();
4336 let mp_key = req.input.key.clone();
4337 // v0.8.15 M-1: reserved-name guard on the multipart entry too.
4338 if s4_codec::index::is_reserved_sidecar_key(&mp_key) {
4339 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
4340 .unwrap_or(S3ErrorCode::InvalidArgument);
4341 return Err(S3Error::with_message(
4342 code,
4343 format!(
4344 "object key {mp_key:?} is reserved (suffix `{}` is used for S4 internal \
4345 sidecars)",
4346 s4_codec::index::SIDECAR_SUFFIX,
4347 ),
4348 ));
4349 }
4350 self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
4351 self.enforce_rate_limit(&req, &mp_bucket)?;
4352 // Multipart object は per-part 圧縮 + frame 形式で書く。GET 時に
4353 // frame parse を起動するため、object metadata に flag を立てる。
4354 // codec は dispatcher の default kind を採用 (per-part 別 codec は Phase 2)。
4355 let codec_kind = self.registry.default_kind();
4356 let meta = req.input.metadata.get_or_insert_with(Default::default);
4357 meta.insert(META_MULTIPART.into(), "true".into());
4358 meta.insert(META_CODEC.into(), codec_kind.as_str().into());
4359 // v0.8 #54 BUG-10 fix: take() the SSE request fields off
4360 // `req.input` so they are NOT forwarded to the backend on
4361 // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
4362 // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
4363 // SSE-KMS with "KMS not configured" when the headers reach it.
4364 // S4 owns the encrypt-then-store contract; we capture the
4365 // recipe in `multipart_state` here and apply it on Complete.
4366 let sse_c_alg = req.input.sse_customer_algorithm.take();
4367 let sse_c_key = req.input.sse_customer_key.take();
4368 let sse_c_md5 = req.input.sse_customer_key_md5.take();
4369 let sse_header = req.input.server_side_encryption.take();
4370 let sse_kms_key = req.input.ssekms_key_id.take();
4371 // Strip the encryption-context too — leaving it would make
4372 // MinIO try to validate it against a non-existent KMS key.
4373 let _ = req.input.ssekms_encryption_context.take();
4374 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
4375 let kms_key_id = extract_kms_key_id(
4376 &sse_header,
4377 &sse_kms_key,
4378 self.kms_default_key_id.as_deref(),
4379 );
4380 // SSE-C / SSE-KMS exclusivity (mirrors put_object L1870).
4381 if sse_c_material.is_some() && kms_key_id.is_some() {
4382 return Err(S3Error::with_message(
4383 S3ErrorCode::InvalidArgument,
4384 "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
4385 ));
4386 }
4387 let sse_mode = if let Some(ref m) = sse_c_material {
4388 // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
4389 // 32-byte key in `Zeroizing` so abandoned uploads (or
4390 // normal Complete/Abort) wipe the key bytes on drop. The
4391 // `key_md5` is the public fingerprint and stays as a
4392 // bare `[u8; 16]`.
4393 crate::multipart_state::MultipartSseMode::SseC {
4394 key: zeroize::Zeroizing::new(m.key),
4395 key_md5: m.key_md5,
4396 }
4397 } else if let Some(ref kid) = kms_key_id {
4398 // KMS pre-flight: fail at Create rather than at Complete if
4399 // the gateway has no KMS backend wired (mirrors the
4400 // put_object L1879 check).
4401 if self.kms.is_none() {
4402 return Err(S3Error::with_message(
4403 S3ErrorCode::InvalidRequest,
4404 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4405 ));
4406 }
4407 crate::multipart_state::MultipartSseMode::SseKms {
4408 key_id: kid.clone(),
4409 }
4410 } else if self.sse_keyring.is_some() {
4411 // SSE-S4: server-driven transparent encryption. Activates
4412 // whenever the gateway has a keyring configured AND the
4413 // client didn't pick a different SSE mode.
4414 crate::multipart_state::MultipartSseMode::SseS4
4415 } else {
4416 crate::multipart_state::MultipartSseMode::None
4417 };
4418 // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
4419 // single-PUT path does this on PutObject; the multipart path
4420 // captures it now and commits via TagManager on Complete.
4421 let request_tags: Option<crate::tagging::TagSet> = req
4422 .input
4423 .tagging
4424 .as_deref()
4425 .map(crate::tagging::parse_tagging_header)
4426 .transpose()
4427 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
4428 // Strip the `Tagging` field off the input so the backend
4429 // doesn't try to apply it (no-op on MinIO but keeps the wire
4430 // clean).
4431 let _ = req.input.tagging.take();
4432 // Object Lock recipe (BUG-7 — captured here, applied on Complete).
4433 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
4434 .input
4435 .object_lock_mode
4436 .as_ref()
4437 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
4438 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
4439 .input
4440 .object_lock_retain_until_date
4441 .as_ref()
4442 .and_then(timestamp_to_chrono_utc);
4443 let explicit_legal_hold_on: bool = req
4444 .input
4445 .object_lock_legal_hold_status
4446 .as_ref()
4447 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
4448 .unwrap_or(false);
4449 let bucket = req.input.bucket.clone();
4450 let key = req.input.key.clone();
4451 debug!(
4452 bucket = %bucket,
4453 key = %key,
4454 codec = codec_kind.as_str(),
4455 sse = ?sse_mode,
4456 "S4 create_multipart_upload: marking object for per-part compression"
4457 );
4458 let mut resp = self.backend.create_multipart_upload(req).await?;
4459 // Stash the per-upload context only after the backend handed
4460 // us an upload_id (failed Creates leave nothing in the store).
4461 if let Some(upload_id) = resp.output.upload_id.as_ref() {
4462 self.multipart_state.put(
4463 upload_id,
4464 crate::multipart_state::MultipartUploadContext {
4465 bucket,
4466 key,
4467 sse: sse_mode.clone(),
4468 tags: request_tags,
4469 object_lock_mode: explicit_lock_mode,
4470 object_lock_retain_until: explicit_retain_until,
4471 object_lock_legal_hold: explicit_legal_hold_on,
4472 },
4473 );
4474 }
4475 // SSE-C / SSE-KMS response echo (mirrors put_object L2036-L2050).
4476 match &sse_mode {
4477 crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
4478 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
4479 resp.output.sse_customer_key_md5 =
4480 Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
4481 }
4482 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
4483 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
4484 ServerSideEncryption::AWS_KMS,
4485 ));
4486 resp.output.ssekms_key_id = Some(key_id.clone());
4487 }
4488 _ => {}
4489 }
4490 Ok(resp)
4491 }
4492
4493 async fn upload_part(
4494 &self,
4495 mut req: S3Request<UploadPartInput>,
4496 ) -> S3Result<S3Response<UploadPartOutput>> {
4497 // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
4498 // `put_object` / `create_multipart_upload`. Even though
4499 // Create already passed the gate, a bucket policy that
4500 // *revokes* `s3:PutObject` mid-flight should stop further
4501 // parts (e.g. legal hold drops, retention shortened).
4502 let part_bucket = req.input.bucket.clone();
4503 let part_key = req.input.key.clone();
4504 self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
4505 self.enforce_rate_limit(&req, &part_bucket)?;
4506 // 各 part を圧縮して frame header 付きで forward。GET 時に
4507 // `decompress_multipart` が frame iter で順に解凍する。
4508 // **per-part codec dispatch**: dispatcher が body 先頭 sample から
4509 // codec を選ぶので、parquet 風の mixed-content multipart で part ごとに
4510 // 最適 codec を使える (整数列 part → Bitcomp、text 列 part → zstd 等)。
4511 //
4512 // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
4513 // context captured by `create_multipart_upload` and (a) strip
4514 // any SSE-C request headers off `req.input` so the backend
4515 // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
4516 // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
4517 // observe that an upload context exists for `upload_id`. The
4518 // actual encrypt happens once at `complete_multipart_upload`
4519 // time on the assembled body (the per-part-encrypt approach
4520 // would require a matching multi-segment decrypt path on GET;
4521 // encrypting the whole assembled body keeps the GET path's
4522 // `is_sse_encrypted` branch in get_object L2429 working
4523 // unchanged).
4524 let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
4525 // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
4526 // The AWS S3 spec requires the same SSE-C key headers on
4527 // every UploadPart and rejects mismatches with 400. Prior to
4528 // #62 we silently stripped the headers (BUG-10 fix) without
4529 // validating them, allowing a client to send part 1 under
4530 // key-A and part 2 under key-B; both got stored, then
4531 // re-encrypted with key-A on Complete — the client thinks
4532 // part 2 is under key-B but a GET with key-B would in fact
4533 // hit the part-1 ciphertext that was actually encrypted with
4534 // key-A. That would either decrypt successfully (silent
4535 // corruption: client lost track of which key encrypts what)
4536 // or fail in a confusing way. Validate the per-part headers
4537 // now and reject with 400 InvalidArgument on mismatch /
4538 // omission / partial supply, matching real-S3 behaviour.
4539 if let Some(ref ctx) = sse_ctx {
4540 if let crate::multipart_state::MultipartSseMode::SseC {
4541 key_md5: ctx_md5, ..
4542 } = &ctx.sse
4543 {
4544 let alg = req.input.sse_customer_algorithm.take();
4545 let key_b64 = req.input.sse_customer_key.take();
4546 let md5_b64 = req.input.sse_customer_key_md5.take();
4547 match (alg, key_b64, md5_b64) {
4548 (Some(a), Some(k), Some(m)) => {
4549 // Parse + validate; if the per-part headers
4550 // are themselves malformed (algorithm not
4551 // AES256, MD5 mismatch, key not 32 bytes)
4552 // surface the same 400 the single-PUT path
4553 // would. Then compare the parsed MD5 to the
4554 // upload-context's MD5; mismatch is a
4555 // different-key UploadPart and must reject.
4556 let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
4557 .map_err(sse_c_error_to_s3)?;
4558 if part_material.key_md5 != *ctx_md5 {
4559 return Err(S3Error::with_message(
4560 S3ErrorCode::InvalidArgument,
4561 "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
4562 ));
4563 }
4564 // OK — same key as Create. Headers are
4565 // already taken off `req.input` so the
4566 // backend never sees them.
4567 }
4568 (None, None, None) => {
4569 // AWS S3 spec: SSE-C headers MUST be replayed
4570 // on every UploadPart of an SSE-C multipart.
4571 // Real-S3 returns 400 InvalidRequest in this
4572 // case; mirror that.
4573 return Err(S3Error::with_message(
4574 S3ErrorCode::InvalidRequest,
4575 "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
4576 ));
4577 }
4578 _ => {
4579 // Partial header set (e.g. algorithm + key
4580 // but no MD5) — same handling as the
4581 // single-PUT `extract_sse_c_material` helper.
4582 return Err(S3Error::with_message(
4583 S3ErrorCode::InvalidRequest,
4584 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
4585 ));
4586 }
4587 }
4588 } else {
4589 // CreateMultipartUpload was non-SSE-C (None / SseS4 /
4590 // SseKms). A part that arrives carrying SSE-C headers
4591 // is either a confused client or an attempt to
4592 // smuggle SSE-C around the gateway-internal SSE
4593 // recipe. Reject with 400 InvalidRequest rather than
4594 // silently strip — the strip would let the client
4595 // believe the part was encrypted under their key
4596 // when in fact the upload's encryption recipe is
4597 // whatever the Create captured.
4598 if req.input.sse_customer_algorithm.is_some()
4599 || req.input.sse_customer_key.is_some()
4600 || req.input.sse_customer_key_md5.is_some()
4601 {
4602 return Err(S3Error::with_message(
4603 S3ErrorCode::InvalidRequest,
4604 "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
4605 ));
4606 }
4607 }
4608 } else {
4609 // No upload context registered (gateway crashed between
4610 // Create and Part, or pre-#62 abandoned-upload restore).
4611 // We can't check key consistency in this case — strip
4612 // the headers and let the request through unchanged so
4613 // the backend's `NoSuchUpload` reply (or whatever it
4614 // chooses to do) flows back to the client.
4615 let _ = req.input.sse_customer_algorithm.take();
4616 let _ = req.input.sse_customer_key.take();
4617 let _ = req.input.sse_customer_key_md5.take();
4618 }
4619 let _sse_ctx = sse_ctx;
4620 if let Some(blob) = req.input.body.take() {
4621 let bytes = collect_blob(blob, self.max_body_bytes)
4622 .await
4623 .map_err(internal("collect upload_part body"))?;
4624 // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
4625 // checksum algorithms against the received part body.
4626 verify_client_body_checksums(
4627 &bytes,
4628 req.input.content_md5.as_deref(),
4629 req.input.checksum_crc32.as_deref(),
4630 req.input.checksum_crc32c.as_deref(),
4631 req.input.checksum_sha1.as_deref(),
4632 req.input.checksum_sha256.as_deref(),
4633 req.input.checksum_crc64nvme.as_deref(),
4634 )?;
4635 let sample_len = bytes.len().min(SAMPLE_BYTES);
4636 // v0.8 #56: full part body is already in memory here; use its
4637 // length as the size hint so the dispatcher can promote to GPU
4638 // if it's big enough.
4639 let codec_kind = self
4640 .dispatcher
4641 .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
4642 .await;
4643 let original_size = bytes.len() as u64;
4644 // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
4645 let (compress_res, tel) = self
4646 .registry
4647 .compress_with_telemetry(bytes, codec_kind)
4648 .await;
4649 stamp_gpu_compress_telemetry(&tel);
4650 let (compressed, manifest) =
4651 compress_res.map_err(internal("registry compress part"))?;
4652 let header = FrameHeader {
4653 codec: codec_kind,
4654 original_size,
4655 compressed_size: compressed.len() as u64,
4656 crc32c: manifest.crc32c,
4657 };
4658 let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
4659 write_frame(&mut framed, header, &compressed);
4660 // v0.2 #5: heuristic-based padding skip for likely-final parts.
4661 //
4662 // AWS SDK / aws-cli / boto3 always send the final (and only the
4663 // final) part below the configured part_size. So if the raw user
4664 // part is already smaller than S3's 5 MiB multipart minimum, this
4665 // is overwhelmingly likely to be the final part — and the final
4666 // part is exempt from S3's size constraint. Skipping padding here
4667 // saves up to ~5 MiB per object on highly compressible workloads.
4668 //
4669 // If a misbehaving client sends a tiny **non-final** part, S3
4670 // itself rejects with EntityTooSmall at CompleteMultipartUpload —
4671 // identical outcome to a vanilla S3 PUT, just earlier than
4672 // padding-then-complete would catch it.
4673 let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
4674 if !likely_final {
4675 pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
4676 }
4677 let framed_bytes = framed.freeze();
4678 let new_len = framed_bytes.len() as i64;
4679 // 同じ wire 互換問題が multipart にもある (content-length / checksum)
4680 req.input.content_length = Some(new_len);
4681 req.input.checksum_algorithm = None;
4682 req.input.checksum_crc32 = None;
4683 req.input.checksum_crc32c = None;
4684 req.input.checksum_crc64nvme = None;
4685 req.input.checksum_sha1 = None;
4686 req.input.checksum_sha256 = None;
4687 req.input.content_md5 = None;
4688 req.input.body = Some(bytes_to_blob(framed_bytes));
4689 debug!(
4690 part_number = ?req.input.part_number,
4691 upload_id = ?req.input.upload_id,
4692 original_size,
4693 framed_size = new_len,
4694 "S4 upload_part: framed compressed payload"
4695 );
4696 }
4697 self.backend.upload_part(req).await
4698 }
4699 async fn complete_multipart_upload(
4700 &self,
4701 mut req: S3Request<CompleteMultipartUploadInput>,
4702 ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
4703 let bucket = req.input.bucket.clone();
4704 let key = req.input.key.clone();
4705 let upload_id = req.input.upload_id.clone();
4706 // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
4707 // commit point for the multipart-assembled object).
4708 self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
4709 self.enforce_rate_limit(&req, &bucket)?;
4710 // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
4711 // at Complete time. Without this an attacker with PutObject
4712 // permission could `CreateMultipartUpload` against a key
4713 // that's currently under retention / legal hold and silently
4714 // overwrite it on Complete (the single-PUT path runs the
4715 // same check at L2007). Compliance retention is never
4716 // bypassable; Governance only with explicit IAM permission
4717 // (HIGH-7 gate below).
4718 if let Some(mgr) = self.object_lock.as_ref()
4719 && let Some(state) = mgr.get(&bucket, &key)
4720 {
4721 // CompleteMultipartUpload doesn't carry the bypass header
4722 // (the s3s DTO matches AWS' wire schema). A locked key
4723 // therefore cannot be overwritten by Complete regardless
4724 // of caller permission — operators who need to break a
4725 // Governance lock do it via PutObjectRetention before
4726 // calling Complete.
4727 let now = chrono::Utc::now();
4728 if !state.can_delete(now, false) {
4729 crate::metrics::record_policy_denial("s3:PutObject", &bucket);
4730 return Err(S3Error::with_message(
4731 S3ErrorCode::AccessDenied,
4732 "Access Denied because target key is protected by object lock",
4733 ));
4734 }
4735 }
4736 // v0.8.1 #59: serialise concurrent Complete invocations on the
4737 // same `(bucket, key)`. The race window the lock closes is the
4738 // GET-assembled-body → encrypt → PUT-encrypted-body triple
4739 // below (BUG-5 fix); without serialisation, two Completes for
4740 // different `upload_id` but the same logical key could each
4741 // read the other's plaintext assembled body and overwrite the
4742 // peer's encrypted result. The guard is held to function exit
4743 // (drop on `Ok` / `Err`), covering version-id mint, object-
4744 // lock apply, tagging persist, and replication enqueue too.
4745 let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
4746 let _completion_guard = completion_lock.lock().await;
4747 // v0.8 #54 — fetch the per-upload context captured on Create.
4748 // `None` means an abandoned / unknown upload_id (gateway
4749 // crashed between Create and Complete, or pre-v0.8 state
4750 // restore); we still let the backend do its thing for
4751 // transparency, but we can't apply any SSE / version / lock /
4752 // tag / replication post-processing because we never captured
4753 // the recipe.
4754 let ctx = self.multipart_state.get(upload_id.as_str());
4755 // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
4756 // — some clients (boto3 / aws-sdk-cpp older versions) replay
4757 // the SSE-C triple on Complete too, and MinIO will choke if
4758 // they reach the backend.
4759 let _ = req.input.sse_customer_algorithm.take();
4760 let _ = req.input.sse_customer_key.take();
4761 let _ = req.input.sse_customer_key_md5.take();
4762 let mut resp = self.backend.complete_multipart_upload(req).await?;
4763 // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
4764 // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
4765 // partial fetch path が利用可能になる (Range request の帯域節約)。
4766 // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
4767 // できれば爆速になるので 1 回の cost は payback される
4768 //
4769 // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
4770 // the SSE encrypt re-PUT + versioning shadow-key rewrite +
4771 // replication source-bytes capture, so we GET once and reuse
4772 // the bytes for every post-processing step.
4773 let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
4774 let get_input = GetObjectInput {
4775 bucket: bucket.clone(),
4776 key: key.clone(),
4777 ..Default::default()
4778 };
4779 let get_req = S3Request {
4780 input: get_input,
4781 method: http::Method::GET,
4782 uri,
4783 headers: http::HeaderMap::new(),
4784 extensions: http::Extensions::new(),
4785 credentials: None,
4786 region: None,
4787 service: None,
4788 trailing_headers: None,
4789 };
4790 match self.backend.get_object(get_req).await {
4791 Ok(get_resp) => match get_resp.output.body {
4792 Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
4793 None => None,
4794 },
4795 Err(e) => {
4796 // v0.8.4 #71 (C-1 audit fix): a silent
4797 // `Err(_) => None` here is a SSE plaintext
4798 // leak. The post-processing block below only
4799 // runs the SSE re-encrypt branch when
4800 // `assembled_body.is_some()`, so swallowing a
4801 // backend error skipped the encrypt step and
4802 // left the multipart object on disk as
4803 // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
4804 // configured buckets. Same root-cause family
4805 // as v0.8 BUG-5; this branch closes the
4806 // remaining read-side window.
4807 //
4808 // We distinguish two cases:
4809 // - `NoSuchKey`: the object is genuinely
4810 // missing post-Complete. This is rare and
4811 // typically races with a concurrent
4812 // DeleteObject; there is nothing to re-
4813 // encrypt and no SSE markers to honour, so
4814 // falling through to the legacy
4815 // `assembled_body = None` path is safe.
4816 // - everything else (5xx, network, auth,
4817 // etc.): we must FAIL the Complete so the
4818 // client can retry. Returning Ok with
4819 // `assembled_body = None` would silently
4820 // skip the SSE re-encrypt and leave the
4821 // backend bytes plaintext.
4822 if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
4823 tracing::warn!(
4824 bucket = %bucket,
4825 key = %key,
4826 "multipart Complete: backend GET returned NoSuchKey; \
4827 skipping post-processing (object likely raced with DeleteObject)"
4828 );
4829 None
4830 } else {
4831 tracing::error!(
4832 bucket = %bucket,
4833 key = %key,
4834 error = %e,
4835 "multipart Complete: backend GET failed; failing the Complete \
4836 so the client retries (silent fall-through would skip SSE \
4837 re-encrypt and store plaintext)"
4838 );
4839 return Err(internal("multipart Complete: backend body fetch failed")(e));
4840 }
4841 }
4842 }
4843 } else {
4844 None
4845 };
4846 // Sidecar build (existing behaviour, gated on assembled body).
4847 //
4848 // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
4849 // going to SSE-encrypt the assembled body before re-PUT (the
4850 // single-PUT path applies the same suppression at L2271).
4851 // Stale offsets into the pre-encrypt body would break Range
4852 // GET on the encrypted on-disk bytes. `ctx.sse != None`
4853 // covers all three SSE modes captured at Create time.
4854 let mp_will_encrypt = ctx
4855 .as_ref()
4856 .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
4857 .unwrap_or(false);
4858 // v0.8.16 F-7: versioned multipart writes the assembled body
4859 // under `versioned_shadow_key(&key, vid)` *after* this
4860 // sidecar block, then deletes the original `<key>`. Stamping
4861 // the sidecar against the to-be-deleted `<key>` (which is
4862 // what H-g did) leaves an orphan `<key>.s4index` whose
4863 // source-ETag binding can never match the live shadow body
4864 // — the Range GET fast-path's stale-sidecar check then
4865 // falls through to a full read on every request, silently
4866 // disabling partial fetch. Skip the sidecar build entirely
4867 // for versioned buckets; a follow-up issue tracks writing
4868 // the sidecar under the shadow key with the shadow's ETag.
4869 let mp_skip_sidecar_for_versioning = self
4870 .versioning
4871 .as_ref()
4872 .map(|mgr| mgr.state(&bucket))
4873 .map(|state| state == crate::versioning::VersioningState::Enabled)
4874 .unwrap_or(false);
4875 if let Some(ref body) = assembled_body
4876 && !mp_will_encrypt
4877 && !mp_skip_sidecar_for_versioning
4878 && let Ok(mut index) = build_index_from_body(body)
4879 {
4880 // v0.8.15 H-g: stamp the source-ETag / source-compressed-size
4881 // binding on the multipart sidecar. The single-PUT path
4882 // does this at L2519-L2521 via the backend's PUT response,
4883 // but Complete returns its own ETag (an opaque manifest
4884 // hash) so we have to HEAD the freshly-completed object
4885 // to pick up what backend actually wrote, then bind the
4886 // sidecar to those values. Without the binding, a
4887 // subsequent backend-side mutation (lifecycle rewrite,
4888 // out-of-band CopyObject) wouldn't trip the staleness
4889 // check on the next Range GET — the GET would happily
4890 // slice the new bytes at the old sidecar offsets, with
4891 // silent data corruption.
4892 if let Ok(uri) = safe_object_uri(&bucket, &key) {
4893 let head_req = S3Request {
4894 input: HeadObjectInput {
4895 bucket: bucket.clone(),
4896 key: key.clone(),
4897 ..Default::default()
4898 },
4899 method: http::Method::HEAD,
4900 uri,
4901 headers: http::HeaderMap::new(),
4902 extensions: http::Extensions::new(),
4903 credentials: None,
4904 region: None,
4905 service: None,
4906 trailing_headers: None,
4907 };
4908 if let Ok(head) = self.backend.head_object(head_req).await {
4909 index.source_etag = head.output.e_tag.as_ref().map(|t| t.value().to_string());
4910 index.source_compressed_size = head
4911 .output
4912 .content_length
4913 .and_then(|n| u64::try_from(n).ok());
4914 }
4915 // HEAD failure is non-fatal — the sidecar still works
4916 // as a v1-style best-effort fast path; the Range GET
4917 // simply falls back to a full read on any consistency
4918 // signal.
4919 }
4920 self.write_sidecar(&bucket, &key, &index).await;
4921 }
4922 // From here on, post-processing depends on the context —
4923 // short-circuit when the upload had no captured recipe
4924 // (legacy / crashed-Create / pre-v0.8 state restore).
4925 if let Some(ctx) = ctx {
4926 // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
4927 // is versioning-Enabled. The single-PUT path does this in
4928 // `put_object` ~L1968; multipart was the missing branch.
4929 // We mint here (post-Complete, before any re-PUT) so the
4930 // same vid threads into both the shadow-key rewrite and
4931 // the VersionEntry the manager records.
4932 let pending_version: Option<crate::versioning::PutOutcome> = self
4933 .versioning
4934 .as_ref()
4935 .map(|mgr| mgr.state(&bucket))
4936 .map(|state| match state {
4937 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
4938 version_id: crate::versioning::VersioningManager::new_version_id(),
4939 versioned_response: true,
4940 },
4941 crate::versioning::VersioningState::Suspended
4942 | crate::versioning::VersioningState::Unversioned => {
4943 crate::versioning::PutOutcome {
4944 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
4945 versioned_response: false,
4946 }
4947 }
4948 });
4949 // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
4950 // and re-PUT it to the backend so the on-disk bytes are
4951 // SSE-encrypted. The single-PUT path does this body-by-
4952 // body inside `put_object` (L1907-L1942); for multipart,
4953 // encrypt-per-part would require a multi-segment decrypt
4954 // path on GET — we instead do a single encrypt over the
4955 // assembled framed body so the existing GET decrypt
4956 // branch (`is_sse_encrypted` → `decrypt(body, source)` →
4957 // FrameIter) handles it unchanged.
4958 //
4959 // The cost is one extra round-trip per Complete for SSE-
4960 // enabled multipart (already-paid for the sidecar build).
4961 // For single-instance gateways pointing at a co-located
4962 // backend this is negligible; cross-region operators
4963 // would benefit from per-part encrypt + multi-segment
4964 // decrypt as a follow-up.
4965 let needs_re_put = matches!(
4966 ctx.sse,
4967 crate::multipart_state::MultipartSseMode::SseS4
4968 | crate::multipart_state::MultipartSseMode::SseC { .. }
4969 | crate::multipart_state::MultipartSseMode::SseKms { .. }
4970 ) || pending_version
4971 .as_ref()
4972 .map(|pv| pv.versioned_response)
4973 .unwrap_or(false);
4974 // v0.8.11 CRIT-2 fix: seed the replication body with the
4975 // pre-encrypt assembled bytes, but overwrite it with the
4976 // post-encrypt `new_body` once the re-PUT branch lands.
4977 // The previous "snapshot in advance" pattern shipped the
4978 // *plaintext* framed body to the destination bucket even
4979 // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
4980 // the destination would then fail to decrypt (or, worse,
4981 // succeed in handing out plaintext that the source had
4982 // promised was encrypted at rest). When `needs_re_put`
4983 // is false (no SSE, no versioning), the backend still
4984 // holds the original plaintext-framed bytes, and the
4985 // seed value is what the destination should receive.
4986 let mut replication_body = assembled_body.clone();
4987 let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
4988 if needs_re_put && let Some(body) = assembled_body {
4989 // v0.8.1 #58: same Zeroizing pattern as put_object's
4990 // single-PUT KMS branch — DEK plaintext lives in
4991 // `Zeroizing<[u8; 32]>` for the lifetime of this
4992 // Complete handler, then is wiped on drop.
4993 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
4994 if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
4995 {
4996 let kms = self.kms.as_ref().ok_or_else(|| {
4997 S3Error::with_message(
4998 S3ErrorCode::InvalidRequest,
4999 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
5000 )
5001 })?;
5002 let (dek, wrapped) =
5003 kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
5004 if dek.len() != 32 {
5005 return Err(S3Error::with_message(
5006 S3ErrorCode::InternalError,
5007 format!(
5008 "KMS backend returned a DEK of {} bytes (expected 32)",
5009 dek.len()
5010 ),
5011 ));
5012 }
5013 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
5014 zeroize::Zeroizing::new([0u8; 32]);
5015 dek_arr.copy_from_slice(&dek);
5016 // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
5017 Some((dek_arr, wrapped))
5018 } else {
5019 None
5020 };
5021 // Build the new metadata map: re-fetch via HEAD so
5022 // the multipart / codec markers the backend stamped
5023 // on Create flow through unchanged, then layer the
5024 // SSE markers on top.
5025 let head_req = S3Request {
5026 input: HeadObjectInput {
5027 bucket: bucket.clone(),
5028 key: key.clone(),
5029 ..Default::default()
5030 },
5031 method: http::Method::HEAD,
5032 uri: safe_object_uri(&bucket, &key)?,
5033 headers: http::HeaderMap::new(),
5034 extensions: http::Extensions::new(),
5035 credentials: None,
5036 region: None,
5037 service: None,
5038 trailing_headers: None,
5039 };
5040 let mut new_metadata: std::collections::HashMap<String, String> =
5041 match self.backend.head_object(head_req).await {
5042 Ok(h) => h.output.metadata.unwrap_or_default(),
5043 Err(_) => std::collections::HashMap::new(),
5044 };
5045 let new_body = match &ctx.sse {
5046 crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
5047 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5048 new_metadata.insert("s4-sse-type".into(), "AES256".into());
5049 new_metadata.insert(
5050 "s4-sse-c-key-md5".into(),
5051 base64::engine::general_purpose::STANDARD.encode(key_md5),
5052 );
5053 // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
5054 // auto-deref through one explicit binding so
5055 // `SseSource::CustomerKey` gets the `&[u8; 32]`
5056 // it expects (mirrors the SSE-KMS DEK shape
5057 // a few lines down).
5058 let key_ref: &[u8; 32] = key;
5059 crate::sse::encrypt_with_source(
5060 &body,
5061 crate::sse::SseSource::CustomerKey {
5062 key: key_ref,
5063 key_md5,
5064 },
5065 )
5066 }
5067 crate::multipart_state::MultipartSseMode::SseKms { .. } => {
5068 let (dek, wrapped) = kms_wrap
5069 .as_ref()
5070 .expect("SseKms branch implies kms_wrap is Some");
5071 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5072 new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
5073 new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
5074 // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
5075 // to `&[u8; 32]` (same shape as the put_object
5076 // single-PUT branch).
5077 let dek_ref: &[u8; 32] = dek;
5078 crate::sse::encrypt_with_source(
5079 &body,
5080 crate::sse::SseSource::Kms {
5081 dek: dek_ref,
5082 wrapped,
5083 },
5084 )
5085 }
5086 crate::multipart_state::MultipartSseMode::SseS4 => {
5087 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
5088 S3Error::with_message(
5089 S3ErrorCode::InternalError,
5090 "SSE-S4 captured at Create but keyring missing at Complete",
5091 )
5092 })?;
5093 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5094 // SSE-S4 deliberately omits `s4-sse-type` so
5095 // HEAD doesn't falsely advertise AWS-style
5096 // SSE-S3 (matches the put_object L1929-L1939
5097 // comment).
5098 // v0.8 #52: same chunk_size dispatch as the
5099 // single-PUT branch — multipart Complete
5100 // re-encrypts the assembled body, so honoring
5101 // the chunked path here is required to keep
5102 // GET streaming on multipart-uploaded objects.
5103 if self.sse_chunk_size > 0 {
5104 crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
5105 .map_err(|e| {
5106 S3Error::with_message(
5107 S3ErrorCode::InternalError,
5108 format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
5109 )
5110 })?
5111 } else {
5112 crate::sse::encrypt_v2(&body, keyring)
5113 }
5114 }
5115 crate::multipart_state::MultipartSseMode::None => body.clone(),
5116 };
5117 // v0.8 #54 BUG-6 fix: write the re-PUT under the
5118 // shadow key so the version chain doesn't overwrite
5119 // the previous version on a versioned bucket. The
5120 // original (unshadowed) key was assembled by the
5121 // backend on Complete; we delete it after the shadow
5122 // PUT lands.
5123 let put_target_key = if let Some(pv) = pending_version.as_ref() {
5124 if pv.versioned_response {
5125 versioned_shadow_key(&key, &pv.version_id)
5126 } else {
5127 key.clone()
5128 }
5129 } else {
5130 key.clone()
5131 };
5132 let new_body_len = new_body.len() as i64;
5133 let put_req = S3Request {
5134 input: PutObjectInput {
5135 bucket: bucket.clone(),
5136 key: put_target_key.clone(),
5137 body: Some(bytes_to_blob(new_body.clone())),
5138 metadata: Some(new_metadata.clone()),
5139 content_length: Some(new_body_len),
5140 ..Default::default()
5141 },
5142 method: http::Method::PUT,
5143 uri: safe_object_uri(&bucket, &put_target_key)?,
5144 headers: http::HeaderMap::new(),
5145 extensions: http::Extensions::new(),
5146 credentials: None,
5147 region: None,
5148 service: None,
5149 trailing_headers: None,
5150 };
5151 self.backend.put_object(put_req).await?;
5152 // v0.8.11 CRIT-2 fix: refresh the replication snapshot
5153 // with the bytes that were actually persisted to the
5154 // backend (post-SSE-encrypt for SSE modes; identical to
5155 // `body` for `MultipartSseMode::None` + versioning-only
5156 // re-PUT). The destination then sees the same on-disk
5157 // shape the source does, and a destination GET decrypts
5158 // correctly when SSE is on.
5159 replication_body = Some(new_body.clone());
5160 // If we rewrote the storage key (versioning shadow),
5161 // we must drop the original (unshadowed) Complete-
5162 // assembled bytes so subsequent listings don't see a
5163 // duplicate.
5164 if put_target_key != key {
5165 let del_req = S3Request {
5166 input: DeleteObjectInput {
5167 bucket: bucket.clone(),
5168 key: key.clone(),
5169 ..Default::default()
5170 },
5171 method: http::Method::DELETE,
5172 uri: safe_object_uri(&bucket, &key)?,
5173 headers: http::HeaderMap::new(),
5174 extensions: http::Extensions::new(),
5175 credentials: None,
5176 region: None,
5177 service: None,
5178 trailing_headers: None,
5179 };
5180 let _ = self.backend.delete_object(del_req).await;
5181 }
5182 applied_metadata = Some(new_metadata);
5183 }
5184 // v0.8 #54 BUG-6 commit: register the new version with
5185 // the VersioningManager so list_object_versions /
5186 // GET ?versionId= see it.
5187 if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
5188 let etag = resp
5189 .output
5190 .e_tag
5191 .clone()
5192 .map(ETag::into_value)
5193 .unwrap_or_default();
5194 let now = chrono::Utc::now();
5195 mgr.commit_put_with_version(
5196 &bucket,
5197 &key,
5198 crate::versioning::VersionEntry {
5199 version_id: pv.version_id.clone(),
5200 etag,
5201 size: replication_body
5202 .as_ref()
5203 .map(|b| b.len() as u64)
5204 .unwrap_or(0),
5205 is_delete_marker: false,
5206 created_at: now,
5207 },
5208 );
5209 if pv.versioned_response {
5210 resp.output.version_id = Some(pv.version_id.clone());
5211 }
5212 }
5213 // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
5214 // recipe + auto-apply the bucket default. Mirrors the
5215 // put_object L2057-L2074 block.
5216 if let Some(mgr) = self.object_lock.as_ref() {
5217 if ctx.object_lock_mode.is_some()
5218 || ctx.object_lock_retain_until.is_some()
5219 || ctx.object_lock_legal_hold
5220 {
5221 let mut state = mgr.get(&bucket, &key).unwrap_or_default();
5222 if let Some(m) = ctx.object_lock_mode {
5223 state.mode = Some(m);
5224 }
5225 if let Some(u) = ctx.object_lock_retain_until {
5226 state.retain_until = Some(u);
5227 }
5228 if ctx.object_lock_legal_hold {
5229 state.legal_hold_on = true;
5230 }
5231 mgr.set(&bucket, &key, state);
5232 }
5233 mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5234 }
5235 // v0.8 #54 BUG-9 fix: persist the captured tags via the
5236 // TagManager so GetObjectTagging returns them.
5237 if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5238 mgr.put_object_tags(&bucket, &key, tags.clone());
5239 }
5240 // SSE-C / SSE-KMS response echo. The
5241 // CompleteMultipartUploadOutput only exposes
5242 // `server_side_encryption` + `ssekms_key_id` (no
5243 // sse_customer_* — those round-tripped on Create / parts).
5244 match &ctx.sse {
5245 crate::multipart_state::MultipartSseMode::SseC { .. } => {
5246 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5247 ServerSideEncryption::AES256,
5248 ));
5249 }
5250 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5251 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5252 ServerSideEncryption::AWS_KMS,
5253 ));
5254 resp.output.ssekms_key_id = Some(key_id.clone());
5255 }
5256 _ => {}
5257 }
5258 // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5259 // like put_object L2165 does. We hand the dispatcher the
5260 // assembled body bytes (post-encrypt where applicable, so
5261 // the destination ends up byte-identical to the source's
5262 // on-disk shape) plus the metadata that was actually
5263 // committed.
5264 let replication_body_bytes = replication_body.unwrap_or_default();
5265 // v0.8.2 #61: thread the multipart-Complete `pending_version`
5266 // through so a versioning-Enabled source's destination
5267 // receives the same shadow-key path (mirror of the
5268 // single-PUT branch above).
5269 self.spawn_replication_if_matched(
5270 &bucket,
5271 &key,
5272 &ctx.tags,
5273 &replication_body_bytes,
5274 &applied_metadata,
5275 true,
5276 pending_version.as_ref(),
5277 );
5278 self.multipart_state.remove(upload_id.as_str());
5279 }
5280 // v0.8.1 #59 janitor: best-effort sweep of stale completion
5281 // locks while we are still on the critical path of a single
5282 // Complete (so steady-state workloads of unique keys don't
5283 // accumulate `DashMap` entries). The sweep only retires
5284 // entries whose `Arc::strong_count == 1`, so any other in-
5285 // flight Complete on a different key keeps its lock alive.
5286 // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5287 // alive across this call; it's reaped on the next Complete or
5288 // the next caller-driven prune.
5289 self.multipart_state.prune_completion_locks();
5290 Ok(resp)
5291 }
5292 async fn abort_multipart_upload(
5293 &self,
5294 req: S3Request<AbortMultipartUploadInput>,
5295 ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5296 // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5297 // — the AWS-spec action verb for this operation. Without the
5298 // gate, anyone who could guess an upload_id could throw away
5299 // someone else's in-flight multipart upload.
5300 let abort_bucket = req.input.bucket.clone();
5301 let abort_key = req.input.key.clone();
5302 self.enforce_policy(
5303 &req,
5304 "s3:AbortMultipartUpload",
5305 &abort_bucket,
5306 Some(&abort_key),
5307 )?;
5308 // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5309 // set) promptly so an aborted upload doesn't leak the
5310 // customer's key into a long-running gateway's RSS.
5311 //
5312 // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5313 // FIRST, then drop in-process state ONLY on success. The
5314 // previous order ("remove → call backend") meant a transient
5315 // backend abort failure (5xx, network) wiped the SSE-C key
5316 // bytes locally while leaving the parts on the backend, so a
5317 // client retry would have to re-validate the SSE-C key against
5318 // a context the gateway no longer has — and the retried abort
5319 // would still hit the unaborted backend parts. Calling the
5320 // backend first lets the failure propagate to the client with
5321 // state intact for a clean retry; only on success do we wipe
5322 // the local state.
5323 let upload_id = req.input.upload_id.as_str().to_owned();
5324 let resp = self.backend.abort_multipart_upload(req).await?;
5325 self.multipart_state.remove(&upload_id);
5326 Ok(resp)
5327 }
5328 async fn list_multipart_uploads(
5329 &self,
5330 req: S3Request<ListMultipartUploadsInput>,
5331 ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5332 self.backend.list_multipart_uploads(req).await
5333 }
5334 async fn list_parts(
5335 &self,
5336 req: S3Request<ListPartsInput>,
5337 ) -> S3Result<S3Response<ListPartsOutput>> {
5338 self.backend.list_parts(req).await
5339 }
5340
// =========================================================================
// Phase 2 — pure passthrough delegations. S4 has no compression hook for
// these, so behaviour is identical to the backend (= AWS S3).
//
// Known limitations:
// - copy_object / upload_part_copy: when the source object is S4-compressed,
//   the backend only copies the bytes, so the metadata (s4-codec etc.) is
//   copied along with them (AWS S3 default = MetadataDirective COPY). GET can
//   still decompress correctly via the manifest. If MetadataDirective REPLACE
//   overwrites the metadata, the compression metadata is lost and the object
//   breaks — callers must take care of this operationally.
// - list_object_versions: on a versioning-enabled bucket every version keeps
//   its S4 metadata, so older versions are also served correctly through S4.
// =========================================================================
5354
5355 // ---- Object ACL / tagging / attributes ----
5356 async fn get_object_acl(
5357 &self,
5358 req: S3Request<GetObjectAclInput>,
5359 ) -> S3Result<S3Response<GetObjectAclOutput>> {
5360 self.backend.get_object_acl(req).await
5361 }
5362 async fn put_object_acl(
5363 &self,
5364 req: S3Request<PutObjectAclInput>,
5365 ) -> S3Result<S3Response<PutObjectAclOutput>> {
5366 self.backend.put_object_acl(req).await
5367 }
5368 // v0.6 #39: object tagging — when a `TagManager` is attached the
5369 // configuration / per-(bucket, key) state lives in the manager and
5370 // these handlers serve directly from it; when no manager is
5371 // attached they fall back to the backend (legacy passthrough so
5372 // v0.5 deployments are unaffected).
5373 async fn get_object_tagging(
5374 &self,
5375 req: S3Request<GetObjectTaggingInput>,
5376 ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5377 let Some(mgr) = self.tagging.as_ref() else {
5378 return self.backend.get_object_tagging(req).await;
5379 };
5380 let tags = mgr
5381 .get_object_tags(&req.input.bucket, &req.input.key)
5382 .unwrap_or_default();
5383 Ok(S3Response::new(GetObjectTaggingOutput {
5384 tag_set: tagset_to_aws(&tags),
5385 ..Default::default()
5386 }))
5387 }
5388 async fn put_object_tagging(
5389 &self,
5390 req: S3Request<PutObjectTaggingInput>,
5391 ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
5392 let Some(mgr) = self.tagging.as_ref() else {
5393 return self.backend.put_object_tagging(req).await;
5394 };
5395 let bucket = req.input.bucket.clone();
5396 let key = req.input.key.clone();
5397 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
5398 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5399 // v0.6 #39: gate via IAM policy with both the request tags
5400 // (`s3:RequestObjectTag/<key>`) and any existing tags on the
5401 // target object (`s3:ExistingObjectTag/<key>`).
5402 let existing = mgr.get_object_tags(&bucket, &key);
5403 self.enforce_policy_with_extra(
5404 &req,
5405 "s3:PutObjectTagging",
5406 &bucket,
5407 Some(&key),
5408 Some(&parsed),
5409 existing.as_ref(),
5410 )?;
5411 mgr.put_object_tags(&bucket, &key, parsed);
5412 Ok(S3Response::new(PutObjectTaggingOutput::default()))
5413 }
5414 async fn delete_object_tagging(
5415 &self,
5416 req: S3Request<DeleteObjectTaggingInput>,
5417 ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
5418 let Some(mgr) = self.tagging.as_ref() else {
5419 return self.backend.delete_object_tagging(req).await;
5420 };
5421 let bucket = req.input.bucket.clone();
5422 let key = req.input.key.clone();
5423 let existing = mgr.get_object_tags(&bucket, &key);
5424 self.enforce_policy_with_extra(
5425 &req,
5426 "s3:DeleteObjectTagging",
5427 &bucket,
5428 Some(&key),
5429 None,
5430 existing.as_ref(),
5431 )?;
5432 mgr.delete_object_tags(&bucket, &key);
5433 Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
5434 }
5435 async fn get_object_attributes(
5436 &self,
5437 req: S3Request<GetObjectAttributesInput>,
5438 ) -> S3Result<S3Response<GetObjectAttributesOutput>> {
5439 self.backend.get_object_attributes(req).await
5440 }
5441 async fn restore_object(
5442 &self,
5443 req: S3Request<RestoreObjectInput>,
5444 ) -> S3Result<S3Response<RestoreObjectOutput>> {
5445 self.backend.restore_object(req).await
5446 }
5447 async fn upload_part_copy(
5448 &self,
5449 req: S3Request<UploadPartCopyInput>,
5450 ) -> S3Result<S3Response<UploadPartCopyOutput>> {
5451 // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
5452 // destination PUT + source GET.
5453 let dst_bucket = req.input.bucket.clone();
5454 let dst_key = req.input.key.clone();
5455 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
5456 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
5457 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
5458 }
5459 self.enforce_rate_limit(&req, &dst_bucket)?;
5460 // v0.2 #6: byte-range aware copy when the source is S4-framed.
5461 //
5462 // For a framed source (multipart upload OR single-PUT framed-v2),
5463 // a naive byte-range passthrough would copy compressed bytes that
5464 // don't align with S4 frame boundaries — silently corrupting the
5465 // result. Instead we GET the source through S4 (which handles
5466 // decompression + Range), re-compress + re-frame as a new part,
5467 // and forward as upload_part. For non-framed sources (S4-untouched
5468 // raw objects), passthrough is correct and we keep the original
5469 // (cheaper) code path.
5470 // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
5471 // copy-source header. Without this, a versioned source bucket
5472 // copy that pins a specific old version would silently fall
5473 // back to "latest", assembling wrong bytes into the destination
5474 // multipart object (silent data corruption).
5475 let CopySource::Bucket {
5476 bucket: src_bucket,
5477 key: src_key,
5478 version_id: src_version_id,
5479 } = &req.input.copy_source
5480 else {
5481 return self.backend.upload_part_copy(req).await;
5482 };
5483 let src_bucket = src_bucket.to_string();
5484 let src_key = src_key.to_string();
5485 let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);
5486
5487 // Probe metadata to decide whether the source needs S4-aware copy.
5488 let head_input = HeadObjectInput {
5489 bucket: src_bucket.clone(),
5490 key: src_key.clone(),
5491 version_id: src_version_id.clone(),
5492 ..Default::default()
5493 };
5494 let head_req = S3Request {
5495 input: head_input,
5496 method: http::Method::HEAD,
5497 uri: req.uri.clone(),
5498 headers: req.headers.clone(),
5499 extensions: http::Extensions::new(),
5500 credentials: req.credentials.clone(),
5501 region: req.region.clone(),
5502 service: req.service.clone(),
5503 trailing_headers: None,
5504 };
5505 let needs_s4_copy = match self.backend.head_object(head_req).await {
5506 Ok(h) => {
5507 is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
5508 }
5509 Err(_) => false,
5510 };
5511 if !needs_s4_copy {
5512 return self.backend.upload_part_copy(req).await;
5513 }
5514
5515 // Resolve the optional source byte range to pass to GET.
5516 let source_range = req
5517 .input
5518 .copy_source_range
5519 .as_ref()
5520 .map(|r| parse_copy_source_range(r))
5521 .transpose()
5522 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
5523
5524 // GET source via S4 (handles decompression + sidecar partial fetch
5525 // when range is present). The result is the requested user-visible
5526 // byte range, fully decompressed. version_id is propagated so
5527 // pinned-version copies fetch the exact version requested.
5528 let mut get_input = GetObjectInput {
5529 bucket: src_bucket.clone(),
5530 key: src_key.clone(),
5531 version_id: src_version_id.clone(),
5532 ..Default::default()
5533 };
5534 get_input.range = source_range;
5535 let get_req = S3Request {
5536 input: get_input,
5537 method: http::Method::GET,
5538 uri: req.uri.clone(),
5539 headers: req.headers.clone(),
5540 extensions: http::Extensions::new(),
5541 credentials: req.credentials.clone(),
5542 region: req.region.clone(),
5543 service: req.service.clone(),
5544 trailing_headers: None,
5545 };
5546 let get_resp = self.get_object(get_req).await?;
5547 let blob = get_resp.output.body.ok_or_else(|| {
5548 S3Error::with_message(
5549 S3ErrorCode::InternalError,
5550 "upload_part_copy: empty body from source GET",
5551 )
5552 })?;
5553 let bytes = collect_blob(blob, self.max_body_bytes)
5554 .await
5555 .map_err(internal("collect upload_part_copy source body"))?;
5556
5557 // Compress + frame as a fresh part (mirrors upload_part path).
5558 let sample_len = bytes.len().min(SAMPLE_BYTES);
5559 // v0.8 #56: same size-hint promotion as the upload_part path.
5560 let codec_kind = self
5561 .dispatcher
5562 .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
5563 .await;
5564 let original_size = bytes.len() as u64;
5565 // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
5566 let (compress_res, tel) = self
5567 .registry
5568 .compress_with_telemetry(bytes, codec_kind)
5569 .await;
5570 stamp_gpu_compress_telemetry(&tel);
5571 let (compressed, manifest) =
5572 compress_res.map_err(internal("registry compress upload_part_copy"))?;
5573 let header = FrameHeader {
5574 codec: codec_kind,
5575 original_size,
5576 compressed_size: compressed.len() as u64,
5577 crc32c: manifest.crc32c,
5578 };
5579 let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
5580 write_frame(&mut framed, header, &compressed);
5581 let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
5582 if !likely_final {
5583 pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
5584 }
5585 let framed_bytes = framed.freeze();
5586 let framed_len = framed_bytes.len() as i64;
5587
5588 // Forward as upload_part to the destination multipart upload.
5589 let part_input = UploadPartInput {
5590 bucket: req.input.bucket.clone(),
5591 key: req.input.key.clone(),
5592 part_number: req.input.part_number,
5593 upload_id: req.input.upload_id.clone(),
5594 body: Some(bytes_to_blob(framed_bytes)),
5595 content_length: Some(framed_len),
5596 ..Default::default()
5597 };
5598 let part_req = S3Request {
5599 input: part_input,
5600 method: http::Method::PUT,
5601 uri: req.uri.clone(),
5602 headers: req.headers.clone(),
5603 extensions: http::Extensions::new(),
5604 credentials: req.credentials.clone(),
5605 region: req.region.clone(),
5606 service: req.service.clone(),
5607 trailing_headers: None,
5608 };
5609 let upload_resp = self.backend.upload_part(part_req).await?;
5610
5611 let copy_output = UploadPartCopyOutput {
5612 copy_part_result: Some(CopyPartResult {
5613 e_tag: upload_resp.output.e_tag.clone(),
5614 ..Default::default()
5615 }),
5616 ..Default::default()
5617 };
5618 Ok(S3Response::new(copy_output))
5619 }
5620
5621 // ---- Object lock / retention / legal hold (v0.5 #30) ----
5622 //
5623 // When an `ObjectLockManager` is attached the configuration / per-object
5624 // state lives in the manager and these handlers serve directly from it;
5625 // when no manager is attached they fall back to the backend (legacy
5626 // passthrough so v0.4 deployments are unaffected).
5627 async fn get_object_lock_configuration(
5628 &self,
5629 req: S3Request<GetObjectLockConfigurationInput>,
5630 ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
5631 self.enforce_policy(
5632 &req,
5633 "s3:GetBucketObjectLockConfiguration",
5634 &req.input.bucket,
5635 None,
5636 )?;
5637 if let Some(mgr) = self.object_lock.as_ref() {
5638 let cfg = mgr
5639 .bucket_default(&req.input.bucket)
5640 .map(|d| ObjectLockConfiguration {
5641 object_lock_enabled: Some(ObjectLockEnabled::from_static(
5642 ObjectLockEnabled::ENABLED,
5643 )),
5644 rule: Some(ObjectLockRule {
5645 default_retention: Some(DefaultRetention {
5646 days: Some(d.retention_days as i32),
5647 mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
5648 crate::object_lock::LockMode::Governance => {
5649 ObjectLockRetentionMode::GOVERNANCE
5650 }
5651 crate::object_lock::LockMode::Compliance => {
5652 ObjectLockRetentionMode::COMPLIANCE
5653 }
5654 })),
5655 years: None,
5656 }),
5657 }),
5658 });
5659 let output = GetObjectLockConfigurationOutput {
5660 object_lock_configuration: cfg,
5661 };
5662 return Ok(S3Response::new(output));
5663 }
5664 self.backend.get_object_lock_configuration(req).await
5665 }
5666 async fn put_object_lock_configuration(
5667 &self,
5668 req: S3Request<PutObjectLockConfigurationInput>,
5669 ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
5670 self.enforce_policy(
5671 &req,
5672 "s3:PutBucketObjectLockConfiguration",
5673 &req.input.bucket,
5674 None,
5675 )?;
5676 if let Some(mgr) = self.object_lock.as_ref() {
5677 let bucket = req.input.bucket.clone();
5678 if let Some(cfg) = req.input.object_lock_configuration.as_ref()
5679 && let Some(rule) = cfg.rule.as_ref()
5680 && let Some(d) = rule.default_retention.as_ref()
5681 {
5682 let mode = d
5683 .mode
5684 .as_ref()
5685 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
5686 .ok_or_else(|| {
5687 S3Error::with_message(
5688 S3ErrorCode::InvalidRequest,
5689 "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
5690 )
5691 })?;
5692 // S3 spec: exactly one of Days / Years (we accept Days
5693 // outright and convert Years → Days for storage; Years
5694 // is just a UX shorthand on the wire).
5695 let days: u32 = match (d.days, d.years) {
5696 (Some(d), None) if d > 0 => d as u32,
5697 (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
5698 _ => {
5699 return Err(S3Error::with_message(
5700 S3ErrorCode::InvalidRequest,
5701 "Object Lock default retention requires exactly one of Days or Years (positive integer)",
5702 ));
5703 }
5704 };
5705 mgr.set_bucket_default(
5706 &bucket,
5707 crate::object_lock::BucketObjectLockDefault {
5708 mode,
5709 retention_days: days,
5710 },
5711 );
5712 }
5713 return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
5714 }
5715 self.backend.put_object_lock_configuration(req).await
5716 }
5717 async fn get_object_legal_hold(
5718 &self,
5719 req: S3Request<GetObjectLegalHoldInput>,
5720 ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
5721 let key = req.input.key.clone();
5722 self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
5723 if let Some(mgr) = self.object_lock.as_ref() {
5724 let on = mgr
5725 .get(&req.input.bucket, &req.input.key)
5726 .map(|s| s.legal_hold_on)
5727 .unwrap_or(false);
5728 let status = ObjectLockLegalHoldStatus::from_static(if on {
5729 ObjectLockLegalHoldStatus::ON
5730 } else {
5731 ObjectLockLegalHoldStatus::OFF
5732 });
5733 let output = GetObjectLegalHoldOutput {
5734 legal_hold: Some(ObjectLockLegalHold {
5735 status: Some(status),
5736 }),
5737 };
5738 return Ok(S3Response::new(output));
5739 }
5740 self.backend.get_object_legal_hold(req).await
5741 }
5742 async fn put_object_legal_hold(
5743 &self,
5744 req: S3Request<PutObjectLegalHoldInput>,
5745 ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
5746 let key = req.input.key.clone();
5747 self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
5748 if let Some(mgr) = self.object_lock.as_ref() {
5749 let on = req
5750 .input
5751 .legal_hold
5752 .as_ref()
5753 .and_then(|h| h.status.as_ref())
5754 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
5755 .unwrap_or(false);
5756 mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
5757 return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
5758 }
5759 self.backend.put_object_legal_hold(req).await
5760 }
5761 async fn get_object_retention(
5762 &self,
5763 req: S3Request<GetObjectRetentionInput>,
5764 ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
5765 let key = req.input.key.clone();
5766 self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
5767 if let Some(mgr) = self.object_lock.as_ref() {
5768 let retention = mgr
5769 .get(&req.input.bucket, &req.input.key)
5770 .filter(|s| s.mode.is_some() || s.retain_until.is_some())
5771 .map(|s| {
5772 let mode = s.mode.map(|m| {
5773 ObjectLockRetentionMode::from_static(match m {
5774 crate::object_lock::LockMode::Governance => {
5775 ObjectLockRetentionMode::GOVERNANCE
5776 }
5777 crate::object_lock::LockMode::Compliance => {
5778 ObjectLockRetentionMode::COMPLIANCE
5779 }
5780 })
5781 });
5782 let until = s.retain_until.map(chrono_utc_to_timestamp);
5783 ObjectLockRetention {
5784 mode,
5785 retain_until_date: until,
5786 }
5787 });
5788 let output = GetObjectRetentionOutput { retention };
5789 return Ok(S3Response::new(output));
5790 }
5791 self.backend.get_object_retention(req).await
5792 }
5793 async fn put_object_retention(
5794 &self,
5795 req: S3Request<PutObjectRetentionInput>,
5796 ) -> S3Result<S3Response<PutObjectRetentionOutput>> {
5797 let key = req.input.key.clone();
5798 self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
5799 if let Some(mgr) = self.object_lock.as_ref() {
5800 let bucket = req.input.bucket.clone();
5801 let key = req.input.key.clone();
5802 // v0.8.12 HIGH-7 fix: the bypass header gates Governance
5803 // shortening only when the caller has the matching IAM
5804 // action explicitly allowed; otherwise it's silently
5805 // dropped to `false` and the "shortening Governance
5806 // requires bypass" branch below rejects.
5807 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
5808 let bypass = if bypass_header {
5809 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
5810 .is_ok()
5811 } else {
5812 false
5813 };
5814 let retention = req.input.retention.as_ref().ok_or_else(|| {
5815 S3Error::with_message(
5816 S3ErrorCode::InvalidRequest,
5817 "PutObjectRetention requires a Retention element",
5818 )
5819 })?;
5820 let new_mode = retention
5821 .mode
5822 .as_ref()
5823 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
5824 let new_until = retention
5825 .retain_until_date
5826 .as_ref()
5827 .map(timestamp_to_chrono_utc)
5828 .unwrap_or(None);
5829 let now = chrono::Utc::now();
5830 let existing = mgr.get(&bucket, &key).unwrap_or_default();
5831 // S3 immutability rules:
5832 // - Compliance is one-way: once set, mode cannot move to
5833 // Governance, and retain-until cannot be shortened.
5834 // - Governance can be lengthened freely; shortened only
5835 // with bypass=true.
5836 if let Some(existing_mode) = existing.mode
5837 && existing_mode == crate::object_lock::LockMode::Compliance
5838 && existing.is_locked(now)
5839 {
5840 if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
5841 return Err(S3Error::with_message(
5842 S3ErrorCode::AccessDenied,
5843 "Cannot downgrade Compliance retention to Governance while lock is active",
5844 ));
5845 }
5846 if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5847 && next < prev
5848 {
5849 return Err(S3Error::with_message(
5850 S3ErrorCode::AccessDenied,
5851 "Cannot shorten Compliance retention while lock is active",
5852 ));
5853 }
5854 }
5855 if let Some(existing_mode) = existing.mode
5856 && existing_mode == crate::object_lock::LockMode::Governance
5857 && existing.is_locked(now)
5858 && !bypass
5859 && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5860 && next < prev
5861 {
5862 return Err(S3Error::with_message(
5863 S3ErrorCode::AccessDenied,
5864 "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
5865 ));
5866 }
5867 let mut state = existing;
5868 if new_mode.is_some() {
5869 state.mode = new_mode;
5870 }
5871 if new_until.is_some() {
5872 state.retain_until = new_until;
5873 }
5874 mgr.set(&bucket, &key, state);
5875 return Ok(S3Response::new(PutObjectRetentionOutput::default()));
5876 }
5877 self.backend.put_object_retention(req).await
5878 }
5879
5880 // ---- Versioning ----
5881 // list_object_versions is implemented above in the compression-hook
5882 // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
5883 // VersioningManager is attached (v0.5 #34), serves chains directly
5884 // from the in-memory index.
5885 async fn get_bucket_versioning(
5886 &self,
5887 req: S3Request<GetBucketVersioningInput>,
5888 ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
5889 // v0.5 #34: when a VersioningManager is attached, the bucket's
5890 // versioning state lives in the manager (= S4-server's
5891 // authoritative source). Pass-through hits the backend only
5892 // when no manager is configured (legacy v0.4 behaviour).
5893 if let Some(mgr) = self.versioning.as_ref() {
5894 let output = match mgr.state(&req.input.bucket).as_aws_status() {
5895 Some(s) => GetBucketVersioningOutput {
5896 status: Some(BucketVersioningStatus::from(s.to_owned())),
5897 ..Default::default()
5898 },
5899 None => GetBucketVersioningOutput::default(),
5900 };
5901 return Ok(S3Response::new(output));
5902 }
5903 self.backend.get_bucket_versioning(req).await
5904 }
5905 async fn put_bucket_versioning(
5906 &self,
5907 req: S3Request<PutBucketVersioningInput>,
5908 ) -> S3Result<S3Response<PutBucketVersioningOutput>> {
5909 // v0.6 #42: MFA gating on the `PutBucketVersioning` request
5910 // itself. S3 spec: when the request body carries an
5911 // `MfaDelete` element (either `Enabled` or `Disabled`), the
5912 // request must include a valid `x-amz-mfa` token — both for
5913 // the *first* enable (so the operator can't quietly side-step
5914 // the gate by never enabling it) and for any subsequent
5915 // change (so a leaked credential alone can't disable MFA
5916 // Delete to bypass it on subsequent DELETEs). Requests that
5917 // omit the `MfaDelete` element entirely (i.e. they flip only
5918 // `Status`) skip this gate, matching AWS.
5919 if let Some(mgr) = self.mfa_delete.as_ref()
5920 && let Some(target_enabled) = req
5921 .input
5922 .versioning_configuration
5923 .mfa_delete
5924 .as_ref()
5925 .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
5926 {
5927 let bucket = req.input.bucket.clone();
5928 let header = req.input.mfa.as_deref();
5929 let secret = mgr.lookup_secret(&bucket);
5930 let verified = match (header, secret.as_ref()) {
5931 (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
5932 Ok((serial, code)) => {
5933 serial == s.serial
5934 && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
5935 }
5936 Err(_) => false,
5937 },
5938 _ => false,
5939 };
5940 if !verified {
5941 crate::metrics::record_mfa_delete_denial(&bucket);
5942 let err = if header.is_none() {
5943 crate::mfa::MfaError::Missing
5944 } else {
5945 crate::mfa::MfaError::InvalidCode
5946 };
5947 return Err(mfa_error_to_s3(err));
5948 }
5949 mgr.set_bucket_state(&bucket, target_enabled);
5950 }
5951 // v0.5 #34: stash the new state in the manager, then forward to
5952 // the backend so any downstream that *also* tracks state
5953 // (e.g. a real S3 backend) stays in sync. Manager-attached but
5954 // backend rejection is treated as a soft-fail (state is still
5955 // owned by the manager).
5956 if let Some(mgr) = self.versioning.as_ref() {
5957 let new_state = match req
5958 .input
5959 .versioning_configuration
5960 .status
5961 .as_ref()
5962 .map(|s| s.as_str())
5963 {
5964 Some(s) if s.eq_ignore_ascii_case("Enabled") => {
5965 crate::versioning::VersioningState::Enabled
5966 }
5967 Some(s) if s.eq_ignore_ascii_case("Suspended") => {
5968 crate::versioning::VersioningState::Suspended
5969 }
5970 _ => crate::versioning::VersioningState::Unversioned,
5971 };
5972 mgr.set_state(&req.input.bucket, new_state);
5973 return Ok(S3Response::new(PutBucketVersioningOutput::default()));
5974 }
5975 self.backend.put_bucket_versioning(req).await
5976 }
5977
5978 // ---- Bucket location ----
5979 async fn get_bucket_location(
5980 &self,
5981 req: S3Request<GetBucketLocationInput>,
5982 ) -> S3Result<S3Response<GetBucketLocationOutput>> {
5983 self.backend.get_bucket_location(req).await
5984 }
5985
5986 // ---- Bucket policy ----
5987 async fn get_bucket_policy(
5988 &self,
5989 req: S3Request<GetBucketPolicyInput>,
5990 ) -> S3Result<S3Response<GetBucketPolicyOutput>> {
5991 self.backend.get_bucket_policy(req).await
5992 }
5993 async fn put_bucket_policy(
5994 &self,
5995 req: S3Request<PutBucketPolicyInput>,
5996 ) -> S3Result<S3Response<PutBucketPolicyOutput>> {
5997 self.backend.put_bucket_policy(req).await
5998 }
5999 async fn delete_bucket_policy(
6000 &self,
6001 req: S3Request<DeleteBucketPolicyInput>,
6002 ) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
6003 self.backend.delete_bucket_policy(req).await
6004 }
6005 async fn get_bucket_policy_status(
6006 &self,
6007 req: S3Request<GetBucketPolicyStatusInput>,
6008 ) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
6009 self.backend.get_bucket_policy_status(req).await
6010 }
6011
6012 // ---- Bucket ACL ----
6013 async fn get_bucket_acl(
6014 &self,
6015 req: S3Request<GetBucketAclInput>,
6016 ) -> S3Result<S3Response<GetBucketAclOutput>> {
6017 self.backend.get_bucket_acl(req).await
6018 }
6019 async fn put_bucket_acl(
6020 &self,
6021 req: S3Request<PutBucketAclInput>,
6022 ) -> S3Result<S3Response<PutBucketAclOutput>> {
6023 self.backend.put_bucket_acl(req).await
6024 }
6025
6026 // ---- Bucket CORS (v0.6 #38) ----
6027 async fn get_bucket_cors(
6028 &self,
6029 req: S3Request<GetBucketCorsInput>,
6030 ) -> S3Result<S3Response<GetBucketCorsOutput>> {
6031 if let Some(mgr) = self.cors.as_ref() {
6032 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6033 S3Error::with_message(
6034 S3ErrorCode::NoSuchCORSConfiguration,
6035 "The CORS configuration does not exist".to_string(),
6036 )
6037 })?;
6038 let rules: Vec<CORSRule> = cfg
6039 .rules
6040 .into_iter()
6041 .map(|r| CORSRule {
6042 allowed_headers: if r.allowed_headers.is_empty() {
6043 None
6044 } else {
6045 Some(r.allowed_headers)
6046 },
6047 allowed_methods: r.allowed_methods,
6048 allowed_origins: r.allowed_origins,
6049 expose_headers: if r.expose_headers.is_empty() {
6050 None
6051 } else {
6052 Some(r.expose_headers)
6053 },
6054 id: r.id,
6055 max_age_seconds: r.max_age_seconds.map(|s| s as i32),
6056 })
6057 .collect();
6058 return Ok(S3Response::new(GetBucketCorsOutput {
6059 cors_rules: Some(rules),
6060 }));
6061 }
6062 self.backend.get_bucket_cors(req).await
6063 }
6064 async fn put_bucket_cors(
6065 &self,
6066 req: S3Request<PutBucketCorsInput>,
6067 ) -> S3Result<S3Response<PutBucketCorsOutput>> {
6068 if let Some(mgr) = self.cors.as_ref() {
6069 let cfg = crate::cors::CorsConfig {
6070 rules: req
6071 .input
6072 .cors_configuration
6073 .cors_rules
6074 .into_iter()
6075 .map(|r| crate::cors::CorsRule {
6076 allowed_origins: r.allowed_origins,
6077 allowed_methods: r.allowed_methods,
6078 allowed_headers: r.allowed_headers.unwrap_or_default(),
6079 expose_headers: r.expose_headers.unwrap_or_default(),
6080 max_age_seconds: r
6081 .max_age_seconds
6082 .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
6083 id: r.id,
6084 })
6085 .collect(),
6086 };
6087 // v0.8.15 M-3: AWS S3 rejects `AllowedMethods` outside
6088 // the canonical {GET,PUT,POST,DELETE,HEAD} set (including
6089 // the `*` wildcard). Validate at PutBucketCors time so
6090 // operators see the misconfiguration in the API response
6091 // instead of having silently-broken preflights at the
6092 // browser later.
6093 if let Err(e) = crate::cors::CorsManager::validate(&cfg) {
6094 return Err(S3Error::with_message(
6095 S3ErrorCode::InvalidArgument,
6096 e.to_string(),
6097 ));
6098 }
6099 mgr.put(&req.input.bucket, cfg);
6100 return Ok(S3Response::new(PutBucketCorsOutput::default()));
6101 }
6102 self.backend.put_bucket_cors(req).await
6103 }
6104 async fn delete_bucket_cors(
6105 &self,
6106 req: S3Request<DeleteBucketCorsInput>,
6107 ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
6108 if let Some(mgr) = self.cors.as_ref() {
6109 mgr.delete(&req.input.bucket);
6110 return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
6111 }
6112 self.backend.delete_bucket_cors(req).await
6113 }
6114
6115 // ---- Bucket lifecycle (v0.6 #37) ----
6116 async fn get_bucket_lifecycle_configuration(
6117 &self,
6118 req: S3Request<GetBucketLifecycleConfigurationInput>,
6119 ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
6120 if let Some(mgr) = self.lifecycle.as_ref() {
6121 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6122 S3Error::with_message(
6123 S3ErrorCode::NoSuchLifecycleConfiguration,
6124 "The lifecycle configuration does not exist".to_string(),
6125 )
6126 })?;
6127 let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
6128 return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
6129 rules: Some(rules),
6130 transition_default_minimum_object_size: None,
6131 }));
6132 }
6133 self.backend.get_bucket_lifecycle_configuration(req).await
6134 }
6135 async fn put_bucket_lifecycle_configuration(
6136 &self,
6137 req: S3Request<PutBucketLifecycleConfigurationInput>,
6138 ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
6139 if let Some(mgr) = self.lifecycle.as_ref() {
6140 let bucket = req.input.bucket.clone();
6141 let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
6142 let cfg = dto_lifecycle_to_internal(&dto_cfg);
6143 mgr.put(&bucket, cfg);
6144 return Ok(S3Response::new(
6145 PutBucketLifecycleConfigurationOutput::default(),
6146 ));
6147 }
6148 self.backend.put_bucket_lifecycle_configuration(req).await
6149 }
6150 async fn delete_bucket_lifecycle(
6151 &self,
6152 req: S3Request<DeleteBucketLifecycleInput>,
6153 ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
6154 if let Some(mgr) = self.lifecycle.as_ref() {
6155 mgr.delete(&req.input.bucket);
6156 return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
6157 }
6158 self.backend.delete_bucket_lifecycle(req).await
6159 }
6160
6161 // ---- Bucket tagging (v0.6 #39) ----
6162 async fn get_bucket_tagging(
6163 &self,
6164 req: S3Request<GetBucketTaggingInput>,
6165 ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
6166 let Some(mgr) = self.tagging.as_ref() else {
6167 return self.backend.get_bucket_tagging(req).await;
6168 };
6169 let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
6170 Ok(S3Response::new(GetBucketTaggingOutput {
6171 tag_set: tagset_to_aws(&tags),
6172 }))
6173 }
6174 async fn put_bucket_tagging(
6175 &self,
6176 req: S3Request<PutBucketTaggingInput>,
6177 ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
6178 let Some(mgr) = self.tagging.as_ref() else {
6179 return self.backend.put_bucket_tagging(req).await;
6180 };
6181 let bucket = req.input.bucket.clone();
6182 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6183 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6184 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6185 mgr.put_bucket_tags(&bucket, parsed);
6186 Ok(S3Response::new(PutBucketTaggingOutput::default()))
6187 }
6188 async fn delete_bucket_tagging(
6189 &self,
6190 req: S3Request<DeleteBucketTaggingInput>,
6191 ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
6192 let Some(mgr) = self.tagging.as_ref() else {
6193 return self.backend.delete_bucket_tagging(req).await;
6194 };
6195 let bucket = req.input.bucket.clone();
6196 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6197 mgr.delete_bucket_tags(&bucket);
6198 Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
6199 }
6200
6201 // ---- Bucket encryption ----
6202 async fn get_bucket_encryption(
6203 &self,
6204 req: S3Request<GetBucketEncryptionInput>,
6205 ) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
6206 self.backend.get_bucket_encryption(req).await
6207 }
6208 async fn put_bucket_encryption(
6209 &self,
6210 req: S3Request<PutBucketEncryptionInput>,
6211 ) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
6212 self.backend.put_bucket_encryption(req).await
6213 }
6214 async fn delete_bucket_encryption(
6215 &self,
6216 req: S3Request<DeleteBucketEncryptionInput>,
6217 ) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
6218 self.backend.delete_bucket_encryption(req).await
6219 }
6220
6221 // ---- Bucket logging ----
6222 async fn get_bucket_logging(
6223 &self,
6224 req: S3Request<GetBucketLoggingInput>,
6225 ) -> S3Result<S3Response<GetBucketLoggingOutput>> {
6226 self.backend.get_bucket_logging(req).await
6227 }
6228 async fn put_bucket_logging(
6229 &self,
6230 req: S3Request<PutBucketLoggingInput>,
6231 ) -> S3Result<S3Response<PutBucketLoggingOutput>> {
6232 self.backend.put_bucket_logging(req).await
6233 }
6234
6235 // ---- Bucket notification (v0.6 #35) ----
6236 //
6237 // When a `NotificationManager` is attached, S4 itself owns per-bucket
6238 // notification configurations and the PUT / GET handlers route through
6239 // the manager. The wire DTO's queue / topic configurations map onto
6240 // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6241 // EventBridge configurations are accepted on PUT but silently dropped
6242 // (out of scope for v0.6 #35). When no manager is attached the legacy
6243 // backend-passthrough behaviour applies.
6244 async fn get_bucket_notification_configuration(
6245 &self,
6246 req: S3Request<GetBucketNotificationConfigurationInput>,
6247 ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6248 if let Some(mgr) = self.notifications.as_ref() {
6249 let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6250 let dto = notif_to_dto(&cfg);
6251 return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6252 event_bridge_configuration: dto.event_bridge_configuration,
6253 lambda_function_configurations: dto.lambda_function_configurations,
6254 queue_configurations: dto.queue_configurations,
6255 topic_configurations: dto.topic_configurations,
6256 }));
6257 }
6258 self.backend
6259 .get_bucket_notification_configuration(req)
6260 .await
6261 }
6262 async fn put_bucket_notification_configuration(
6263 &self,
6264 req: S3Request<PutBucketNotificationConfigurationInput>,
6265 ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6266 if let Some(mgr) = self.notifications.as_ref() {
6267 let cfg = notif_from_dto(&req.input.notification_configuration);
6268 mgr.put(&req.input.bucket, cfg);
6269 return Ok(S3Response::new(
6270 PutBucketNotificationConfigurationOutput::default(),
6271 ));
6272 }
6273 self.backend
6274 .put_bucket_notification_configuration(req)
6275 .await
6276 }
6277
6278 // ---- Bucket request payment ----
6279 async fn get_bucket_request_payment(
6280 &self,
6281 req: S3Request<GetBucketRequestPaymentInput>,
6282 ) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
6283 self.backend.get_bucket_request_payment(req).await
6284 }
6285 async fn put_bucket_request_payment(
6286 &self,
6287 req: S3Request<PutBucketRequestPaymentInput>,
6288 ) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
6289 self.backend.put_bucket_request_payment(req).await
6290 }
6291
6292 // ---- Bucket website ----
6293 async fn get_bucket_website(
6294 &self,
6295 req: S3Request<GetBucketWebsiteInput>,
6296 ) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
6297 self.backend.get_bucket_website(req).await
6298 }
6299 async fn put_bucket_website(
6300 &self,
6301 req: S3Request<PutBucketWebsiteInput>,
6302 ) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
6303 self.backend.put_bucket_website(req).await
6304 }
6305 async fn delete_bucket_website(
6306 &self,
6307 req: S3Request<DeleteBucketWebsiteInput>,
6308 ) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
6309 self.backend.delete_bucket_website(req).await
6310 }
6311
6312 // ---- Bucket replication (v0.6 #40) ----
6313 async fn get_bucket_replication(
6314 &self,
6315 req: S3Request<GetBucketReplicationInput>,
6316 ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6317 if let Some(mgr) = self.replication.as_ref() {
6318 return match mgr.get(&req.input.bucket) {
6319 Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6320 replication_configuration: Some(replication_to_dto(&cfg)),
6321 })),
6322 None => Err(S3Error::with_message(
6323 S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6324 format!(
6325 "no replication configuration on bucket {}",
6326 req.input.bucket
6327 ),
6328 )),
6329 };
6330 }
6331 self.backend.get_bucket_replication(req).await
6332 }
6333 async fn put_bucket_replication(
6334 &self,
6335 req: S3Request<PutBucketReplicationInput>,
6336 ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6337 if let Some(mgr) = self.replication.as_ref() {
6338 let cfg = replication_from_dto(&req.input.replication_configuration);
6339 mgr.put(&req.input.bucket, cfg);
6340 return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6341 }
6342 self.backend.put_bucket_replication(req).await
6343 }
6344 async fn delete_bucket_replication(
6345 &self,
6346 req: S3Request<DeleteBucketReplicationInput>,
6347 ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6348 if let Some(mgr) = self.replication.as_ref() {
6349 mgr.delete(&req.input.bucket);
6350 return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6351 }
6352 self.backend.delete_bucket_replication(req).await
6353 }
6354
6355 // ---- Bucket accelerate ----
6356 async fn get_bucket_accelerate_configuration(
6357 &self,
6358 req: S3Request<GetBucketAccelerateConfigurationInput>,
6359 ) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
6360 self.backend.get_bucket_accelerate_configuration(req).await
6361 }
6362 async fn put_bucket_accelerate_configuration(
6363 &self,
6364 req: S3Request<PutBucketAccelerateConfigurationInput>,
6365 ) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
6366 self.backend.put_bucket_accelerate_configuration(req).await
6367 }
6368
6369 // ---- Bucket ownership controls ----
6370 async fn get_bucket_ownership_controls(
6371 &self,
6372 req: S3Request<GetBucketOwnershipControlsInput>,
6373 ) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
6374 self.backend.get_bucket_ownership_controls(req).await
6375 }
6376 async fn put_bucket_ownership_controls(
6377 &self,
6378 req: S3Request<PutBucketOwnershipControlsInput>,
6379 ) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
6380 self.backend.put_bucket_ownership_controls(req).await
6381 }
6382 async fn delete_bucket_ownership_controls(
6383 &self,
6384 req: S3Request<DeleteBucketOwnershipControlsInput>,
6385 ) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
6386 self.backend.delete_bucket_ownership_controls(req).await
6387 }
6388
6389 // ---- Public access block ----
6390 async fn get_public_access_block(
6391 &self,
6392 req: S3Request<GetPublicAccessBlockInput>,
6393 ) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
6394 self.backend.get_public_access_block(req).await
6395 }
6396 async fn put_public_access_block(
6397 &self,
6398 req: S3Request<PutPublicAccessBlockInput>,
6399 ) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
6400 self.backend.put_public_access_block(req).await
6401 }
6402 async fn delete_public_access_block(
6403 &self,
6404 req: S3Request<DeletePublicAccessBlockInput>,
6405 ) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
6406 self.backend.delete_public_access_block(req).await
6407 }
6408
6409 // ====================================================================
6410 // v0.6 #41: S3 Select — server-side SQL filter on object body.
6411 //
6412 // Fetch the object via the regular `get_object` path (so SSE-C /
6413 // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
6414 // run a small SQL subset (CSV + JSON Lines, equality / inequality /
6415 // LIKE / AND / OR / NOT) over the in-memory body, and stream the
6416 // matched rows back as AWS event-stream `Records` + `Stats` + `End`
6417 // frames.
6418 //
6419 // Limitations (deliberate, documented):
6420 // - Parquet input is rejected with NotImplemented.
6421 // - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
6422 // parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
6423 // domain-specific `InvalidSqlExpression` code).
6424 // - The body is fully buffered before SQL evaluation (S3 Select
6425 // streaming-during-evaluation is v0.7 scope).
6426 // - GPU-accelerated WHERE evaluation is stubbed out (always None).
6427 async fn select_object_content(
6428 &self,
6429 req: S3Request<SelectObjectContentInput>,
6430 ) -> S3Result<S3Response<SelectObjectContentOutput>> {
6431 use crate::select::{
6432 EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
6433 run_select_jsonlines,
6434 };
6435
6436 let select_bucket = req.input.bucket.clone();
6437 let select_key = req.input.key.clone();
6438 self.enforce_rate_limit(&req, &select_bucket)?;
6439 self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
6440
6441 let request = req.input.request;
6442 let sql = request.expression.clone();
6443 if request.expression_type.as_str() != "SQL" {
6444 return Err(S3Error::with_message(
6445 S3ErrorCode::InvalidExpressionType,
6446 format!(
6447 "ExpressionType must be SQL, got: {}",
6448 request.expression_type.as_str()
6449 ),
6450 ));
6451 }
6452
6453 let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
6454 SelectInputFormat::JsonLines
6455 } else if let Some(csv) = request.input_serialization.csv.as_ref() {
6456 let has_header = csv
6457 .file_header_info
6458 .as_ref()
6459 .map(|h| {
6460 let s = h.as_str();
6461 s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
6462 })
6463 .unwrap_or(false);
6464 let delim = csv
6465 .field_delimiter
6466 .as_deref()
6467 .and_then(|s| s.chars().next())
6468 .unwrap_or(',');
6469 SelectInputFormat::Csv {
6470 has_header,
6471 delimiter: delim,
6472 }
6473 } else if request.input_serialization.parquet.is_some() {
6474 return Err(S3Error::with_message(
6475 S3ErrorCode::NotImplemented,
6476 "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
6477 ));
6478 } else {
6479 return Err(S3Error::with_message(
6480 S3ErrorCode::InvalidRequest,
6481 "InputSerialization requires exactly one of CSV / JSON / Parquet",
6482 ));
6483 };
6484 if let Some(ct) = request.input_serialization.compression_type.as_ref()
6485 && !ct.as_str().eq_ignore_ascii_case("NONE")
6486 {
6487 return Err(S3Error::with_message(
6488 S3ErrorCode::NotImplemented,
6489 format!(
6490 "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
6491 ct.as_str()
6492 ),
6493 ));
6494 }
6495
6496 let output_format = if request.output_serialization.json.is_some() {
6497 SelectOutputFormat::Json
6498 } else if request.output_serialization.csv.is_some() {
6499 SelectOutputFormat::Csv
6500 } else {
6501 return Err(S3Error::with_message(
6502 S3ErrorCode::InvalidRequest,
6503 "OutputSerialization requires exactly one of CSV / JSON",
6504 ));
6505 };
6506
6507 let get_input = GetObjectInput {
6508 bucket: select_bucket.clone(),
6509 key: select_key.clone(),
6510 sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
6511 sse_customer_key: req.input.sse_customer_key.clone(),
6512 sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
6513 ..Default::default()
6514 };
6515 let get_req = S3Request {
6516 input: get_input,
6517 method: http::Method::GET,
6518 uri: format!("/{}/{}", select_bucket, select_key)
6519 .parse()
6520 .map_err(|e| {
6521 S3Error::with_message(
6522 S3ErrorCode::InternalError,
6523 format!("constructing inner GET URI: {e}"),
6524 )
6525 })?,
6526 headers: http::HeaderMap::new(),
6527 extensions: http::Extensions::new(),
6528 credentials: req.credentials.clone(),
6529 region: req.region.clone(),
6530 service: req.service.clone(),
6531 trailing_headers: None,
6532 };
6533 let mut get_resp = self.get_object(get_req).await?;
6534 let blob = get_resp.output.body.take().ok_or_else(|| {
6535 S3Error::with_message(
6536 S3ErrorCode::InternalError,
6537 "Select: object body was empty after GET",
6538 )
6539 })?;
6540 let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
6541 .await
6542 .map_err(internal("collect Select body"))?;
6543 let scanned = body_bytes.len() as u64;
6544
6545 let matched_payload = match input_format {
6546 SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
6547 .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
6548 SelectInputFormat::Csv { .. } => {
6549 run_select_csv(&sql, &body_bytes, input_format, output_format)
6550 .map_err(|e| select_error_to_s3(e, "CSV"))?
6551 }
6552 };
6553
6554 let returned = matched_payload.len() as u64;
6555 let processed = scanned;
6556 let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
6557 if !matched_payload.is_empty() {
6558 events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
6559 payload: Some(bytes::Bytes::from(matched_payload)),
6560 })));
6561 }
6562 events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
6563 details: Some(Stats {
6564 bytes_scanned: Some(scanned as i64),
6565 bytes_processed: Some(processed as i64),
6566 bytes_returned: Some(returned as i64),
6567 }),
6568 })));
6569 events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
6570 // Touch EventStreamWriter so the public API stays linked into the
6571 // build (the actual wire framing is delegated to s3s).
6572 let _writer = EventStreamWriter::new();
6573
6574 let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
6575 let output = SelectObjectContentOutput {
6576 payload: Some(stream),
6577 };
6578 Ok(S3Response::new(output))
6579 }
6580
6581 // ---- Bucket Inventory configuration (v0.6 #36) ----
6582 //
6583 // When an `InventoryManager` is attached, S4-server owns the
6584 // configuration store and these handlers no longer pass through to
6585 // the backend. The mapping between the s3s-typed
6586 // `InventoryConfiguration` and the inventory module's internal
6587 // `InventoryConfig` is intentionally lossy: only the fields S4
6588 // actually uses for periodic CSV emission survive the round trip
6589 // (id, source bucket, destination bucket / prefix, format, included
6590 // versions, schedule frequency). Optional fields, encryption, and
6591 // filter prefixes are accepted on PUT and re-surfaced on GET via
6592 // a best-effort default-shape `InventoryConfiguration` so the
6593 // client sees a roundtrip-clean response.
6594 async fn put_bucket_inventory_configuration(
6595 &self,
6596 req: S3Request<PutBucketInventoryConfigurationInput>,
6597 ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
6598 if let Some(mgr) = self.inventory.as_ref() {
6599 let cfg = inv_from_dto(
6600 &req.input.bucket,
6601 &req.input.id,
6602 &req.input.inventory_configuration,
6603 );
6604 mgr.put(cfg);
6605 return Ok(S3Response::new(
6606 PutBucketInventoryConfigurationOutput::default(),
6607 ));
6608 }
6609 self.backend.put_bucket_inventory_configuration(req).await
6610 }
6611
6612 async fn get_bucket_inventory_configuration(
6613 &self,
6614 req: S3Request<GetBucketInventoryConfigurationInput>,
6615 ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
6616 if let Some(mgr) = self.inventory.as_ref() {
6617 let cfg = mgr.get(&req.input.bucket, &req.input.id);
6618 if let Some(cfg) = cfg {
6619 let out = GetBucketInventoryConfigurationOutput {
6620 inventory_configuration: Some(inv_to_dto(&cfg)),
6621 };
6622 return Ok(S3Response::new(out));
6623 }
6624 // AWS returns `NoSuchConfiguration` (404) when the id has no
6625 // matching inventory configuration on the bucket. The
6626 // generated `S3ErrorCode` enum doesn't expose a typed variant
6627 // for this code, so we round-trip through `from_bytes` which
6628 // wraps unknown codes as `Custom(...)` (= the AWS-canonical
6629 // error-code string survives into the XML response envelope).
6630 let code =
6631 S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
6632 return Err(S3Error::with_message(
6633 code,
6634 format!(
6635 "no inventory configuration with id={} on bucket={}",
6636 req.input.id, req.input.bucket
6637 ),
6638 ));
6639 }
6640 self.backend.get_bucket_inventory_configuration(req).await
6641 }
6642
6643 async fn list_bucket_inventory_configurations(
6644 &self,
6645 req: S3Request<ListBucketInventoryConfigurationsInput>,
6646 ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
6647 if let Some(mgr) = self.inventory.as_ref() {
6648 let list = mgr.list_for_bucket(&req.input.bucket);
6649 let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
6650 let out = ListBucketInventoryConfigurationsOutput {
6651 continuation_token: req.input.continuation_token.clone(),
6652 inventory_configuration_list: if dto_list.is_empty() {
6653 None
6654 } else {
6655 Some(dto_list)
6656 },
6657 is_truncated: Some(false),
6658 next_continuation_token: None,
6659 };
6660 return Ok(S3Response::new(out));
6661 }
6662 self.backend.list_bucket_inventory_configurations(req).await
6663 }
6664
6665 async fn delete_bucket_inventory_configuration(
6666 &self,
6667 req: S3Request<DeleteBucketInventoryConfigurationInput>,
6668 ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
6669 if let Some(mgr) = self.inventory.as_ref() {
6670 mgr.delete(&req.input.bucket, &req.input.id);
6671 return Ok(S3Response::new(
6672 DeleteBucketInventoryConfigurationOutput::default(),
6673 ));
6674 }
6675 self.backend
6676 .delete_bucket_inventory_configuration(req)
6677 .await
6678 }
6679}
6680
6681// ---------------------------------------------------------------------------
6682// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
6683// surface) and our internal `crate::inventory::InventoryConfig`. Only the
6684// fields S4 actually uses for CSV emission survive the round trip; the
6685// missing fields (filter prefix, optional fields, encryption) are dropped on
6686// PUT and re-rendered as the AWS-default shape on GET so the client sees a
6687// well-formed `InventoryConfiguration`.
6688// ---------------------------------------------------------------------------
6689
6690fn inv_from_dto(
6691 bucket: &str,
6692 id: &str,
6693 dto: &InventoryConfiguration,
6694) -> crate::inventory::InventoryConfig {
6695 let frequency_hours = match dto.schedule.frequency.as_str() {
6696 "Weekly" => 24 * 7,
6697 // Daily is the default; anything S4 doesn't recognise (incl.
6698 // empty, which is the s3s-default) maps to Daily so the
6699 // operator's PUT doesn't silently turn into a no-op cadence.
6700 _ => 24,
6701 };
6702 // Parquet/ORC are not supported (issue #36 scope); we still accept
6703 // the PUT so callers don't fail-loud, but we record CSV and rely on
6704 // the operator catching the discrepancy on GET.
6705 let format = crate::inventory::InventoryFormat::Csv;
6706 crate::inventory::InventoryConfig {
6707 id: id.to_owned(),
6708 bucket: bucket.to_owned(),
6709 destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
6710 destination_prefix: dto
6711 .destination
6712 .s3_bucket_destination
6713 .prefix
6714 .clone()
6715 .unwrap_or_default(),
6716 frequency_hours,
6717 format,
6718 included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
6719 dto.included_object_versions.as_str(),
6720 ),
6721 }
6722}
6723
6724fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
6725 InventoryConfiguration {
6726 id: cfg.id.clone(),
6727 is_enabled: true,
6728 included_object_versions: InventoryIncludedObjectVersions::from(
6729 cfg.included_object_versions.as_aws_str().to_owned(),
6730 ),
6731 destination: InventoryDestination {
6732 s3_bucket_destination: InventoryS3BucketDestination {
6733 account_id: None,
6734 bucket: cfg.destination_bucket.clone(),
6735 encryption: None,
6736 format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
6737 prefix: if cfg.destination_prefix.is_empty() {
6738 None
6739 } else {
6740 Some(cfg.destination_prefix.clone())
6741 },
6742 },
6743 },
6744 schedule: InventorySchedule {
6745 // `frequency_hours == 168` -> Weekly; everything else maps to
6746 // Daily for the wire response (the manager keeps the precise
6747 // hour count internally for due-checking).
6748 frequency: InventoryFrequency::from(
6749 if cfg.frequency_hours == 24 * 7 {
6750 "Weekly"
6751 } else {
6752 "Daily"
6753 }
6754 .to_owned(),
6755 ),
6756 },
6757 filter: None,
6758 optional_fields: None,
6759 }
6760}
6761
6762// ---------------------------------------------------------------------------
6763// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
6764// wire surface) and our internal `crate::notifications::NotificationConfig`.
6765//
6766// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
6767// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
6768// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
6769// surfaces topic / queue rules.
6770//
6771// The webhook destination has no AWS-native wire form: operators configure
6772// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
6773// poking `NotificationManager::put` directly from a custom binary. This
6774// keeps the wire surface AWS-compatible while still letting the always-
6775// available `Webhook` destination be reachable.
6776// ---------------------------------------------------------------------------
6777
6778fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
6779 let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
6780 if let Some(topics) = dto.topic_configurations.as_ref() {
6781 for (idx, t) in topics.iter().enumerate() {
6782 let events = events_from_dto(&t.events);
6783 let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
6784 rules.push(crate::notifications::NotificationRule {
6785 id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
6786 events,
6787 destination: crate::notifications::Destination::Sns {
6788 topic_arn: t.topic_arn.clone(),
6789 },
6790 filter_prefix: prefix,
6791 filter_suffix: suffix,
6792 });
6793 }
6794 }
6795 if let Some(queues) = dto.queue_configurations.as_ref() {
6796 for (idx, q) in queues.iter().enumerate() {
6797 let events = events_from_dto(&q.events);
6798 let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
6799 rules.push(crate::notifications::NotificationRule {
6800 id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
6801 events,
6802 destination: crate::notifications::Destination::Sqs {
6803 queue_arn: q.queue_arn.clone(),
6804 },
6805 filter_prefix: prefix,
6806 filter_suffix: suffix,
6807 });
6808 }
6809 }
6810 crate::notifications::NotificationConfig { rules }
6811}
6812
6813fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
6814 let mut topics: Vec<TopicConfiguration> = Vec::new();
6815 let mut queues: Vec<QueueConfiguration> = Vec::new();
6816 for rule in &cfg.rules {
6817 let events: Vec<Event> = rule
6818 .events
6819 .iter()
6820 .map(|e| Event::from(e.as_aws_str().to_owned()))
6821 .collect();
6822 let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
6823 match &rule.destination {
6824 crate::notifications::Destination::Sns { topic_arn } => {
6825 topics.push(TopicConfiguration {
6826 events,
6827 filter,
6828 id: Some(rule.id.clone()),
6829 topic_arn: topic_arn.clone(),
6830 });
6831 }
6832 crate::notifications::Destination::Sqs { queue_arn } => {
6833 queues.push(QueueConfiguration {
6834 events,
6835 filter,
6836 id: Some(rule.id.clone()),
6837 queue_arn: queue_arn.clone(),
6838 });
6839 }
6840 // Webhook destinations have no AWS wire equivalent — they
6841 // round-trip through the JSON snapshot only. Skip them on the
6842 // GET surface (an SDK consumer wouldn't know what to do with
6843 // them anyway).
6844 crate::notifications::Destination::Webhook { .. } => {}
6845 }
6846 }
6847 NotificationConfiguration {
6848 event_bridge_configuration: None,
6849 lambda_function_configurations: None,
6850 queue_configurations: if queues.is_empty() {
6851 None
6852 } else {
6853 Some(queues)
6854 },
6855 topic_configurations: if topics.is_empty() {
6856 None
6857 } else {
6858 Some(topics)
6859 },
6860 }
6861}
6862
6863fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
6864 events
6865 .iter()
6866 .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
6867 .collect()
6868}
6869
6870fn filter_from_dto(
6871 f: Option<&NotificationConfigurationFilter>,
6872) -> (Option<String>, Option<String>) {
6873 let Some(f) = f else {
6874 return (None, None);
6875 };
6876 let Some(key) = f.key.as_ref() else {
6877 return (None, None);
6878 };
6879 let Some(rules) = key.filter_rules.as_ref() else {
6880 return (None, None);
6881 };
6882 let mut prefix = None;
6883 let mut suffix = None;
6884 for r in rules {
6885 let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
6886 let value = r.value.clone();
6887 match name.as_deref() {
6888 Some("prefix") => prefix = value,
6889 Some("suffix") => suffix = value,
6890 _ => {}
6891 }
6892 }
6893 (prefix, suffix)
6894}
6895
6896fn filter_to_dto(
6897 prefix: Option<&str>,
6898 suffix: Option<&str>,
6899) -> Option<NotificationConfigurationFilter> {
6900 if prefix.is_none() && suffix.is_none() {
6901 return None;
6902 }
6903 let mut rules: Vec<FilterRule> = Vec::new();
6904 if let Some(p) = prefix {
6905 rules.push(FilterRule {
6906 name: Some(FilterRuleName::from("prefix".to_owned())),
6907 value: Some(p.to_owned()),
6908 });
6909 }
6910 if let Some(s) = suffix {
6911 rules.push(FilterRule {
6912 name: Some(FilterRuleName::from("suffix".to_owned())),
6913 value: Some(s.to_owned()),
6914 });
6915 }
6916 Some(NotificationConfigurationFilter {
6917 key: Some(S3KeyFilter {
6918 filter_rules: Some(rules),
6919 }),
6920 })
6921}
6922
6923// ---------------------------------------------------------------------------
6924// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
6925// wire surface) and our internal `crate::replication::ReplicationConfig`.
6926// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
6927// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
6928// the matcher needs. Sub-blocks v0.6 #40 does not implement
6929// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
6930// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
6931// who set them on PUT see them silently dropped, mirroring "feature not
6932// supported in this release" semantics.
6933// ---------------------------------------------------------------------------
6934
6935fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
6936 let rules = dto
6937 .rules
6938 .iter()
6939 .enumerate()
6940 .map(|(idx, r)| {
6941 let id =
6942 r.id.as_ref()
6943 .map(|s| s.as_str().to_owned())
6944 .unwrap_or_else(|| format!("rule-{idx}"));
6945 let priority = r.priority.unwrap_or(0).max(0) as u32;
6946 let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
6947 let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
6948 let destination_bucket = r.destination.bucket.clone();
6949 let destination_storage_class = r
6950 .destination
6951 .storage_class
6952 .as_ref()
6953 .map(|s| s.as_str().to_owned());
6954 crate::replication::ReplicationRule {
6955 id,
6956 priority,
6957 status_enabled,
6958 filter,
6959 destination_bucket,
6960 destination_storage_class,
6961 }
6962 })
6963 .collect();
6964 crate::replication::ReplicationConfig {
6965 role: dto.role.clone(),
6966 rules,
6967 }
6968}
6969
6970fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
6971 let rules = cfg
6972 .rules
6973 .iter()
6974 .map(|r| {
6975 let status = if r.status_enabled {
6976 ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
6977 } else {
6978 ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
6979 };
6980 let destination = Destination {
6981 access_control_translation: None,
6982 account: None,
6983 bucket: r.destination_bucket.clone(),
6984 encryption_configuration: None,
6985 metrics: None,
6986 replication_time: None,
6987 storage_class: r
6988 .destination_storage_class
6989 .as_ref()
6990 .map(|s| StorageClass::from(s.clone())),
6991 };
6992 let filter = Some(replication_filter_to_dto(&r.filter));
6993 ReplicationRule {
6994 delete_marker_replication: None,
6995 destination,
6996 existing_object_replication: None,
6997 filter,
6998 id: Some(r.id.clone()),
6999 prefix: None,
7000 priority: Some(r.priority as i32),
7001 source_selection_criteria: None,
7002 status,
7003 }
7004 })
7005 .collect();
7006 ReplicationConfiguration {
7007 role: cfg.role.clone(),
7008 rules,
7009 }
7010}
7011
7012fn replication_filter_from_dto(
7013 f: Option<&ReplicationRuleFilter>,
7014 rule_level_prefix: Option<&str>,
7015) -> crate::replication::ReplicationFilter {
7016 let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
7017 let mut tags: Vec<(String, String)> = Vec::new();
7018 if let Some(f) = f {
7019 if let Some(p) = f.prefix.as_ref()
7020 && prefix.is_none()
7021 {
7022 prefix = Some(p.clone());
7023 }
7024 if let Some(t) = f.tag.as_ref()
7025 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7026 {
7027 tags.push((k.clone(), v.clone()));
7028 }
7029 if let Some(and) = f.and.as_ref() {
7030 if let Some(p) = and.prefix.as_ref()
7031 && prefix.is_none()
7032 {
7033 prefix = Some(p.clone());
7034 }
7035 if let Some(ts) = and.tags.as_ref() {
7036 for t in ts {
7037 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7038 tags.push((k.clone(), v.clone()));
7039 }
7040 }
7041 }
7042 }
7043 }
7044 crate::replication::ReplicationFilter { prefix, tags }
7045}
7046
7047fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
7048 if f.tags.is_empty() {
7049 ReplicationRuleFilter {
7050 and: None,
7051 prefix: f.prefix.clone(),
7052 tag: None,
7053 }
7054 } else if f.tags.len() == 1 && f.prefix.is_none() {
7055 let (k, v) = &f.tags[0];
7056 ReplicationRuleFilter {
7057 and: None,
7058 prefix: None,
7059 tag: Some(Tag {
7060 key: Some(k.clone()),
7061 value: Some(v.clone()),
7062 }),
7063 }
7064 } else {
7065 let tags: Vec<Tag> = f
7066 .tags
7067 .iter()
7068 .map(|(k, v)| Tag {
7069 key: Some(k.clone()),
7070 value: Some(v.clone()),
7071 })
7072 .collect();
7073 ReplicationRuleFilter {
7074 and: Some(ReplicationRuleAndOperator {
7075 prefix: f.prefix.clone(),
7076 tags: Some(tags),
7077 }),
7078 prefix: None,
7079 tag: None,
7080 }
7081 }
7082}
7083
7084// ---------------------------------------------------------------------------
7085// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
7086// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
7087// The internal representation flattens AWS's "Filter | And" disjunction
7088// into a single `LifecycleFilter` struct of optional fields plus a tag
7089// vector. Fields S4's evaluator does not consume
7090// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
7091// `transition_default_minimum_object_size`, the storage class on the
7092// noncurrent expiration) are dropped on PUT and re-rendered as their
7093// AWS-default shape on GET so the client always sees a well-formed
7094// configuration.
7095// ---------------------------------------------------------------------------
7096
7097fn dto_lifecycle_to_internal(
7098 dto: &BucketLifecycleConfiguration,
7099) -> crate::lifecycle::LifecycleConfig {
7100 crate::lifecycle::LifecycleConfig {
7101 rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
7102 }
7103}
7104
7105fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
7106 let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
7107 let filter = rule
7108 .filter
7109 .as_ref()
7110 .map(dto_filter_to_internal)
7111 .unwrap_or_default();
7112 let expiration_days = rule
7113 .expiration
7114 .as_ref()
7115 .and_then(|e| e.days)
7116 .and_then(|d| u32::try_from(d).ok());
7117 let expiration_date = rule
7118 .expiration
7119 .as_ref()
7120 .and_then(|e| e.date.as_ref())
7121 .and_then(timestamp_to_chrono_utc);
7122 let transitions: Vec<crate::lifecycle::TransitionRule> = rule
7123 .transitions
7124 .as_ref()
7125 .map(|ts| {
7126 ts.iter()
7127 .filter_map(|t| {
7128 let days = u32::try_from(t.days?).ok()?;
7129 let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
7130 Some(crate::lifecycle::TransitionRule {
7131 days,
7132 storage_class,
7133 })
7134 })
7135 .collect()
7136 })
7137 .unwrap_or_default();
7138 let noncurrent_version_expiration_days = rule
7139 .noncurrent_version_expiration
7140 .as_ref()
7141 .and_then(|n| n.noncurrent_days)
7142 .and_then(|d| u32::try_from(d).ok());
7143 let abort_incomplete_multipart_upload_days = rule
7144 .abort_incomplete_multipart_upload
7145 .as_ref()
7146 .and_then(|a| a.days_after_initiation)
7147 .and_then(|d| u32::try_from(d).ok());
7148 crate::lifecycle::LifecycleRule {
7149 id: rule.id.clone().unwrap_or_default(),
7150 status,
7151 filter,
7152 expiration_days,
7153 expiration_date,
7154 transitions,
7155 noncurrent_version_expiration_days,
7156 abort_incomplete_multipart_upload_days,
7157 }
7158}
7159
7160fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
7161 let mut prefix = filter.prefix.clone();
7162 let mut tags: Vec<(String, String)> = Vec::new();
7163 let mut size_gt: Option<u64> = filter
7164 .object_size_greater_than
7165 .and_then(|n| u64::try_from(n).ok());
7166 let mut size_lt: Option<u64> = filter
7167 .object_size_less_than
7168 .and_then(|n| u64::try_from(n).ok());
7169 if let Some(t) = &filter.tag
7170 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7171 {
7172 tags.push((k.clone(), v.clone()));
7173 }
7174 if let Some(and) = &filter.and {
7175 if prefix.is_none() {
7176 prefix = and.prefix.clone();
7177 }
7178 if size_gt.is_none() {
7179 size_gt = and
7180 .object_size_greater_than
7181 .and_then(|n| u64::try_from(n).ok());
7182 }
7183 if size_lt.is_none() {
7184 size_lt = and
7185 .object_size_less_than
7186 .and_then(|n| u64::try_from(n).ok());
7187 }
7188 if let Some(ts) = &and.tags {
7189 for t in ts {
7190 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7191 tags.push((k.clone(), v.clone()));
7192 }
7193 }
7194 }
7195 }
7196 crate::lifecycle::LifecycleFilter {
7197 prefix,
7198 tags,
7199 object_size_greater_than: size_gt,
7200 object_size_less_than: size_lt,
7201 }
7202}
7203
7204fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
7205 let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
7206 Some(LifecycleExpiration {
7207 date: rule.expiration_date.map(chrono_utc_to_timestamp),
7208 days: rule.expiration_days.map(|d| d as i32),
7209 expired_object_delete_marker: None,
7210 })
7211 } else {
7212 None
7213 };
7214 let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
7215 None
7216 } else {
7217 Some(
7218 rule.transitions
7219 .iter()
7220 .map(|t| Transition {
7221 date: None,
7222 days: Some(t.days as i32),
7223 storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
7224 })
7225 .collect(),
7226 )
7227 };
7228 let noncurrent_version_expiration =
7229 rule.noncurrent_version_expiration_days
7230 .map(|d| NoncurrentVersionExpiration {
7231 newer_noncurrent_versions: None,
7232 noncurrent_days: Some(d as i32),
7233 });
7234 let abort_incomplete_multipart_upload =
7235 rule.abort_incomplete_multipart_upload_days
7236 .map(|d| AbortIncompleteMultipartUpload {
7237 days_after_initiation: Some(d as i32),
7238 });
7239 let filter = if rule.filter.tags.is_empty()
7240 && rule.filter.object_size_greater_than.is_none()
7241 && rule.filter.object_size_less_than.is_none()
7242 {
7243 rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7244 and: None,
7245 object_size_greater_than: None,
7246 object_size_less_than: None,
7247 prefix: Some(p.clone()),
7248 tag: None,
7249 })
7250 } else if rule.filter.tags.len() == 1
7251 && rule.filter.prefix.is_none()
7252 && rule.filter.object_size_greater_than.is_none()
7253 && rule.filter.object_size_less_than.is_none()
7254 {
7255 let (k, v) = rule.filter.tags[0].clone();
7256 Some(LifecycleRuleFilter {
7257 and: None,
7258 object_size_greater_than: None,
7259 object_size_less_than: None,
7260 prefix: None,
7261 tag: Some(Tag {
7262 key: Some(k),
7263 value: Some(v),
7264 }),
7265 })
7266 } else {
7267 let tags = if rule.filter.tags.is_empty() {
7268 None
7269 } else {
7270 Some(
7271 rule.filter
7272 .tags
7273 .iter()
7274 .map(|(k, v)| Tag {
7275 key: Some(k.clone()),
7276 value: Some(v.clone()),
7277 })
7278 .collect(),
7279 )
7280 };
7281 Some(LifecycleRuleFilter {
7282 and: Some(LifecycleRuleAndOperator {
7283 object_size_greater_than: rule
7284 .filter
7285 .object_size_greater_than
7286 .and_then(|n| i64::try_from(n).ok()),
7287 object_size_less_than: rule
7288 .filter
7289 .object_size_less_than
7290 .and_then(|n| i64::try_from(n).ok()),
7291 prefix: rule.filter.prefix.clone(),
7292 tags,
7293 }),
7294 object_size_greater_than: None,
7295 object_size_less_than: None,
7296 prefix: None,
7297 tag: None,
7298 })
7299 };
7300 LifecycleRule {
7301 abort_incomplete_multipart_upload,
7302 expiration,
7303 filter,
7304 id: if rule.id.is_empty() {
7305 None
7306 } else {
7307 Some(rule.id.clone())
7308 },
7309 noncurrent_version_expiration,
7310 noncurrent_version_transitions: None,
7311 prefix: None,
7312 status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7313 transitions,
7314 }
7315}
7316
7317// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7318// `chrono_utc_to_timestamp` are defined earlier in this file for the
7319// tagging/notifications work; the lifecycle DTO converters reuse them.)
7320
7321// ---------------------------------------------------------------------------
7322// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7323//
7324// Kept as a self-contained block at the bottom of the file so it doesn't
7325// touch the existing `S4Service` struct, `new()`, or any of the per-op
7326// handlers above. The hook is wired in by the binary at server-build time
7327// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7328//
7329// Lifecycle:
7330// 1. `SigV4aGate::new(store)` is constructed once at boot from the
7331// operator-supplied credential directory.
//   2. For each incoming request, `SigV4aGate::pre_route(&req,
//      &requested_region, &canonical_request_bytes)` is invoked BEFORE
//      the request hits the S3 framework. If the request claims SigV4a
//      and verifies, control returns to the framework. Otherwise a
//      `SigV4aGateError` is returned; its `s3_error_code()` /
//      `http_status()` give the AWS-style error to send (for example
//      403 `SignatureDoesNotMatch`).
7337// 3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7338// ---------------------------------------------------------------------------
7339
/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
///
/// Wraps a [`crate::sigv4a::SharedSigV4aCredentialStore`] and exposes a
/// `pre_route` entry point that returns `Ok(())` for both
/// "request is plain SigV4 — pass through" and "request is SigV4a and
/// verified", and an `Err(SigV4aGateError)` otherwise. The error carries
/// the AWS error code and HTTP status to respond with (403 for most
/// failures, 400 for malformed date / credential-scope input). Cheap to
/// clone (the inner store is `Arc`-backed).
///
/// v0.8.4 #76 (audit H-6): the gate now enforces an `x-amz-date`
/// freshness window (default 15 min, AWS-spec) and a strict credential
/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`), shutting the
/// captured-request replay vector — previously a stolen valid SigV4a
/// signature could be replayed indefinitely (including DELETE).
#[derive(Debug, Clone)]
pub struct SigV4aGate {
    /// Credential store used to look up the verification key for the
    /// access-key-id named in the request's `Authorization` header.
    store: crate::sigv4a::SharedSigV4aCredentialStore,
    /// v0.8.4 #76: how far the request's `x-amz-date` may drift from
    /// the server's clock before being rejected with 403
    /// `RequestTimeTooSkewed`. Matches the AWS S3 spec default of
    /// 15 min when constructed via [`SigV4aGate::new`]; the operator
    /// can override via [`SigV4aGate::with_skew_tolerance`] (CLI flag
    /// `--sigv4a-skew-tolerance-seconds`).
    skew_tolerance: chrono::Duration,
}
7364
7365impl SigV4aGate {
7366 /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7367 pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
7368
7369 #[must_use]
7370 pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
7371 Self {
7372 store,
7373 skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
7374 }
7375 }
7376
7377 /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
7378 /// 15 min). Operators can widen this for high-clock-drift
7379 /// environments or tighten it for compliance regimes that demand
7380 /// stricter freshness.
7381 #[must_use]
7382 pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
7383 self.skew_tolerance = skew;
7384 self
7385 }
7386
7387 /// Read the configured skew tolerance — exposed mostly for test +
7388 /// observability use.
7389 #[must_use]
7390 pub fn skew_tolerance(&self) -> chrono::Duration {
7391 self.skew_tolerance
7392 }
7393
7394 /// Inspect an incoming HTTP request. Behaviour:
7395 ///
7396 /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
7397 /// prefix) → returns `Ok(())`; the framework's existing SigV4
7398 /// path handles the request.
7399 /// - SigV4a + valid signature + region match + fresh x-amz-date
7400 /// → `Ok(())`.
7401 /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
7402 /// - SigV4a + bad signature / region mismatch → `Err` with
7403 /// `SignatureDoesNotMatch`.
7404 /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
7405 /// the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
7406 /// et al.).
7407 ///
7408 /// `canonical_request_bytes` is the SigV4a string-to-sign (or
7409 /// canonical-request bytes; the caller decides) that the framework
7410 /// has already produced for this request. Keeping it as a parameter
7411 /// instead of rebuilding it inside the hook avoids duplicating the
7412 /// canonicalisation logic.
7413 pub fn pre_route<B>(
7414 &self,
7415 req: &http::Request<B>,
7416 requested_region: &str,
7417 canonical_request_bytes: &[u8],
7418 ) -> Result<(), SigV4aGateError> {
7419 self.pre_route_at(
7420 req,
7421 requested_region,
7422 canonical_request_bytes,
7423 chrono::Utc::now(),
7424 )
7425 }
7426
7427 /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
7428 /// tests that need to pin the freshness clock. Production callers
7429 /// use `pre_route` (which calls `chrono::Utc::now()`).
7430 pub fn pre_route_at<B>(
7431 &self,
7432 req: &http::Request<B>,
7433 requested_region: &str,
7434 canonical_request_bytes: &[u8],
7435 now: chrono::DateTime<chrono::Utc>,
7436 ) -> Result<(), SigV4aGateError> {
7437 if !crate::sigv4a::detect(req) {
7438 return Ok(());
7439 }
7440 let auth_hdr = req
7441 .headers()
7442 .get(http::header::AUTHORIZATION)
7443 .and_then(|v| v.to_str().ok())
7444 .ok_or(SigV4aGateError::MissingAuthorization)?;
7445 let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
7446 .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
7447 let region_set = req
7448 .headers()
7449 .get(crate::sigv4a::REGION_SET_HEADER)
7450 .and_then(|v| v.to_str().ok())
7451 .unwrap_or("*");
7452 let key = self
7453 .store
7454 .get(&parsed.access_key_id)
7455 .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
7456 // v0.8.4 #76: snapshot the request headers into a
7457 // lowercase-keyed flat map so `verify_request` can do the
7458 // x-amz-date freshness checks without taking a generic
7459 // `HeaderMap` dep. Cheap because the headers list is tiny.
7460 //
7461 // v0.8.5 #84 (audit H-4): detect duplicate header names while
7462 // we flatten — `HashMap::insert` would silently overwrite the
7463 // first value with the second, mirroring the auth-confusion
7464 // vector the canonical-request builder also defends against.
7465 // Reject upfront so the rest of the gate (freshness check,
7466 // ECDSA verify) never sees a half-truncated header set. We
7467 // detect by checking `contains_key` *before* insertion rather
7468 // than by counting via `headers().get_all`, because the
7469 // upstream `HeaderMap` iteration yields each duplicate entry
7470 // as its own (name, value) pair — the second-seen entry is
7471 // exactly what `contains_key` traps.
7472 let mut header_map: std::collections::HashMap<String, String> =
7473 std::collections::HashMap::with_capacity(req.headers().len());
7474 for (name, value) in req.headers() {
7475 if let Ok(v) = value.to_str() {
7476 let lower = name.as_str().to_ascii_lowercase();
7477 if header_map.contains_key(&lower) {
7478 return Err(SigV4aGateError::Verify(
7479 crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
7480 ));
7481 }
7482 header_map.insert(lower, v.to_string());
7483 }
7484 }
7485 crate::sigv4a::verify_request(
7486 &parsed,
7487 &header_map,
7488 canonical_request_bytes,
7489 key,
7490 region_set,
7491 requested_region,
7492 now,
7493 self.skew_tolerance,
7494 )
7495 .map_err(SigV4aGateError::Verify)?;
7496 Ok(())
7497 }
7498}
7499
/// Failure modes from [`SigV4aGate::pre_route`].
///
/// Each variant maps to an AWS-style error code via
/// [`SigV4aGateError::s3_error_code`] (`InvalidAccessKeyId`,
/// `SignatureDoesNotMatch`, `RequestTimeTooSkewed` or `InvalidRequest`)
/// and to an HTTP status via [`SigV4aGateError::http_status`] (400 for
/// malformed date / credential-scope input, 403 for everything else).
#[derive(Debug, thiserror::Error)]
pub enum SigV4aGateError {
    /// The request was detected as SigV4a but has no `Authorization`
    /// header, or the header value is not readable as a string.
    #[error("missing Authorization header")]
    MissingAuthorization,
    /// The `Authorization` header could not be parsed as SigV4a.
    #[error("malformed SigV4a Authorization header")]
    MalformedAuthorization,
    /// The access-key-id (carried in the payload) is not present in the
    /// credential store.
    #[error("unknown SigV4a access-key-id: {0}")]
    UnknownAccessKey(String),
    /// Verification failed: either `crate::sigv4a::verify_request`
    /// rejected the request or a duplicated header name was found.
    #[error("SigV4a verification failed: {0}")]
    Verify(#[source] crate::sigv4a::SigV4aError),
}
7515
7516impl SigV4aGateError {
7517 /// AWS S3 error code that should accompany the response.
7518 ///
7519 /// v0.8.4 #76 (audit H-6): the freshness check surfaces
7520 /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
7521 /// failures surface as `InvalidRequest` (400); other failures stay
7522 /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
7523 /// surface stays AWS-compatible.
7524 #[must_use]
7525 pub fn s3_error_code(&self) -> &'static str {
7526 match self {
7527 Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
7528 Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
7529 "RequestTimeTooSkewed"
7530 }
7531 Self::Verify(
7532 crate::sigv4a::SigV4aError::MissingXAmzDate
7533 | crate::sigv4a::SigV4aError::InvalidDateFormat
7534 | crate::sigv4a::SigV4aError::DateScopeMismatch
7535 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7536 | crate::sigv4a::SigV4aError::InvalidTerminator
7537 | crate::sigv4a::SigV4aError::WrongService { .. }
7538 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7539 ) => "InvalidRequest",
7540 _ => "SignatureDoesNotMatch",
7541 }
7542 }
7543
7544 /// HTTP status code to accompany the response. v0.8.4 #76: format
7545 /// errors that are clearly client mistakes (missing / malformed
7546 /// `x-amz-date`, malformed credential scope, wrong service) are
7547 /// surfaced as 400 InvalidRequest; the rest stay 403.
7548 #[must_use]
7549 pub fn http_status(&self) -> http::StatusCode {
7550 match self {
7551 Self::Verify(
7552 crate::sigv4a::SigV4aError::MissingXAmzDate
7553 | crate::sigv4a::SigV4aError::InvalidDateFormat
7554 | crate::sigv4a::SigV4aError::DateScopeMismatch
7555 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7556 | crate::sigv4a::SigV4aError::InvalidTerminator
7557 | crate::sigv4a::SigV4aError::WrongService { .. }
7558 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7559 ) => http::StatusCode::BAD_REQUEST,
7560 _ => http::StatusCode::FORBIDDEN,
7561 }
7562 }
7563}
7564
7565#[cfg(test)]
7566mod tests {
7567 use super::*;
7568
7569 #[test]
7570 fn manifest_roundtrip_via_metadata() {
7571 let original = ChunkManifest {
7572 codec: CodecKind::CpuZstd,
7573 original_size: 1234,
7574 compressed_size: 567,
7575 crc32c: 0xdead_beef,
7576 };
7577 let mut meta: Option<Metadata> = None;
7578 write_manifest(&mut meta, &original);
7579 let extracted = extract_manifest(&meta).expect("manifest must round-trip");
7580 assert_eq!(extracted.codec, original.codec);
7581 assert_eq!(extracted.original_size, original.original_size);
7582 assert_eq!(extracted.compressed_size, original.compressed_size);
7583 assert_eq!(extracted.crc32c, original.crc32c);
7584 }
7585
7586 #[test]
7587 fn missing_metadata_yields_none() {
7588 let meta: Option<Metadata> = None;
7589 assert!(extract_manifest(&meta).is_none());
7590 }
7591
7592 #[test]
7593 fn partial_metadata_yields_none() {
7594 let mut meta = Metadata::new();
7595 meta.insert(META_CODEC.into(), "cpu-zstd".into());
7596 let opt = Some(meta);
7597 assert!(extract_manifest(&opt).is_none());
7598 }
7599
7600 #[test]
7601 fn parse_copy_source_range_basic() {
7602 let r = parse_copy_source_range("bytes=10-20").unwrap();
7603 match r {
7604 s3s::dto::Range::Int { first, last } => {
7605 assert_eq!(first, 10);
7606 assert_eq!(last, Some(20));
7607 }
7608 _ => panic!("expected Int range"),
7609 }
7610 }
7611
7612 #[test]
7613 fn parse_copy_source_range_rejects_inverted() {
7614 let err = parse_copy_source_range("bytes=20-10").unwrap_err();
7615 assert!(err.contains("last < first"));
7616 }
7617
7618 #[test]
7619 fn parse_copy_source_range_rejects_missing_prefix() {
7620 let err = parse_copy_source_range("10-20").unwrap_err();
7621 assert!(err.contains("must start with 'bytes='"));
7622 }
7623
7624 #[test]
7625 fn parse_copy_source_range_rejects_open_ended() {
7626 // S3 upload_part_copy spec requires N-M (closed); suffix and
7627 // open-ended forms are not allowed for this header.
7628 assert!(parse_copy_source_range("bytes=10-").is_err());
7629 assert!(parse_copy_source_range("bytes=-10").is_err());
7630 }
7631
7632 // v0.7 #49: safe_object_uri must round-trip every legal S3 key
7633 // (which includes spaces, slashes, control chars, raw UTF-8) into
7634 // a parseable `http::Uri` instead of panicking like the previous
7635 // `format!(...).parse().unwrap()` call sites did.
7636
7637 #[test]
7638 fn safe_object_uri_basic_ascii() {
7639 let uri = safe_object_uri("bucket", "key").expect("ascii must be safe");
7640 assert_eq!(uri.path(), "/bucket/key");
7641 }
7642
7643 #[test]
7644 fn safe_object_uri_encodes_spaces() {
7645 let uri = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
7646 // RFC 3986 path-segment encoding turns ' ' into %20.
7647 assert!(
7648 uri.path().contains("%20"),
7649 "expected percent-encoded space, got {}",
7650 uri.path()
7651 );
7652 assert!(uri.path().starts_with("/bucket/"));
7653 }
7654
7655 #[test]
7656 fn safe_object_uri_preserves_slashes() {
7657 // S3 keys legally contain '/' as a logical path separator —
7658 // the helper must NOT escape it (otherwise the synthetic URI
7659 // changes the perceived hierarchy).
7660 let uri = safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
7661 assert_eq!(uri.path(), "/bucket/key/with/slashes");
7662 }
7663
7664 #[test]
7665 fn safe_object_uri_handles_newline_without_panic() {
7666 // Newlines are control chars in URIs; whether the result is
7667 // Ok (encoded as %0A) or Err (parse rejects), the helper
7668 // MUST NOT panic. Either outcome is acceptable.
7669 let _ = safe_object_uri("bucket", "key\n");
7670 }
7671
7672 #[test]
7673 fn safe_object_uri_handles_null_byte_without_panic() {
7674 let _ = safe_object_uri("bucket", "key\0bad");
7675 }
7676
7677 #[test]
7678 fn safe_object_uri_handles_unicode_without_panic() {
7679 // RTL override, BOM, plain Japanese — none should panic.
7680 let _ = safe_object_uri("bucket", "rtl\u{202E}override");
7681 let _ = safe_object_uri("bucket", "\u{FEFF}bom-key");
7682 let _ = safe_object_uri("bucket", "日本語キー");
7683 }
7684
7685 #[test]
7686 fn safe_object_uri_no_panic_for_every_byte() {
7687 // Exhaustive byte coverage: 0x00..=0xFF as a 1-byte key.
7688 // None of these may panic. (0x80..=0xFF are not valid UTF-8
7689 // by themselves; we go through `String::from_utf8_lossy` so
7690 // the helper sees a real `&str` regardless of the raw byte.)
7691 for b in 0u8..=255 {
7692 let s = String::from_utf8_lossy(&[b]).into_owned();
7693 let _ = safe_object_uri("bucket", &s);
7694 }
7695 }
7696
7697 /// v0.8.1 #58: smoke test for the DEK-handling shape used by the
7698 /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
7699 /// Mirrors the call pattern (generate_dek → length check → copy
7700 /// into stack `[u8; 32]` → reborrow as `&[u8; 32]` for `SseSource`)
7701 /// without spinning up a full `S4Service`.
7702 ///
7703 /// The real assertion this guards against is a regression where
7704 /// the `Zeroizing` wrapper is accidentally dropped before the
7705 /// stack copy lands (e.g. someone refactors to use
7706 /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
7707 /// or where `&**dek` is rewritten in a way that doesn't compile.
7708 #[tokio::test]
7709 async fn kms_dek_lifetime_within_function_scope() {
7710 use crate::kms::{KmsBackend, LocalKms};
7711 use std::collections::HashMap;
7712 use std::path::PathBuf;
7713 use zeroize::Zeroizing;
7714
7715 let mut keks = HashMap::new();
7716 keks.insert("scope".to_string(), [33u8; 32]);
7717 let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);
7718
7719 // Mirror the put_object KMS branch shape exactly.
7720 let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
7721 assert_eq!(dek.len(), 32);
7722 let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
7723 dek_arr.copy_from_slice(&dek);
7724
7725 // The reborrow used at the SseSource construction site —
7726 // mirrors the call-site pattern where `let dek_ref: &[u8; 32]`
7727 // auto-derefs from a `Zeroizing<[u8; 32]>` reference.
7728 let dek_ref: &[u8; 32] = &dek_arr;
7729 // Sanity: the reborrow points at the same bytes.
7730 assert_eq!(dek_ref, &*dek_arr);
7731 // Wrapped key id flows through unchanged.
7732 assert_eq!(wrapped.key_id, "scope");
7733
7734 // At end of scope, both `dek` (Zeroizing<Vec<u8>>) and
7735 // `dek_arr` (Zeroizing<[u8; 32]>) are dropped, wiping the
7736 // backing memory. Cannot directly assert the wipe (would be
7737 // UB to read freed memory), so this test instead enforces
7738 // that the call shape compiles and executes; the wipe itself
7739 // is exercised by the `zeroize` crate's own test suite.
7740 }
7741
7742 /// v0.8.5 #86 (audit M-2): the replication dispatcher must
7743 /// `acquire_owned()` a permit from `replication_semaphore` before
7744 /// kicking off the destination PUT, so a saturated semaphore
7745 /// back-pressures the in-flight queue depth instead of letting it
7746 /// grow without bound. We exercise the field directly (initial
7747 /// permit count, override via `with_replication_max_concurrent`,
7748 /// permit drop on `Drop`) — the full `spawn_replication_if_matched`
7749 /// integration is exercised by the existing replication tests in
7750 /// `tests/feature_e2e.rs` once a `ReplicationManager` is attached.
7751 #[tokio::test]
7752 async fn replication_semaphore_caps_concurrent_dispatchers() {
7753 // Build a minimal `S4Service` directly — no handler path is
7754 // exercised, only the constructor + setter + accessor shape.
7755 let registry = Arc::new(
7756 CodecRegistry::new(CodecKind::Passthrough)
7757 .with(Arc::new(s4_codec::passthrough::Passthrough)),
7758 );
7759 let dispatcher = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
7760 CodecKind::Passthrough,
7761 ));
7762 let s4 = S4Service::new(NoopBackend, registry, dispatcher);
7763
7764 // Default cap matches the documented constant.
7765 assert_eq!(
7766 s4.replication_semaphore().available_permits(),
7767 S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
7768 "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
7769 );
7770
7771 // Override via the builder — replaces the underlying `Semaphore`.
7772 let s4 = s4.with_replication_max_concurrent(2);
7773 assert_eq!(
7774 s4.replication_semaphore().available_permits(),
7775 2,
7776 "with_replication_max_concurrent(2) must expose exactly 2 permits"
7777 );
7778
7779 // Acquiring permits must reduce `available_permits()` and
7780 // dropping them must restore the count — this is the contract
7781 // `spawn_replication_if_matched` relies on for back-pressure.
7782 let sem = Arc::clone(s4.replication_semaphore());
7783 let p1 = sem.clone().acquire_owned().await.expect("permit 1");
7784 let p2 = sem.clone().acquire_owned().await.expect("permit 2");
7785 assert_eq!(
7786 sem.available_permits(),
7787 0,
7788 "two acquired permits must zero `available_permits()`"
7789 );
7790 // A third `try_acquire_owned` must fail — the cap is enforced
7791 // synchronously, no extra spawn slips through.
7792 assert!(
7793 sem.clone().try_acquire_owned().is_err(),
7794 "third acquire must back-pressure: cap was 2"
7795 );
7796 drop(p1);
7797 drop(p2);
7798 assert_eq!(
7799 sem.available_permits(),
7800 2,
7801 "dropping permits must restore cap"
7802 );
7803
7804 // Lower-bound clamp: a 0 cap would deadlock all dispatchers,
7805 // so the setter clamps it to 1 instead of accepting it
7806 // (callers are warned in the CLI doc).
7807 let s4 = s4.with_replication_max_concurrent(0);
7808 assert_eq!(
7809 s4.replication_semaphore().available_permits(),
7810 1,
7811 "cap=0 must be clamped to 1 to avoid total deadlock"
7812 );
7813 }
7814
7815 /// v0.8.5 #86 (audit M-1): the access-log flusher must return a
7816 /// `JoinHandle<()>` that the caller can `abort()` on shutdown
7817 /// without leaving a dangling task. The pre-#86 call site dropped
7818 /// the handle at end-of-block (silently detaching it); the fix is
7819 /// hoisting it into a process-lived `Vec` so the graceful-shutdown
7820 /// branch in `main.rs` can wait for clean exit. This test exercises
7821 /// the `JoinHandle.abort()` shape directly so a future refactor that
7822 /// stops returning the handle (or returns a non-abortable wrapper)
7823 /// trips this regression guard.
7824 #[tokio::test]
7825 async fn flusher_handle_can_be_aborted_cleanly() {
7826 // Stand up a minimal `AccessLog` pointing at a tmp dir so the
7827 // flusher's `create_dir_all` succeeds. The dir is cleaned up
7828 // by the OS / test harness; we don't assert on the contents.
7829 let tmp = std::env::temp_dir().join(format!(
7830 "s4-86-flusher-{}-{}",
7831 std::process::id(),
7832 std::time::SystemTime::now()
7833 .duration_since(std::time::UNIX_EPOCH)
7834 .map(|d| d.as_nanos())
7835 .unwrap_or(0)
7836 ));
7837 let dest = crate::access_log::AccessLogDest { dir: tmp.clone() };
7838 let log = crate::access_log::AccessLog::new(dest);
7839 let handle = log.spawn_flusher(None);
7840 assert!(
7841 !handle.is_finished(),
7842 "freshly-spawned flusher must not yet be finished"
7843 );
7844 handle.abort();
7845 // `await`-ing an aborted handle returns `Err(JoinError)` whose
7846 // `is_cancelled()` is true.
7847 let join_result = handle.await;
7848 assert!(
7849 join_result.is_err(),
7850 "aborted flusher must surface JoinError, got Ok"
7851 );
7852 assert!(
7853 join_result.unwrap_err().is_cancelled(),
7854 "JoinError must report .is_cancelled() = true after abort()"
7855 );
7856 let _ = std::fs::remove_dir_all(&tmp);
7857 }
7858
    /// Stub backend used solely by the v0.8.5 #86 unit tests above —
    /// the `S4Service` constructor needs `B: S3` but the tests only
    /// exercise builder / accessor shape, never a handler call. Every
    /// `S3` method falls through to the trait's default
    /// `NotImplemented` (which `s3s` provides automatically).
    struct NoopBackend;

    // Deliberately empty: no method is overridden, so every operation
    // uses the trait's default implementation.
    #[async_trait::async_trait]
    impl S3 for NoopBackend {}
7868
7869 /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the
7870 /// dispatcher spawn site must intercept a panicking inner future,
7871 /// log at ERROR, and bump the per-kind counter — instead of letting
7872 /// the panic propagate as a `JoinError` that no operator dashboard
7873 /// scrapes. We exercise the wrapper directly (rather than driving a
7874 /// full `spawn_replication_if_matched` end-to-end, which would
7875 /// require a full `S4Service` + backend) because the wrapper shape
7876 /// is the load-bearing piece — any inner-future swap would still
7877 /// route through the same `AssertUnwindSafe(...).catch_unwind()`
7878 /// closure we want to lock in here.
7879 #[tokio::test]
7880 async fn dispatcher_panic_caught_and_metric_bumped() {
7881 use futures::FutureExt as _;
7882
7883 let handle = crate::metrics::test_metrics_handle();
7884 let kind = "replication";
7885
7886 // Mirror the production wrapper shape verbatim — if the
7887 // production code ever stops using `AssertUnwindSafe.catch_unwind`
7888 // this test shouldn't keep passing on a hand-rolled copy that
7889 // diverged.
7890 let panicking = async {
7891 panic!("simulated dispatcher panic");
7892 };
7893 let result = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
7894 assert!(
7895 result.is_err(),
7896 "catch_unwind must surface the panic instead of swallowing it"
7897 );
7898 // Bump the production counter via the same helper the wrapper
7899 // calls so the rendered output gates on the production code
7900 // path, not a parallel bookkeeping copy.
7901 crate::metrics::record_dispatcher_panic(kind);
7902
7903 let rendered = handle.render();
7904 assert!(
7905 rendered.contains("s4_dispatcher_panics_total"),
7906 "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
7907 );
7908 assert!(
7909 rendered.contains("kind=\"replication\""),
7910 "expected kind=\"replication\" label in metrics output, got: {rendered}"
7911 );
7912 }
7913}