s4_server/service.rs
1//! `s3s::S3` 実装 — `s3s_aws::Proxy` への delegation を default にしつつ、
2//! `put_object` / `get_object` 経路で `s4_codec::CodecRegistry` を呼ぶ。
3//!
4//! ## カバー範囲 (Phase 1 月 2)
5//!
6//! - 圧縮 hook あり: `put_object`, `get_object`
7//! - 純 delegation (圧縮なし): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
8//! `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
9//! `list_objects_v2`, `create_multipart_upload`, `upload_part`,
10//! `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
11//! `list_parts`
12//! - 未対応 (デフォルトで NotImplemented): その他 80+ ops (Tagging / ACL / Lifecycle 等は Phase 2)
13//!
14//! ## アーキテクチャ
15//!
16//! - `S4Service<B>` は backend (B: S3) と `Arc<CodecRegistry>` と `Arc<dyn CodecDispatcher>`
17//! を保持する。`CodecRegistry` 経由で複数 codec を抱えられるので、ひとつの S4 インスタンスが
18//! 複数 codec で書かれた object を透過的に GET できる
19//! - PUT: dispatcher が body の先頭 sample から codec を選び、registry で compress、
20//! manifest を S3 metadata に書いて backend に forward
21//! - GET: backend から取得 → metadata から manifest を復元 → registry.decompress で
22//! manifest 指定の codec で解凍 → 元の bytes を return
23//!
24//! ## 既知の制限事項
25//!
26//! - **Multipart Upload は per-part 圧縮が未実装**: 現状は upload_part を素通し。
27//! Phase 1 月 2 後半で per-part compress + complete_multipart_upload で manifest 集約。
28//! - **PUT body は memory に collect**: max_body_bytes 上限あり (default 5 GiB = S3 単発 PUT 上限)。
29//! Streaming-aware 圧縮は Phase 2。
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39 FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40 write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47 bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50 Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51 pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52 supports_streaming_decompress,
53};
54
55/// Maximum number of bytes taken from the head of a PUT body for sampling.
56const SAMPLE_BYTES: usize = 4096;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66 if let Some(secs) = tel.gpu_seconds {
67 crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68 }
69 if tel.oom {
70 crate::metrics::record_gpu_oom(tel.codec);
71 }
72}
73
74/// v0.7 #49: percent-encoding set applied by `safe_object_uri` to the
75/// bucket and key before they are spliced into the synthetic request
76/// URI. It is `percent_encoding::CONTROLS` plus space and the ASCII
77/// characters `"`, `#`, `<`, `>`, `?`, backtick, `{`, `}`, `|`, `\`,
78/// `^`, `[`, `]` and `%`. `/` is deliberately left un-encoded because
79/// S3 keys legally use `/` as a logical separator and the synthetic URI
80/// relies on the path layout `/{bucket}/{key}` round-tripping.
///
/// Note that this is narrower than "everything outside the RFC 3986
/// §2.3 unreserved set": ASCII punctuation that is not listed above
/// (e.g. `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `:`, `;`, `=`,
/// `@`) is passed through unchanged.
///
/// NOTE(review): per the `percent-encoding` crate docs, non-ASCII bytes
/// are always escaped by `utf8_percent_encode` regardless of the set, so
/// they do not need to be listed here — confirm against the crate
/// version in use.
81const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
82 .add(b' ')
83 .add(b'"')
84 .add(b'#')
85 .add(b'<')
86 .add(b'>')
87 .add(b'?')
88 .add(b'`')
89 .add(b'{')
90 .add(b'}')
91 .add(b'|')
92 .add(b'\\')
93 .add(b'^')
94 .add(b'[')
95 .add(b']')
96 .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110 use percent_encoding::utf8_percent_encode;
111 let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112 let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113 let raw = format!("/{bucket_enc}/{key_enc}");
114 raw.parse::<http::Uri>().map_err(|e| {
115 // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116 // can't be represented in a request URI. The generated
117 // `S3ErrorCode` enum doesn't expose a typed variant for it,
118 // so we round-trip through `from_bytes` which preserves the
119 // canonical wire string while falling back to InvalidArgument
120 // if even that lookup fails (cannot happen at runtime — kept
121 // as a belt-and-suspenders branch so this helper never
122 // panics).
123 let code =
124 S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125 S3Error::with_message(
126 code,
127 format!("object key cannot be encoded as a request URI: {e}"),
128 )
129 })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150 body: &[u8],
151 content_md5_b64: Option<&str>,
152 checksum_crc32_b64: Option<&str>,
153 checksum_crc32c_b64: Option<&str>,
154 checksum_sha1_b64: Option<&str>,
155 checksum_sha256_b64: Option<&str>,
156 checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158 use base64::Engine as _;
159 use md5::Md5;
160 use sha2::Sha256;
161 // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162 // trait methods into scope. Bind anonymously so this `use` is
163 // never flagged as unused while still serving its real purpose.
164 use md5::Digest as _;
165 let b64 = base64::engine::general_purpose::STANDARD;
166 let bad = |what: &str| {
167 let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168 S3Error::with_message(
169 code,
170 format!("client-supplied {what} did not match the received body"),
171 )
172 };
173 if let Some(claimed) = content_md5_b64 {
174 let want = b64.decode(claimed).map_err(|_| {
175 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176 })?;
177 if want.len() != 16 {
178 return Err(S3Error::with_message(
179 S3ErrorCode::InvalidDigest,
180 "Content-MD5 must decode to 16 bytes",
181 ));
182 }
183 let mut h = Md5::new();
184 h.update(body);
185 let got = h.finalize();
186 // `subtle::ConstantTimeEq` would be ideal but the existing
187 // `constant_time_eq` helper in sse.rs is private; use a
188 // straightforward byte compare. The attacker doesn't get to
189 // choose the body retroactively, so a timing oracle here
190 // doesn't help them. `&got[..]` derefs the GenericArray
191 // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192 // generic-array 1.x; CI runs `-D warnings`).
193 if got[..] != *want.as_slice() {
194 return Err(bad("Content-MD5"));
195 }
196 }
197 if let Some(claimed) = checksum_crc32c_b64 {
198 let want = b64.decode(claimed).map_err(|_| {
199 S3Error::with_message(
200 S3ErrorCode::InvalidDigest,
201 "malformed x-amz-checksum-crc32c",
202 )
203 })?;
204 if want.len() != 4 {
205 return Err(S3Error::with_message(
206 S3ErrorCode::InvalidDigest,
207 "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208 ));
209 }
210 let got = crc32c::crc32c(body).to_be_bytes();
211 if got != want.as_slice() {
212 return Err(bad("x-amz-checksum-crc32c"));
213 }
214 }
215 if let Some(claimed) = checksum_sha256_b64 {
216 let want = b64.decode(claimed).map_err(|_| {
217 S3Error::with_message(
218 S3ErrorCode::InvalidDigest,
219 "malformed x-amz-checksum-sha256",
220 )
221 })?;
222 if want.len() != 32 {
223 return Err(S3Error::with_message(
224 S3ErrorCode::InvalidDigest,
225 "x-amz-checksum-sha256 must decode to 32 bytes",
226 ));
227 }
228 let mut h = Sha256::new();
229 h.update(body);
230 let got = h.finalize();
231 if got[..] != *want.as_slice() {
232 return Err(bad("x-amz-checksum-sha256"));
233 }
234 }
235 // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236 // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237 // big-endian value, base64-encoded.
238 if let Some(claimed) = checksum_crc32_b64 {
239 let want = b64.decode(claimed).map_err(|_| {
240 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241 })?;
242 if want.len() != 4 {
243 return Err(S3Error::with_message(
244 S3ErrorCode::InvalidDigest,
245 "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246 ));
247 }
248 let mut h = crc32fast::Hasher::new();
249 h.update(body);
250 let got = h.finalize().to_be_bytes();
251 if got != want.as_slice() {
252 return Err(bad("x-amz-checksum-crc32"));
253 }
254 }
255 // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256 if let Some(claimed) = checksum_sha1_b64 {
257 use sha1::Sha1;
258 let want = b64.decode(claimed).map_err(|_| {
259 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260 })?;
261 if want.len() != 20 {
262 return Err(S3Error::with_message(
263 S3ErrorCode::InvalidDigest,
264 "x-amz-checksum-sha1 must decode to 20 bytes",
265 ));
266 }
267 let mut h = Sha1::new();
268 h.update(body);
269 let got = h.finalize();
270 if got[..] != *want.as_slice() {
271 return Err(bad("x-amz-checksum-sha1"));
272 }
273 }
274 // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275 // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276 // 0xffffffffffffffff, refin / refout true. The reflected
277 // polynomial + 256-entry lookup table are computed lazily on
278 // first call (small enough to inline rather than pull in a
279 // dedicated crc64 crate).
280 if let Some(claimed) = checksum_crc64nvme_b64 {
281 let want = b64.decode(claimed).map_err(|_| {
282 S3Error::with_message(
283 S3ErrorCode::InvalidDigest,
284 "malformed x-amz-checksum-crc64nvme",
285 )
286 })?;
287 if want.len() != 8 {
288 return Err(S3Error::with_message(
289 S3ErrorCode::InvalidDigest,
290 "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291 ));
292 }
293 let got = crc64_nvme(body).to_be_bytes();
294 if got != want.as_slice() {
295 return Err(bad("x-amz-checksum-crc64nvme"));
296 }
297 }
298 Ok(())
299}
300
/// v0.8.12 #128 (MED-C): CRC-64/NVME (AWS S3 `x-amz-checksum-crc64nvme`).
///
/// Parameters per the NVMe spec: polynomial `0xad93d23594c93659`, initial
/// value `0xffffffffffffffff`, reflected input and output, final XOR
/// `0xffffffffffffffff`. Returns the checksum of `bytes`; the empty slice
/// yields `0`.
///
/// The 256-entry lookup table depends only on the polynomial, so it is
/// produced by a `const fn` and stored in a `static`: it is evaluated at
/// compile time and needs no lazy initialisation or synchronisation at
/// runtime.
fn crc64_nvme(bytes: &[u8]) -> u64 {
    // Reflected polynomial (bit-reverse of 0xad93d23594c93659).
    const POLY_REFLECTED: u64 = 0x9a6c_9329_ac4b_c9b5;

    // Standard reflected (LSB-first) table: entry `i` is the CRC state after
    // feeding the single byte `i` into a zero register.
    const fn build_table() -> [u64; 256] {
        let mut table = [0u64; 256];
        let mut i = 0usize;
        // `while` instead of `for`: iterator-based loops are not available
        // in `const fn`.
        while i < 256 {
            let mut entry = i as u64;
            let mut bit = 0;
            while bit < 8 {
                entry = if entry & 1 != 0 {
                    (entry >> 1) ^ POLY_REFLECTED
                } else {
                    entry >> 1
                };
                bit += 1;
            }
            table[i] = entry;
            i += 1;
        }
        table
    }

    static TABLE: [u64; 256] = build_table();

    let crc = bytes.iter().fold(!0u64, |crc, &byte| {
        // `crc as u8` intentionally keeps only the low byte of the register.
        let idx = usize::from((crc as u8) ^ byte);
        (crc >> 8) ^ TABLE[idx]
    });
    // Final XOR with all-ones.
    !crc
}
337
338/// v0.4 #20: captured at the start of a handler, before the request is
339/// consumed by the backend call, so the matching `record_access` at
340/// end-of-request can fill in the structured access log entry.
///
/// NOTE(review): the code that populates these fields is not part of this
/// excerpt; the per-field notes below are inferred from the field names
/// and should be confirmed against the capture site.
341struct AccessLogPreamble {
    // Presumably the client address, when one could be determined.
342 remote_ip: Option<String>,
    // Presumably the authenticated principal, when the request had one.
343 requester: Option<String>,
    // Presumably the request URI as received; the only non-optional field.
344 request_uri: String,
    // Presumably the `User-Agent` header value, when present.
345 user_agent: Option<String>,
346}
347
/// S3 service front-end that wraps a backend `B` together with the codec
/// registry / dispatcher and a set of optional per-feature managers. Every
/// optional field starts out disabled in `S4Service::new` and is switched
/// on through the matching `with_*` builder.
348pub struct S4Service<B: S3> {
349 /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
350 /// dispatcher can clone it into a detached `tokio::spawn` task
351 /// (Arc::clone is cheap; backend trait methods take `&self` so no
352 /// other handler is affected by the indirection).
353 backend: Arc<B>,
    /// Shared codec registry. Per the module docs it performs the
    /// compression on PUT and the manifest-directed decompression on GET.
354 registry: Arc<CodecRegistry>,
    /// Shared codec dispatcher. Per the module docs it picks the codec for
    /// a PUT from a sample of the head of the body.
355 dispatcher: Arc<dyn CodecDispatcher>,
    /// Upper bound on a request body, in bytes. `new` initialises it to
    /// `DEFAULT_MAX_BODY_BYTES`. NOTE(review): the enforcement site is not
    /// part of this excerpt.
356 max_body_bytes: usize,
    /// Optional shared policy handle (`None` after `new`). NOTE(review):
    /// presumably the policy evaluated for incoming requests — confirm in
    /// `crate::policy`.
357 policy: Option<crate::policy::SharedPolicy>,
358 /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
359 /// to `true` when the listener is wrapped in TLS (or ACME), so policies
360 /// gating "deny if not over TLS" can do their job. Defaults to `false`
361 /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
362 secure_transport: bool,
363 /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
364 rate_limits: Option<crate::rate_limit::SharedRateLimits>,
365 /// v0.4 #20: optional S3-style access log emitter.
366 access_log: Option<crate::access_log::SharedAccessLog>,
367 /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
368 /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
369 /// (with the keyring's active key id) after the compress + framing
370 /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
371 /// frame parsing. A `with_sse_key(...)` call wraps the supplied
372 /// key in a 1-slot keyring so single-key (v0.4) operators get the
373 /// same behaviour they had before, just on the v2 frame.
374 sse_keyring: Option<crate::sse::SharedSseKeyring>,
375 /// v0.5 #34: optional first-class versioning state machine. When
376 /// `Some(...)`, S4-server itself owns the per-bucket versioning
377 /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
378 /// list_object_versions / get_bucket_versioning /
379 /// put_bucket_versioning handlers consult the manager instead of
380 /// passing through. When `None` (default), the legacy
381 /// backend-passthrough behaviour applies so existing v0.4
382 /// deployments are unaffected until they explicitly call
383 /// `with_versioning(...)`.
384 versioning: Option<Arc<crate::versioning::VersioningManager>>,
385 /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
386 /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
387 /// generate a fresh DEK via the backend, encrypt the body with it
388 /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
389 /// S4E4 unwrap the DEK through the same backend before decrypt.
390 /// `kms_default_key_id` is used when the request omits an explicit
391 /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
392 /// bucket-default behaviour).
393 kms: Option<Arc<dyn crate::kms::KmsBackend>>,
394 kms_default_key_id: Option<String>,
395 /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
396 /// `Some(...)`, `delete_object` and overwrite-style `put_object`
397 /// consult the manager and refuse the operation with HTTP 403
398 /// `AccessDenied` while the object is locked (Compliance until
399 /// expiry, Governance unless the bypass header is set, or any time
400 /// a legal hold is on). PUT also auto-applies the bucket-default
401 /// retention to brand-new objects when configured. When `None`
402 /// (default), the legacy backend-passthrough behaviour applies, so
403 /// existing v0.4 deployments are unaffected until they explicitly
404 /// call `with_object_lock(...)`.
405 object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
406 /// v0.6 #38: optional first-class CORS bucket configuration manager.
407 /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
408 /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
409 /// consult the manager instead of passing through to the backend.
410 /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
411 /// style preflight matching through the same store; the actual HTTP
412 /// OPTIONS routing wire-up at the listener level is a follow-up
413 /// (s3s framework does not surface OPTIONS as a typed handler).
414 cors: Option<Arc<crate::cors::CorsManager>>,
415 /// v0.6 #36: optional first-class S3 Inventory manager. When
416 /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
417 /// configurations and `put_bucket_inventory_configuration` /
418 /// `get_bucket_inventory_configuration` /
419 /// `list_bucket_inventory_configurations` /
420 /// `delete_bucket_inventory_configuration` consult the manager
421 /// instead of passing through to the backend. The actual periodic
422 /// CSV emission is driven by a tokio task in `main.rs` that calls
423 /// `InventoryManager::run_once_for_test` on a fixed cadence; the
424 /// service handlers below only deal with config-level CRUD.
425 inventory: Option<Arc<crate::inventory::InventoryManager>>,
426 /// v0.6 #35: optional first-class S3 bucket-notification manager.
427 /// When `Some(...)`, S4-server itself owns per-bucket notification
428 /// configurations and `put_bucket_notification_configuration` /
429 /// `get_bucket_notification_configuration` consult the manager
430 /// instead of passing through to the backend. Successful PUT /
431 /// DELETE handlers fire matching destinations on a detached tokio
432 /// task (best-effort; see `crate::notifications::dispatch_event`).
433 notifications: Option<Arc<crate::notifications::NotificationManager>>,
434 /// v0.6 #37: optional first-class S3 Lifecycle configuration
435 /// manager. When `Some(...)`, S4-server itself owns per-bucket
436 /// lifecycle rules and `put_bucket_lifecycle_configuration` /
437 /// `get_bucket_lifecycle_configuration` /
438 /// `delete_bucket_lifecycle` consult the manager instead of
439 /// passing through to the backend. The actual background scanner
440 /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
441 /// rule) is a v0.7+ follow-up; the test path
442 /// `S4Service::run_lifecycle_once_for_test` exercises the
443 /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
444 /// the configuration-management half without putting a
445 /// half-wired bucket-walk in front of users.
446 lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
447 /// v0.6 #39: optional first-class object + bucket Tagging manager.
448 /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
449 /// per-bucket tag state — `PutObjectTagging` /
450 /// `GetObjectTagging` / `DeleteObjectTagging` /
451 /// `PutBucketTagging` / `GetBucketTagging` /
452 /// `DeleteBucketTagging` route through the manager (replacing the
453 /// previous backend-passthrough behaviour). `put_object` also
454 /// pre-parses the `x-amz-tagging` header / `Tagging` input field
455 /// so the IAM policy evaluator can gate on
456 /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
457 /// On a successful PUT the parsed tags are persisted; on a
458 /// successful DELETE the matching tag entry is dropped.
459 tagging: Option<Arc<crate::tagging::TagManager>>,
460 /// v0.6 #40: optional first-class cross-bucket replication manager.
461 /// When `Some(...)`, S4-server itself owns per-bucket replication
462 /// rules; `PutBucketReplication` / `GetBucketReplication` /
463 /// `DeleteBucketReplication` route through the manager (replacing
464 /// the previous backend-passthrough behaviour). On every successful
465 /// `put_object` the manager's rule list is consulted; the
466 /// highest-priority matching enabled rule wins, the per-key status
467 /// is recorded as `Pending`, and the source body and metadata are
468 /// handed to a detached tokio task that PUTs to the destination
469 /// bucket through the same backend. The replica is stamped with
470 /// `x-amz-replication-status: REPLICA` in its metadata; the
471 /// source-side status is updated to `Completed` on success or
472 /// `Failed` after the 3-attempt retry budget is exhausted (drop
473 /// counter bumps in either-side case so dashboards see the loss).
474 /// `head_object` / `get_object` echo the recorded status back as
475 /// `x-amz-replication-status` so consumers can poll progress.
476 /// Limited to single-instance (same `S4Service`) replication; true
477 /// cross-region (multi-instance) is a v0.7+ follow-up.
478 replication: Option<Arc<crate::replication::ReplicationManager>>,
479 /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
480 /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
481 /// request against a bucket whose MFA-Delete state is `Enabled`
482 /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
483 /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
484 /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
485 /// deployments are unaffected until they explicitly call
486 /// `with_mfa_delete(...)`.
487 mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
488 /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
489 /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
490 /// or be matched against a configured server-managed keyring/KMS).
491 /// Set by `--compliance-mode strict` after the boot-time
492 /// prerequisite check passes.
493 compliance_strict: bool,
494 /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
495 /// gate. When `Some(...)`, the listener-side middleware (see
496 /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
497 /// request and short-circuits SigV4a-signed ones — verifying the
498 /// signature against the credential store and returning 403
499 /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
500 /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
501 /// `None`, the middleware is a no-op so the existing SigV4 path is
502 /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
503 sigv4a_gate: Option<Arc<SigV4aGate>>,
504 /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
505 /// SSE / Tagging / Object-Lock context captured at
506 /// `CreateMultipartUpload` time through to `UploadPart` /
507 /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
508 /// store is gateway-internal and idle when no multipart is in
509 /// flight. See [`crate::multipart_state`] for rationale.
510 multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
511 /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
512 /// path. `0` (default) → use the legacy buffered S4E2 path
513 /// (whole-body AES-GCM tag, GET buffers + verifies before
514 /// emitting). Non-zero → use the chunked S4E5 frame so GET can
515 /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
516 /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
517 /// (chunked variants tracked in a follow-up issue).
518 sse_chunk_size: usize,
519 /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
520 /// replication dispatcher in [`Self::spawn_replication_if_matched`].
521 /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
522 /// rules × slow destination = O(10k) in-flight tokio tasks) could
523 /// exhaust process memory before the destination drains. Each
524 /// dispatcher spawn `acquire_owned`s one permit and holds it for the
525 /// lifetime of the destination PUT + status stamp; once the cap is
526 /// reached the dispatcher async-blocks on `acquire_owned()` so the
527 /// listener path itself never stalls — only the in-flight replica
528 /// queue depth is bounded. Default 1024 (operator-tunable via
529 /// `--replication-max-concurrent`).
530 replication_semaphore: Arc<tokio::sync::Semaphore>,
531 /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
532 /// `aws:SourceIp` Condition key only when the operator has
533 /// explicitly opted in via `--trust-x-forwarded-for`. Default
534 /// (`false`) makes the policy evaluator see `source_ip = None`
535 /// for incoming requests, so a public-internet client can no
536 /// longer spoof an internal CIDR by setting `X-Forwarded-For`
537 /// themselves. Operators behind a trusted reverse proxy that
538 /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
539 /// listening directly on the public internet leave it off and
540 /// gain a clear fail-closed default. A future release plumbs
541 /// the TCP peer address through the s3s service trait so we can
542 /// validate the forwarded header against a `--trusted-proxies`
543 /// CIDR list; until then the boolean opt-in closes the immediate
544 /// auth-bypass surface.
545 trust_x_forwarded_for: bool,
546 /// v0.8.17 G-4 (#161): migration escape hatch. When `true`,
547 /// the v0.8.16 F-13 reserved-name guard does NOT block GET /
548 /// HEAD / DELETE on keys ending in `.s4index` — the operator
549 /// is asserting that the deployment may carry pre-v0.8.15
550 /// user objects with that suffix and wants a window to
551 /// migrate them off. Writes (PUT / Copy / Create-Multipart)
552 /// stay blocked regardless of this flag, so attacker
553 /// injection from M-1 / F-13 stays closed. Default
554 /// `false` matches the v0.8.16 behaviour.
555 allow_legacy_reserved_key_reads: bool,
556}
557
558/// v0.8.17 G-2: which AWS error shape the reserved-name guard
559/// should emit on hit. `Read`-mode endpoints (GET / HEAD /
560/// Attributes / Tagging-read) return `NoSuchKey` — consistent
561/// with the listing filter hiding the sidecar. `Mutating`-mode
562/// endpoints (PUT / Copy / DELETE / Tagging-write / ACL-write)
563/// return `InvalidObjectName` so the client sees the suffix is
564/// reserved by-design rather than coincidentally missing.
///
/// NOTE(review): the mode-to-error mapping itself lives in the guard,
/// which is not part of this excerpt; the variant notes below restate
/// the description above.
565#[derive(Clone, Copy, Debug)]
566enum ReservedKeyMode {
    /// Read-style endpoint: a guard hit is reported as `NoSuchKey`.
567 Read,
    /// Mutating endpoint: a guard hit is reported as `InvalidObjectName`.
568 Mutating,
569}
570
571impl<B: S3> S4Service<B> {
572 /// API upper limit for a single (non-multipart) AWS S3 PUT: 5 GiB.
    ///
    /// NOTE(review): 5 GiB does not fit in a 32-bit `usize`, so this
    /// constant presumably assumes a 64-bit target — confirm the supported
    /// platforms.
573 pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;
574
575 /// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
576 /// replication dispatcher tasks. See the `replication_semaphore`
577 /// field doc for the rationale + override path.
578 pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
579
580 pub fn new(
581 backend: B,
582 registry: Arc<CodecRegistry>,
583 dispatcher: Arc<dyn CodecDispatcher>,
584 ) -> Self {
585 Self {
586 backend: Arc::new(backend),
587 registry,
588 dispatcher,
589 max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
590 policy: None,
591 secure_transport: false,
592 rate_limits: None,
593 access_log: None,
594 sse_keyring: None,
595 versioning: None,
596 kms: None,
597 kms_default_key_id: None,
598 object_lock: None,
599 cors: None,
600 inventory: None,
601 notifications: None,
602 lifecycle: None,
603 tagging: None,
604 replication: None,
605 mfa_delete: None,
606 compliance_strict: false,
607 sigv4a_gate: None,
608 multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
609 // v0.8 #52: chunked SSE-S4 disabled by default — opt
610 // in via `S4Service::with_sse_chunk_size(...)` /
611 // `--sse-chunk-size <BYTES>`. Default keeps the legacy
612 // S4E2 buffered path so existing deployments are
613 // bit-for-bit unchanged.
614 sse_chunk_size: 0,
615 // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
616 // replication tasks. Picked to be (a) ample headroom over a
617 // typical steady-state replication rate (the v0.8.3 #66
618 // status-sweep doc cites 1k keys/hour as a "steady" rate, so
619 // even a 100x burst lands well under 1024), (b) small enough
620 // that the worst-case memory pinned by stalled dispatchers
621 // — body bytes + metadata — stays bounded (1024 × 5 MiB
622 // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
623 // wider cross-region fan-out can override via
624 // `--replication-max-concurrent`.
625 replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
626 Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
627 )),
628 // v0.8.11 CRIT-4: default fail-closed — ignore client-
629 // supplied `X-Forwarded-For` until the operator opts in
630 // through `with_trust_x_forwarded_for(true)`.
631 trust_x_forwarded_for: false,
632 // v0.8.17 G-4: closed by default; opt in via
633 // `with_allow_legacy_reserved_key_reads(true)` for the
634 // migration window only.
635 allow_legacy_reserved_key_reads: false,
636 }
637 }
638
639 /// v0.8.17 G-4: opt in to a migration window where GET / HEAD /
640 /// DELETE on `<key>.s4index` are allowed even though new
641 /// writes against that suffix stay rejected. Used by operators
642 /// upgrading from pre-v0.8.15 deployments that may carry
643 /// legacy user-owned objects with the now-reserved suffix.
644 /// Defaults to `false`; turn off again once the legacy data
645 /// has been migrated.
646 #[must_use]
647 pub fn with_allow_legacy_reserved_key_reads(mut self, on: bool) -> Self {
648 self.allow_legacy_reserved_key_reads = on;
649 self
650 }
651
652 /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
653 /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
654 /// when the gateway sits behind a trusted reverse proxy that
655 /// strips (or rewrites) any client-supplied value. When left
656 /// off (default), the policy evaluator sees `source_ip = None`
657 /// regardless of what the client sends — closing the
658 /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
659 /// bypass.
660 #[must_use]
661 pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
662 self.trust_x_forwarded_for = on;
663 self
664 }
665
666 /// v0.7 #47: attach the SigV4a verify gate. Once set, the
667 /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
668 /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
669 /// verifying it against the supplied credential store and
670 /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
671 /// are unaffected. When the gate is unset (default), the
672 /// middleware skips entirely so existing SigV4 deployments keep
673 /// working.
674 #[must_use]
675 pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
676 self.sigv4a_gate = Some(gate);
677 self
678 }
679
680 /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
681 /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
682 /// consumes the `S4Service` (the listener-side middleware needs
683 /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
684 /// algorithm tokens with "unknown algorithm" — match has to
685 /// happen at the hyper layer instead).
686 #[must_use]
687 pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
688 self.sigv4a_gate.as_ref()
689 }
690
691 /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
692 /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
693 /// the `S4Service`. The background `sweep_stale` task in `main.rs`
694 /// holds this `Arc` and ticks once an hour to drop abandoned
695 /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
696 #[must_use]
697 pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
698 &self.multipart_state
699 }
700
701 /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
702 /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
703 /// through the manager (instead of forwarding to the backend),
704 /// and `put_object`'s `x-amz-tagging` parse path becomes the
705 /// source of `s3:RequestObjectTag/<key>` for the IAM policy
706 /// evaluator. The manager itself is shared via `Arc`.
707 #[must_use]
708 pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
709 self.tagging = Some(mgr);
710 self
711 }
712
713 /// v0.6 #39: borrow the attached tagging manager (test /
714 /// introspection — the snapshotter in `main.rs`, when wired,
715 /// will keep its own `Arc` clone).
716 #[must_use]
717 pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
718 self.tagging.as_ref()
719 }
720
721 /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
722 /// `put_bucket_inventory_configuration` /
723 /// `get_bucket_inventory_configuration` /
724 /// `list_bucket_inventory_configurations` /
725 /// `delete_bucket_inventory_configuration` route through the
726 /// manager. The actual periodic CSV / manifest emission is
727 /// orchestrated by a tokio task started in `main.rs`; the manager
728 /// itself is shared between the handler and the scheduler via
729 /// `Arc`.
730 #[must_use]
731 pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
732 self.inventory = Some(mgr);
733 self
734 }
735
736 /// v0.6 #36: borrow the attached inventory manager (test /
737 /// introspection — the background scheduler in `main.rs` keeps its
738 /// own `Arc` clone, so this accessor is for the test path that
739 /// invokes `run_once_for_test` directly).
740 #[must_use]
741 pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
742 self.inventory.as_ref()
743 }
744
745 /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
746 /// manager. Once set, `put_bucket_lifecycle_configuration` /
747 /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
748 /// route through the manager (replacing the previous backend-
749 /// passthrough behaviour). The actual periodic scanner that walks
750 /// the source bucket and invokes Expiration / Transition /
751 /// NoncurrentExpiration actions is a v0.7+ follow-up — see
752 /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
753 /// path that exercises the evaluator end-to-end.
754 #[must_use]
755 pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
756 self.lifecycle = Some(mgr);
757 self
758 }
759
760 /// v0.6 #37: borrow the attached lifecycle manager (test /
761 /// introspection — the background scheduler in `main.rs` keeps its
762 /// own `Arc` clone, so this accessor is for the test path that
763 /// invokes the evaluator directly).
764 #[must_use]
765 pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
766 self.lifecycle.as_ref()
767 }
768
769 /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
770 /// against a caller-provided list of `(key, age, size, tags)` tuples
771 /// and returns the `(key, action)` pairs that should fire. The actual
772 /// backend invocation (S3.delete_object / metadata rewrite) is left
773 /// to the caller — the unit + E2E tests use this to verify the
774 /// evaluator without spawning the (deferred) background scanner.
775 /// Returns an empty `Vec` when no lifecycle manager is attached or
776 /// no rule matches.
777 #[must_use]
778 pub fn run_lifecycle_once_for_test(
779 &self,
780 bucket: &str,
781 objects: &[crate::lifecycle::EvaluateBatchEntry],
782 ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
783 let Some(mgr) = self.lifecycle.as_ref() else {
784 return Vec::new();
785 };
786 crate::lifecycle::evaluate_batch(mgr, bucket, objects)
787 }
788
789 /// v0.6 #35: attach the in-memory bucket-notification manager. Once
790 /// set, `put_bucket_notification_configuration` /
791 /// `get_bucket_notification_configuration` route through the manager
792 /// (replacing the previous backend-passthrough behaviour); successful
793 /// `put_object` / `delete_object` calls fire matching destinations
794 /// on a detached tokio task via
795 /// `crate::notifications::dispatch_event` (best-effort, fire-and-
796 /// forget — failures bump the manager's `dropped_total` counter and
797 /// log at warn but do NOT fail the originating S3 request).
798 #[must_use]
799 pub fn with_notifications(
800 mut self,
801 mgr: Arc<crate::notifications::NotificationManager>,
802 ) -> Self {
803 self.notifications = Some(mgr);
804 self
805 }
806
807 /// v0.6 #35: borrow the attached notifications manager (test /
808 /// introspection — used by the metrics layer to read
809 /// `dropped_total`).
810 #[must_use]
811 pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
812 self.notifications.as_ref()
813 }
814
815 /// v0.6 #35: internal helper used by the DELETE handlers to fire a
816 /// matching notification on a detached tokio task. No-op when no
817 /// manager is attached or no rule on the bucket matches the given
818 /// (event, key) tuple.
819 fn fire_delete_notification(
820 &self,
821 bucket: &str,
822 key: &str,
823 event: crate::notifications::EventType,
824 version_id: Option<String>,
825 ) {
826 let Some(mgr) = self.notifications.as_ref() else {
827 return;
828 };
829 let dests = mgr.match_destinations(bucket, &event, key);
830 if dests.is_empty() {
831 return;
832 }
833 tokio::spawn(crate::notifications::dispatch_event(
834 Arc::clone(mgr),
835 bucket.to_owned(),
836 key.to_owned(),
837 event,
838 None,
839 None,
840 version_id,
841 format!("S4-{}", uuid::Uuid::new_v4()),
842 ));
843 }
844
845 /// v0.6 #40: attach the in-memory cross-bucket replication manager.
846 /// Once set, `put_bucket_replication` / `get_bucket_replication` /
847 /// `delete_bucket_replication` route through the manager (replacing
848 /// the previous backend-passthrough behaviour); a successful
849 /// `put_object` whose key matches an enabled rule fires a detached
850 /// tokio task that PUTs the same body + metadata to the rule's
851 /// destination bucket, stamping the replica with
852 /// `x-amz-replication-status: REPLICA`. Failures after the retry
853 /// budget bump the manager's `dropped_total` counter and are
854 /// surfaced in the `s4_replication_dropped_total` Prometheus
855 /// counter; successes bump `s4_replication_replicated_total`.
856 #[must_use]
857 pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
858 self.replication = Some(mgr);
859 self
860 }
861
862 /// v0.6 #40: borrow the attached replication manager (test /
863 /// introspection — used by the metrics layer to read
864 /// `dropped_total`).
865 #[must_use]
866 pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
867 self.replication.as_ref()
868 }
869
    /// v0.6 #40: internal helper used by the PUT handlers to fire a
    /// detached cross-bucket replication task. No-op when no manager
    /// is attached, the source backend PUT failed, or no rule on the
    /// source bucket matches the (key, tags) tuple. The `body` is the
    /// post-compression / post-encryption `Bytes` that was sent to
    /// the source backend (refcount-cloned), and `metadata` is the
    /// metadata map that already includes the manifest /
    /// `s4-encrypted` markers — the replica decodes through the same
    /// path. The destination PUT runs through `Arc<B>::put_object`.
    ///
    /// This function itself never awaits: it does the rule match and
    /// the eager `Pending` stamp synchronously, then hands everything
    /// else to a `tokio::spawn`ed task whose handle is dropped.
    ///
    /// ## v0.8.2 #61: generation token + shadow-key destination
    ///
    /// `pending_version` is the source-side `PutOutcome` minted by the
    /// caller's versioning branch (or `None` for unversioned /
    /// suspended buckets). When `pending_version.versioned_response`
    /// is `true`, the dispatcher writes the destination under the same
    /// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
    /// destination's version chain receives the new version the same
    /// way `?versionId=` GET resolves it. Closes audit C-1.
    ///
    /// The dispatcher also mints a fresh `generation` token before
    /// spawning, threaded through to
    /// [`crate::replication::replicate_object`]. Closes audit C-3 — a
    /// stale retry of an older PUT can no longer overwrite the
    /// destination's newer bytes because the CAS guard sees the higher
    /// stored generation and drops its destination write.
    ///
    /// ## Asymmetric versioning policy (out of scope)
    ///
    /// We assume source + destination buckets share the same
    /// versioning policy (both Enabled or both Suspended /
    /// Unversioned). Cross-bucket policy queries would require a
    /// backend round-trip per replication, which is not worth it for
    /// the single-instance scope. Operators who configure asymmetric
    /// versioning will see destination-side `?versionId=` lookups
    /// miss — documented as out-of-scope until a future per-rule
    /// `destination_versioning_policy` knob lands.
    // 8 args is the post-#61 shape: replication needs the
    // source bucket+key, the canonical tag set for rule-matching,
    // the post-codec body+metadata for the destination PUT, the
    // backend-success gate, and the pending version-id for the
    // shadow-key destination override. A shape struct would just
    // split the (single) call site so opt for the inline form.
    #[allow(clippy::too_many_arguments)]
    fn spawn_replication_if_matched(
        &self,
        source_bucket: &str,
        source_key: &str,
        request_tags: &Option<crate::tagging::TagSet>,
        body: &bytes::Bytes,
        metadata: &Option<std::collections::HashMap<String, String>>,
        backend_ok: bool,
        pending_version: Option<&crate::versioning::PutOutcome>,
    ) where
        B: Send + Sync + 'static,
    {
        // Nothing to replicate if the source PUT itself did not land.
        if !backend_ok {
            return;
        }
        let Some(mgr) = self.replication.as_ref() else {
            return;
        };
        // Pull the request's tags into the (k, v) shape the matcher
        // expects. The tagging manager would have the canonical
        // post-PUT view but at this point in the pipeline it's
        // already been written above; for the rule-match decision
        // the request's tags are sufficient (= the tags this PUT
        // applies, S3 PutObject is full-replace on tags).
        let object_tags: Vec<(String, String)> = request_tags
            .as_ref()
            .map(|ts| ts.iter().cloned().collect())
            .unwrap_or_default();
        let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
            return;
        };
        // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
        // Pending stamp so the stamp itself carries the right
        // generation (the CAS in `record_status_if_newer` would
        // otherwise see a `generation=0` Pending and accept any
        // stale retry).
        let generation = mgr.next_generation();
        // Eagerly mark the source key as Pending so a HEAD between
        // the source PUT returning and the spawned task completing
        // surfaces the in-flight state. CAS-guarded so a slower
        // older PUT can't downgrade a newer Completed back to Pending.
        // The return value is deliberately discarded: this stamp is
        // advisory and the spawned task proceeds either way.
        let _ = mgr.record_status_if_newer(
            source_bucket,
            source_key,
            generation,
            crate::replication::ReplicationStatus::Pending,
        );
        // v0.8.2 #61: derive the destination storage key. For a
        // versioning-Enabled source the destination receives the
        // same shadow-key path so a `?versionId=<vid>` GET on the
        // destination resolves through the same lookup the source
        // uses. Suspended / Unversioned sources keep the logical
        // key (= `None` override = dispatcher uses `source_key`).
        let destination_key_override = pending_version
            .filter(|pv| pv.versioned_response)
            .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
        // v0.8.3 #68 (audit M-1): capture the source object's Object
        // Lock state so the dispatcher can decorate the destination
        // PUT with the matching AWS-wire lock headers. Without this,
        // a Compliance / Governance / legal-hold protected source
        // would replicate to a destination where DELETE succeeds
        // (the WORM posture would only hold on the source).
        let source_lock_state = self
            .object_lock
            .as_ref()
            .and_then(|mgr| mgr.get(source_bucket, source_key));
        // v0.8.3 #68: hand the destination-side ObjectLockManager to
        // the dispatcher closure so we can persist the propagated
        // lock state on successful destination PUT (the destination
        // PUT below bypasses S4Service::put_object — we drive the
        // backend directly — so the explicit_lock_mode commit block
        // in put_object never fires for replicas. We replay it here
        // against the destination key.)
        let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
        // Owned copies for the detached task: the `async move` block
        // below cannot borrow from `self` or from this call's
        // arguments, so everything it touches is cloned up front.
        let mgr_cl = Arc::clone(mgr);
        let backend = Arc::clone(&self.backend);
        let body_cl = body.clone();
        let metadata_cl = metadata.clone();
        let source_bucket_cl = source_bucket.to_owned();
        let source_key_cl = source_key.to_owned();
        // Two owners are needed for the lock state: this clone is
        // captured by `do_put`, the original is moved into
        // `replicate_object`.
        let source_lock_state_for_closure = source_lock_state.clone();
        let source_bucket_for_warn = source_bucket.to_owned();
        // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
        // depth. Acquire happens INSIDE the spawned task (not on the
        // listener path) so a saturated semaphore back-pressures the
        // dispatcher pool without stalling the source PUT response —
        // the source has already returned 200 to the client by the time
        // the spawn body runs. A failed `acquire_owned` only happens
        // when the semaphore is closed (we never close it, so the
        // logged-and-skipped fallback is unreachable in practice).
        let semaphore = Arc::clone(&self.replication_semaphore);
        tokio::spawn(async move {
            // `_permit` stays bound until the end of this async block,
            // so the slot is released only once the replication attempt
            // below has finished (or its panic has been caught).
            let _permit = match semaphore.acquire_owned().await {
                Ok(p) => p,
                Err(e) => {
                    tracing::warn!(
                        bucket = %source_bucket_cl,
                        key = %source_key_cl,
                        "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
                    );
                    return;
                }
            };
            // Destination writer handed to `replicate_object`. Each
            // call re-clones its captures so the closure can be
            // invoked more than once.
            let do_put = move |dest_bucket: String,
                               dest_key: String,
                               dest_body: bytes::Bytes,
                               dest_meta: Option<std::collections::HashMap<String, String>>| {
                let backend = Arc::clone(&backend);
                let dest_lock_mgr = dest_lock_mgr.clone();
                let lock_state = source_lock_state_for_closure.clone();
                let warn_src = source_bucket_for_warn.clone();
                async move {
                    // Synthetic request straight to the backend: only
                    // bucket, key, body and metadata are populated; no
                    // headers, credentials, region or service are set.
                    let req = S3Request {
                        input: PutObjectInput {
                            bucket: dest_bucket.clone(),
                            key: dest_key.clone(),
                            body: Some(bytes_to_blob(dest_body)),
                            metadata: dest_meta,
                            ..Default::default()
                        },
                        method: http::Method::PUT,
                        uri: "/".parse().unwrap(),
                        headers: http::HeaderMap::new(),
                        extensions: http::Extensions::new(),
                        credentials: None,
                        region: None,
                        service: None,
                        trailing_headers: None,
                    };
                    // The response payload is discarded; only
                    // success/failure is reported, with the error
                    // flattened to a `String`.
                    let put_result = backend
                        .put_object(req)
                        .await
                        .map(|_| ())
                        .map_err(|e| format!("destination put_object: {e}"));
                    // v0.8.3 #68: on successful destination PUT,
                    // persist the propagated lock state into the
                    // destination's ObjectLockManager so a subsequent
                    // DELETE on the destination is refused. Three cases:
                    //   - PUT failed       → skip (no replica to protect)
                    //   - lock_state None  → nothing to propagate
                    //   - dest manager None (operator misconfig)
                    //                      → log warn-once + bump skip metric
                    if put_result.is_ok()
                        && let Some(state) = lock_state
                    {
                        match dest_lock_mgr {
                            Some(ref mgr) => {
                                mgr.set(&dest_bucket, &dest_key, state);
                            }
                            None => {
                                crate::replication::warn_lock_propagation_skipped(
                                    &warn_src,
                                    &dest_bucket,
                                );
                            }
                        }
                    }
                    put_result
                }
            };
            // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
            // `futures::FutureExt::catch_unwind` so a panic inside
            // `replicate_object` (or any of the user-supplied closures
            // it drives — `do_put`, the destination backend, the lock
            // manager) does NOT bubble out of the detached task as a
            // `JoinError` that no operator dashboard scrapes. Caught
            // panics bump `s4_dispatcher_panics_total{kind="replication"}`
            // + log at ERROR with the panic payload, so silent feature
            // degradation (= every replication PUT panicking and
            // dropping the replica without any visible signal) becomes
            // a first-class metric the operator can alert on.
            //
            // `AssertUnwindSafe` is required because the inner future
            // captures `Arc<...>` clones + a `do_put` closure that are
            // not `UnwindSafe` by default; the safety contract here is
            // "we don't continue using any of those captures after the
            // panic" which trivially holds (we drop them and return).
            use futures::FutureExt as _;
            let dispatcher_kind = "replication";
            let fut = crate::replication::replicate_object(
                rule,
                source_bucket_cl,
                source_key_cl,
                body_cl,
                metadata_cl,
                do_put,
                mgr_cl,
                generation,
                destination_key_override,
                source_lock_state,
            );
            if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
                // Panic payloads are type-erased: try `&'static str`
                // first, then `String`; anything else gets a
                // placeholder message.
                let panic_msg = panic
                    .downcast_ref::<&'static str>()
                    .copied()
                    .map(str::to_owned)
                    .or_else(|| panic.downcast_ref::<String>().cloned())
                    .unwrap_or_else(|| "(non-string panic payload)".to_owned());
                tracing::error!(
                    kind = dispatcher_kind,
                    panic_payload = %panic_msg,
                    "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
                );
                crate::metrics::record_dispatcher_panic(dispatcher_kind);
            }
        });
    }
1121
1122 /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1123 /// Once set, every DELETE / DELETE-version / delete-marker /
1124 /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1125 /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1126 /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1127 /// where MFA-Delete is `Disabled` (S3 default).
1128 #[must_use]
1129 pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1130 self.mfa_delete = Some(mgr);
1131 self
1132 }
1133
1134 /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1135 /// introspection — used by the snapshot path in `main.rs` to call
1136 /// `to_json` for restart-recoverable state).
1137 #[must_use]
1138 pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1139 self.mfa_delete.as_ref()
1140 }
1141
1142 /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1143 /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1144 /// route through the manager instead of forwarding to the backend,
1145 /// and [`Self::handle_preflight`] becomes useful for the (future)
1146 /// listener-side OPTIONS interceptor.
1147 #[must_use]
1148 pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1149 self.cors = Some(mgr);
1150 self
1151 }
1152
1153 /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1154 #[must_use]
1155 pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1156 self.cors.as_ref()
1157 }
1158
1159 /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1160 /// configured rules and, if a rule matches, return the headers that
1161 /// the (future) listener-side OPTIONS interceptor must put on the
1162 /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1163 /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1164 /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1165 ///
1166 /// Returns `None` when no manager is attached, no config is
1167 /// registered for the bucket, or no rule matches the (origin,
1168 /// method, headers) triple. The caller is responsible for turning
1169 /// `None` into the appropriate 403 response.
1170 ///
1171 /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1172 /// into the hyper-util listener path) is a follow-up — s3s does not
1173 /// surface OPTIONS as a typed S3 handler, so this method is
1174 /// currently call-able only from inside other handlers and tests.
1175 #[must_use]
1176 pub fn handle_preflight(
1177 &self,
1178 bucket: &str,
1179 origin: &str,
1180 method: &str,
1181 request_headers: &[String],
1182 ) -> Option<std::collections::HashMap<String, String>> {
1183 let mgr = self.cors.as_ref()?;
1184 let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1185 let mut h = std::collections::HashMap::new();
1186 // Echo the matched origin back. If the rule used "*" we still
1187 // echo "*" (S3 spec — the spec does not require us to echo the
1188 // *requesting* origin when the wildcard matched).
1189 let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1190 "*".to_string()
1191 } else {
1192 origin.to_string()
1193 };
1194 h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1195 h.insert(
1196 "Access-Control-Allow-Methods".to_string(),
1197 rule.allowed_methods.join(", "),
1198 );
1199 if !rule.allowed_headers.is_empty() {
1200 // For the Allow-Headers response, echo back the rule's
1201 // pattern list verbatim (S3 echoes the configured list,
1202 // including "*" if present). Browsers honour exact-match
1203 // rules.
1204 h.insert(
1205 "Access-Control-Allow-Headers".to_string(),
1206 rule.allowed_headers.join(", "),
1207 );
1208 }
1209 if let Some(secs) = rule.max_age_seconds {
1210 h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1211 }
1212 if !rule.expose_headers.is_empty() {
1213 h.insert(
1214 "Access-Control-Expose-Headers".to_string(),
1215 rule.expose_headers.join(", "),
1216 );
1217 }
1218 Some(h)
1219 }
1220
1221 /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1222 /// SSE indicator (server-side encryption header or SSE-C customer
1223 /// key); requests without one are rejected with 400 InvalidRequest.
1224 /// Boot-time prerequisite checking lives in the binary
1225 /// (`validate_compliance_mode`) so this flag is purely the runtime
1226 /// switch.
1227 #[must_use]
1228 pub fn with_compliance_strict(mut self, on: bool) -> Self {
1229 self.compliance_strict = on;
1230 self
1231 }
1232
1233 /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1234 /// manager. Once set, `delete_object` and overwrite-path
1235 /// `put_object` refuse operations on locked keys with HTTP 403
1236 /// `AccessDenied`; new PUTs to a bucket with a default retention
1237 /// policy auto-create per-object lock state.
1238 #[must_use]
1239 pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1240 self.object_lock = Some(mgr);
1241 self
1242 }
1243
1244 /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1245 /// the lifecycle scanner uses this to skip currently-locked objects
1246 /// before issuing `delete_object`, since an Object Lock always wins
1247 /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1248 /// shape of [`Self::lifecycle_manager`] /
1249 /// [`Self::tag_manager`] — purely additive accessor, no handler
1250 /// behaviour change.
1251 #[must_use]
1252 pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1253 self.object_lock.as_ref()
1254 }
1255
1256 /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1257 /// when a PUT requests SSE-KMS without naming a specific KMS key
1258 /// (operators set this to mirror AWS S3's bucket-default key).
1259 #[must_use]
1260 pub fn with_kms_backend(
1261 mut self,
1262 kms: Arc<dyn crate::kms::KmsBackend>,
1263 default_key_id: Option<String>,
1264 ) -> Self {
1265 self.kms = Some(kms);
1266 self.kms_default_key_id = default_key_id;
1267 self
1268 }
1269
1270 /// v0.5 #34: attach the first-class versioning state machine. Once
1271 /// set, this `S4Service` owns the per-bucket versioning state +
1272 /// per-(bucket, key) version chain; `put_object` / `get_object` /
1273 /// `delete_object` / `list_object_versions` /
1274 /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1275 /// manager instead of passing through to the backend. The backend
1276 /// is still used as the byte store: Suspended / Unversioned buckets
1277 /// keep using `<key>` directly (legacy), Enabled buckets redirect
1278 /// each version's bytes to a shadow key
1279 /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1280 /// PUTs to the same logical key.
1281 #[must_use]
1282 pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1283 self.versioning = Some(mgr);
1284 self
1285 }
1286
1287 /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1288 /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1289 /// in-memory state to the operator's `--versioning-state-file`
1290 /// without restarting the gateway. Mirrors the shape of
1291 /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1292 /// purely additive accessor, no handler behaviour change.
1293 #[must_use]
1294 pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1295 self.versioning.as_ref()
1296 }
1297
1298 /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1299 /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1300 /// CLI flag in `main.rs`. Operators running heavy cross-region
1301 /// fan-out may need to raise this; operators on memory-constrained
1302 /// hosts may need to lower it. The new value replaces the existing
1303 /// `Semaphore` (so calling this after dispatchers are already in
1304 /// flight is fine — the in-flight tasks hold permits from the old
1305 /// semaphore which is dropped when its last permit is released).
1306 /// A `max` of 0 would deadlock all replicas; the value is silently
1307 /// clamped to 1 instead.
1308 #[must_use]
1309 pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1310 let max = max.max(1);
1311 self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1312 self
1313 }
1314
1315 /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1316 /// concurrency permit pool. Tests inspect `available_permits()`
1317 /// after invoking `spawn_replication_if_matched` to verify the
1318 /// dispatcher actually `acquire_owned`s before kicking off the
1319 /// destination PUT.
1320 #[must_use]
1321 pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1322 &self.replication_semaphore
1323 }
1324
1325 /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1326 /// Internally wraps it in a 1-slot keyring with id=1 active, so
1327 /// new objects ride the v0.5 S4E2 frame while previously-written
1328 /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1329 /// fallback path. Operators wanting true rotation should call
1330 /// [`Self::with_sse_keyring`] instead.
1331 #[must_use]
1332 pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1333 let keyring = crate::sse::SseKeyring::new(1, key);
1334 self.sse_keyring = Some(std::sync::Arc::new(keyring));
1335 self
1336 }
1337
1338 /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1339 /// the active key (S4E2 frame stamped with that key's id); GET
1340 /// dispatches on the body's magic — S4E1 falls back to trying every
1341 /// key in the ring (active first) so v0.4 objects survive a
1342 /// migration; S4E2 looks up the explicit key_id from the header.
1343 #[must_use]
1344 pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1345 self.sse_keyring = Some(keyring);
1346 self
1347 }
1348
1349 /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1350 /// (so the matching GET can stream-decrypt chunk-by-chunk
1351 /// instead of buffering the entire body before tag verify).
1352 /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1353 /// disables the path and reverts to the legacy S4E2 buffered
1354 /// frame.
1355 ///
1356 /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1357 /// the chunked envelopes for those flows are a follow-up issue
1358 /// (the customer-key wire surface needs separate version
1359 /// negotiation).
1360 ///
1361 /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1362 /// not also set — the chunked path runs only on the SSE-S4
1363 /// branch of `put_object`.
1364 #[must_use]
1365 pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1366 self.sse_chunk_size = bytes;
1367 self
1368 }
1369
1370 /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1371 /// PUT / GET / DELETE / List handler emits one entry into the
1372 /// emitter's buffer; a background flusher (started separately, see
1373 /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1374 /// rotated `.log` files into the configured directory.
1375 #[must_use]
1376 pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1377 self.access_log = Some(log);
1378 self
1379 }
1380
1381 /// Capture the per-request access-log preamble before the request is
1382 /// consumed by the backend call. Returns `None` if no access logger
1383 /// is configured (cheap early-out so the handler doesn't pay the
1384 /// header-clone cost when access logging is off).
1385 fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1386 self.access_log.as_ref()?;
1387 Some(AccessLogPreamble {
1388 // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1389 // Recording a client-controllable header in the access log
1390 // would poison forensic queries; leave it `None` until the
1391 // operator declares X-Forwarded-For is set by a trusted
1392 // proxy.
1393 remote_ip: if self.trust_x_forwarded_for {
1394 req.headers
1395 .get("x-forwarded-for")
1396 .and_then(|v| v.to_str().ok())
1397 .and_then(|raw| raw.split(',').next())
1398 .map(|s| s.trim().to_owned())
1399 } else {
1400 None
1401 },
1402 requester: Self::principal_of(req).map(str::to_owned),
1403 request_uri: format!("{} {}", req.method, req.uri.path()),
1404 user_agent: req
1405 .headers
1406 .get("user-agent")
1407 .and_then(|v| v.to_str().ok())
1408 .map(str::to_owned),
1409 })
1410 }
1411
1412 /// Internal — called by handlers at end-of-request with a captured
1413 /// preamble. Best-effort: swallows the await fast (clones Arc +
1414 /// pushes), no error propagation back to the request path.
1415 #[allow(clippy::too_many_arguments)]
1416 async fn record_access(
1417 &self,
1418 preamble: Option<AccessLogPreamble>,
1419 operation: &'static str,
1420 bucket: &str,
1421 key: Option<&str>,
1422 http_status: u16,
1423 bytes_sent: u64,
1424 object_size: u64,
1425 total_time_ms: u64,
1426 error_code: Option<&str>,
1427 ) {
1428 let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1429 return;
1430 };
1431 log.record(crate::access_log::AccessLogEntry {
1432 time: std::time::SystemTime::now(),
1433 bucket: bucket.to_owned(),
1434 remote_ip: p.remote_ip,
1435 requester: p.requester,
1436 operation,
1437 key: key.map(str::to_owned),
1438 request_uri: p.request_uri,
1439 http_status,
1440 error_code: error_code.map(str::to_owned),
1441 bytes_sent,
1442 object_size,
1443 total_time_ms,
1444 user_agent: p.user_agent,
1445 })
1446 .await;
1447 }
1448
1449 /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1450 /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1451 /// throttle-checked before the policy gate; throttled requests return
1452 /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1453 /// `s4_rate_limit_throttled_total{principal,bucket}`.
1454 #[must_use]
1455 pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1456 self.rate_limits = Some(rl);
1457 self
1458 }
1459
1460 /// Helper used by request handlers to apply the rate limit. Returns
1461 /// `Ok(())` when allowed (or no rate limiter is configured), or a
1462 /// `SlowDown` S3Error otherwise.
1463 fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1464 let Some(rl) = self.rate_limits.as_ref() else {
1465 return Ok(());
1466 };
1467 let principal_id = Self::principal_of(req);
1468 if !rl.check(principal_id, bucket) {
1469 crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1470 return Err(S3Error::with_message(
1471 S3ErrorCode::SlowDown,
1472 format!("rate-limited: bucket={bucket}"),
1473 ));
1474 }
1475 Ok(())
1476 }
1477
1478 /// Tell the policy evaluator that the listener is reached over TLS
1479 /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1480 /// resolves to `true`. Defaults to `false`.
1481 #[must_use]
1482 pub fn with_secure_transport(mut self, on: bool) -> Self {
1483 self.secure_transport = on;
1484 self
1485 }
1486
1487 #[must_use]
1488 pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1489 self.max_body_bytes = n;
1490 self
1491 }
1492
1493 /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1494 /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1495 /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1496 /// When `None` (the default), no policy enforcement happens.
1497 #[must_use]
1498 pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1499 self.policy = Some(policy);
1500 self
1501 }
1502
1503 /// Pull the SigV4 access key id off the request's credentials, if any.
1504 /// Used as the `principal_id` for policy evaluation.
1505 fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1506 req.credentials.as_ref().map(|c| c.access_key.as_str())
1507 }
1508
1509 /// v0.8.17 G-2: shared reserved-name guard used by every per-object
1510 /// API handler. `mode` chooses the AWS error shape: `Mutating`
1511 /// (PUT / Copy / DELETE / Tagging-write) returns
1512 /// `InvalidObjectName`; `Read` (GET / HEAD / Attributes / Tagging-read)
1513 /// returns `NoSuchKey` so a curious client gets the same response
1514 /// the listing filter has been giving them since v0.8.12 (the
1515 /// sidecar is invisible to list).
1516 ///
1517 /// v0.8.17 G-4: when `--allow-legacy-reserved-key-reads` is set
1518 /// AND the call is a `Read`, the guard returns `Ok(())` so
1519 /// operators upgrading from pre-v0.8.15 deployments can still
1520 /// access (and migrate off) any user-owned `<key>.s4index`
1521 /// objects that landed before M-1 / F-13 closed the namespace.
1522 /// Mutating operations stay blocked regardless of the flag —
1523 /// the flag is a read-only migration aid, not an injection
1524 /// re-opener.
1525 fn check_not_reserved_key(&self, key: &str, mode: ReservedKeyMode) -> S3Result<()> {
1526 if !s4_codec::index::is_reserved_sidecar_key(key) {
1527 return Ok(());
1528 }
1529 if matches!(mode, ReservedKeyMode::Read) && self.allow_legacy_reserved_key_reads {
1530 return Ok(());
1531 }
1532 match mode {
1533 ReservedKeyMode::Read => Err(S3Error::with_message(
1534 S3ErrorCode::NoSuchKey,
1535 format!("object key {key:?} is reserved for S4 internal sidecars"),
1536 )),
1537 ReservedKeyMode::Mutating => {
1538 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
1539 .unwrap_or(S3ErrorCode::InvalidArgument);
1540 Err(S3Error::with_message(
1541 code,
1542 format!(
1543 "object key {key:?} is reserved (suffix `{}` is used for S4 internal \
1544 sidecars)",
1545 s4_codec::index::SIDECAR_SUFFIX,
1546 ),
1547 ))
1548 }
1549 }
1550 }
1551
1552 /// v0.3 #13: build the per-request policy context from the incoming
1553 /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1554 /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1555 /// production deployments are behind an LB / reverse proxy that sets
1556 /// this), `aws:CurrentTime` from the system clock, and
1557 /// `aws:SecureTransport` from the per-listener TLS flag.
1558 fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1559 let user_agent = req
1560 .headers
1561 .get("user-agent")
1562 .and_then(|v| v.to_str().ok())
1563 .map(str::to_owned);
1564 // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1565 // header. Trusting it unconditionally lets any public-internet
1566 // request claim it came from a trusted CIDR (e.g.
1567 // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1568 // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1569 // We now only consume the header when the operator has
1570 // declared "this gateway sits behind a trusted reverse proxy
1571 // that scrubs client-supplied values" via
1572 // `with_trust_x_forwarded_for(true)` /
1573 // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1574 // `None`, which fails closed for IP-allowlist Allow rules
1575 // and fails open for IP-blocklist Deny rules — operators
1576 // who need either case behind a public listener must opt in
1577 // or move the gate to the reverse proxy. The leftmost
1578 // comma-separated token is the originator per the
1579 // `X-Forwarded-For: client, proxy1, proxy2` convention.
1580 let source_ip = if self.trust_x_forwarded_for {
1581 req.headers
1582 .get("x-forwarded-for")
1583 .and_then(|v| v.to_str().ok())
1584 .and_then(|raw| raw.split(',').next())
1585 .and_then(|s| s.trim().parse().ok())
1586 } else {
1587 None
1588 };
1589 crate::policy::RequestContext {
1590 source_ip,
1591 user_agent,
1592 request_time: Some(std::time::SystemTime::now()),
1593 secure_transport: self.secure_transport,
1594 existing_object_tags: None,
1595 request_object_tags: None,
1596 extra: Default::default(),
1597 }
1598 }
1599
1600 /// Helper used by request handlers to enforce the optional policy.
1601 /// Returns `Ok(())` when allowed (or no policy is configured), or an
1602 /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1603 /// counter on deny.
1604 fn enforce_policy<I>(
1605 &self,
1606 req: &S3Request<I>,
1607 action: &'static str,
1608 bucket: &str,
1609 key: Option<&str>,
1610 ) -> S3Result<()> {
1611 self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1612 }
1613
1614 /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1615 /// caller plumb tag context (existing-on-object + on-request) into
1616 /// the policy evaluator. Both arguments default to `None`, in
1617 /// which case the resulting `RequestContext` is identical to
1618 /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1619 /// with tags this is a transparent no-op.
1620 fn enforce_policy_with_extra<I>(
1621 &self,
1622 req: &S3Request<I>,
1623 action: &'static str,
1624 bucket: &str,
1625 key: Option<&str>,
1626 request_tags: Option<&crate::tagging::TagSet>,
1627 existing_tags: Option<&crate::tagging::TagSet>,
1628 ) -> S3Result<()> {
1629 let Some(policy) = self.policy.as_ref() else {
1630 return Ok(());
1631 };
1632 let principal_id = Self::principal_of(req);
1633 let mut ctx = self.request_context(req);
1634 if let Some(t) = request_tags {
1635 ctx.request_object_tags = Some(t.clone());
1636 }
1637 if let Some(t) = existing_tags {
1638 ctx.existing_object_tags = Some(t.clone());
1639 }
1640 let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1641 if decision.allow {
1642 Ok(())
1643 } else {
1644 crate::metrics::record_policy_denial(action, bucket);
1645 tracing::info!(
1646 action,
1647 bucket,
1648 key = ?key,
1649 principal = ?principal_id,
1650 source_ip = ?ctx.source_ip,
1651 user_agent = ?ctx.user_agent,
1652 secure_transport = ctx.secure_transport,
1653 matched_sid = ?decision.matched_sid,
1654 effect = ?decision.matched_effect,
1655 "S4 policy denied request"
1656 );
1657 Err(S3Error::with_message(
1658 S3ErrorCode::AccessDenied,
1659 format!("denied by S4 policy: {action} on bucket={bucket}"),
1660 ))
1661 }
1662 }
1663
1664 /// テスト用: backend を取り戻す (test helper、production では使わない).
1665 /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1666 /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1667 /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1668 /// (test 用途専用 helper の caller 契約を維持)。
1669 pub fn into_backend(self) -> B {
1670 Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1671 panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1672 })
1673 }
1674
1675 /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1676 /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1677 async fn partial_range_get(
1678 &self,
1679 req: &S3Request<GetObjectInput>,
1680 plan: s4_codec::index::RangePlan,
1681 client_start: u64,
1682 client_end_exclusive: u64,
1683 total_original: u64,
1684 get_start: Instant,
1685 ) -> S3Result<S3Response<GetObjectOutput>> {
1686 // 必要 byte 範囲だけを backend に partial GET
1687 let backend_range = s3s::dto::Range::Int {
1688 first: plan.byte_start,
1689 last: Some(plan.byte_end_exclusive - 1),
1690 };
1691 let backend_input = GetObjectInput {
1692 bucket: req.input.bucket.clone(),
1693 key: req.input.key.clone(),
1694 range: Some(backend_range),
1695 ..Default::default()
1696 };
1697 let backend_req = S3Request {
1698 input: backend_input,
1699 method: req.method.clone(),
1700 uri: req.uri.clone(),
1701 headers: req.headers.clone(),
1702 extensions: http::Extensions::new(),
1703 credentials: req.credentials.clone(),
1704 region: req.region.clone(),
1705 service: req.service.clone(),
1706 trailing_headers: None,
1707 };
1708 let mut backend_resp = self.backend.get_object(backend_req).await?;
1709 let blob = backend_resp.output.body.take().ok_or_else(|| {
1710 S3Error::with_message(
1711 S3ErrorCode::InternalError,
1712 "backend partial GET returned empty body",
1713 )
1714 })?;
1715 let bytes = collect_blob(blob, self.max_body_bytes)
1716 .await
1717 .map_err(internal("collect partial body"))?;
1718
1719 // frame parse + decompress
1720 let mut combined = BytesMut::new();
1721 for frame in FrameIter::new(bytes) {
1722 let (header, payload) = frame.map_err(|e| {
1723 S3Error::with_message(
1724 S3ErrorCode::InternalError,
1725 format!("partial-range frame parse: {e}"),
1726 )
1727 })?;
1728 let chunk_manifest = ChunkManifest {
1729 codec: header.codec,
1730 original_size: header.original_size,
1731 compressed_size: header.compressed_size,
1732 crc32c: header.crc32c,
1733 };
1734 let decompressed = self
1735 .registry
1736 .decompress(payload, &chunk_manifest)
1737 .await
1738 .map_err(internal("partial-range decompress"))?;
1739 combined.extend_from_slice(&decompressed);
1740 }
1741 let combined = combined.freeze();
1742 let sliced = combined
1743 .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1744
1745 // response 組立て
1746 let returned_size = sliced.len() as u64;
1747 backend_resp.output.content_length = Some(returned_size as i64);
1748 backend_resp.output.content_range = Some(format!(
1749 "bytes {client_start}-{}/{total_original}",
1750 client_end_exclusive - 1
1751 ));
1752 backend_resp.output.checksum_crc32 = None;
1753 backend_resp.output.checksum_crc32c = None;
1754 backend_resp.output.checksum_crc64nvme = None;
1755 backend_resp.output.checksum_sha1 = None;
1756 backend_resp.output.checksum_sha256 = None;
1757 backend_resp.output.e_tag = None;
1758 backend_resp.output.body = Some(bytes_to_blob(sliced));
1759 backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1760
1761 let elapsed = get_start.elapsed();
1762 crate::metrics::record_get(
1763 "partial",
1764 plan.byte_end_exclusive - plan.byte_start,
1765 returned_size,
1766 elapsed.as_secs_f64(),
1767 true,
1768 );
1769 info!(
1770 op = "get_object",
1771 bucket = %req.input.bucket,
1772 key = %req.input.key,
1773 bytes_in = plan.byte_end_exclusive - plan.byte_start,
1774 bytes_out = returned_size,
1775 total_object_size = total_original,
1776 range = true,
1777 path = "sidecar-partial",
1778 latency_ms = elapsed.as_millis() as u64,
1779 "S4 partial Range GET via sidecar index"
1780 );
1781 Ok(backend_resp)
1782 }
1783
1784 /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
1785 /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
1786 /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
1787 async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
1788 let bytes = encode_index(index);
1789 let len = bytes.len() as i64;
1790 let sidecar = sidecar_key(key);
1791 // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
1792 // the (already legally-arbitrary) S3 key produces something we
1793 // cannot encode at all, drop the sidecar PUT (the GET path
1794 // falls back to a full read on a missing sidecar) instead of
1795 // panicking on `parse().unwrap()`.
1796 let uri = match safe_object_uri(bucket, &sidecar) {
1797 Ok(u) => u,
1798 Err(e) => {
1799 tracing::warn!(
1800 bucket,
1801 key,
1802 "S4 write_sidecar skipped (key not URI-encodable): {e}"
1803 );
1804 return;
1805 }
1806 };
1807 let put_input = PutObjectInput {
1808 bucket: bucket.into(),
1809 key: sidecar,
1810 body: Some(bytes_to_blob(bytes)),
1811 content_length: Some(len),
1812 content_type: Some("application/x-s4-index".into()),
1813 ..Default::default()
1814 };
1815 let put_req = S3Request {
1816 input: put_input,
1817 method: http::Method::PUT,
1818 uri,
1819 headers: http::HeaderMap::new(),
1820 extensions: http::Extensions::new(),
1821 credentials: None,
1822 region: None,
1823 service: None,
1824 trailing_headers: None,
1825 };
1826 if let Err(e) = self.backend.put_object(put_req).await {
1827 tracing::warn!(
1828 bucket,
1829 key,
1830 "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
1831 );
1832 }
1833 }
1834
1835 /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
1836 /// describes the current backend object before we trust its frame
1837 /// offsets for a partial Range GET. The sidecar carries the source
1838 /// `etag` and `compressed_size` that were observed at PUT time; we
1839 /// HEAD the backend object and compare.
1840 ///
1841 /// Decision matrix:
1842 /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
1843 /// that wasn't stamped) → return `true` (best-effort, preserves
1844 /// pre-v0.8.4 behaviour for existing on-disk sidecars).
1845 /// - HEAD fails → return `false` (we can't tell either way; full GET
1846 /// path will surface the real backend error to the client).
1847 /// - HEAD ETag matches → `true`.
1848 /// - HEAD ETag differs OR HEAD size differs from
1849 /// `source_compressed_size` → `false` (sidecar stale or attacker-
1850 /// written; fall back to full GET).
1851 async fn sidecar_version_binding_ok(
1852 &self,
1853 bucket: &str,
1854 key: &str,
1855 index: &FrameIndex,
1856 ) -> bool {
1857 let Some(ref expected_etag) = index.source_etag else {
1858 // Legacy sidecar without the v0.8.4 #73 H-2 binding —
1859 // back-compat: trust it (the partial fetch is the same
1860 // best-effort path that v0.8.3 and earlier shipped).
1861 return true;
1862 };
1863 let head_input = HeadObjectInput {
1864 bucket: bucket.into(),
1865 key: key.into(),
1866 ..Default::default()
1867 };
1868 let uri = match safe_object_uri(bucket, key) {
1869 Ok(u) => u,
1870 Err(_) => return false,
1871 };
1872 let head_req = S3Request {
1873 input: head_input,
1874 method: http::Method::HEAD,
1875 uri,
1876 headers: http::HeaderMap::new(),
1877 extensions: http::Extensions::new(),
1878 credentials: None,
1879 region: None,
1880 service: None,
1881 trailing_headers: None,
1882 };
1883 let head = match self.backend.head_object(head_req).await {
1884 Ok(r) => r.output,
1885 Err(e) => {
1886 tracing::debug!(
1887 bucket,
1888 key,
1889 "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
1890 );
1891 return false;
1892 }
1893 };
1894 // ETag is a strong-vs-weak enum; we compare on the unwrapped string
1895 // form (matches what the PUT path stamped — see below).
1896 let live_etag = head.e_tag.as_ref().map(|t| t.value());
1897 if live_etag != Some(expected_etag.as_str()) {
1898 tracing::debug!(
1899 bucket,
1900 key,
1901 "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
1902 expected_etag,
1903 live_etag,
1904 );
1905 return false;
1906 }
1907 if let Some(expected_size) = index.source_compressed_size
1908 && let Some(live_size) = head.content_length
1909 && live_size as u64 != expected_size
1910 {
1911 tracing::debug!(
1912 bucket,
1913 key,
1914 "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
1915 expected_size,
1916 live_size,
1917 );
1918 return false;
1919 }
1920 true
1921 }
1922
1923 /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
1924 async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
1925 let sidecar = sidecar_key(key);
1926 // v0.7 #49: same encode-or-bail treatment as write_sidecar.
1927 let uri = safe_object_uri(bucket, &sidecar).ok()?;
1928 let get_input = GetObjectInput {
1929 bucket: bucket.into(),
1930 key: sidecar,
1931 ..Default::default()
1932 };
1933 let get_req = S3Request {
1934 input: get_input,
1935 method: http::Method::GET,
1936 uri,
1937 headers: http::HeaderMap::new(),
1938 extensions: http::Extensions::new(),
1939 credentials: None,
1940 region: None,
1941 service: None,
1942 trailing_headers: None,
1943 };
1944 let resp = self.backend.get_object(get_req).await.ok()?;
1945 let blob = resp.output.body?;
1946 let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
1947 decode_index(bytes).ok()
1948 }
1949
1950 /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
1951 ///
1952 /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
1953 /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
1954 /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
1955 async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
1956 let mut out = BytesMut::new();
1957 // v0.8.15 H-h: cap the *aggregate* decoded output. Each
1958 // individual frame is already bounded by
1959 // `validate_decompress_manifest` (default 5 GiB per frame),
1960 // but a forged multi-frame body can declare many frames
1961 // each near the limit — without an object-level ceiling, a
1962 // single GET could pin tens of GiB of plaintext in
1963 // `BytesMut::extend_from_slice`. Use the gateway's
1964 // `max_body_bytes` (same cap that bounds PUT bodies) so a
1965 // GET can never produce more plaintext than a PUT can ever
1966 // legitimately have stored.
1967 let aggregate_cap = self.max_body_bytes;
1968 let mut produced: usize = 0;
1969 for frame in FrameIter::new(bytes) {
1970 let (header, payload) = frame.map_err(|e| {
1971 S3Error::with_message(
1972 S3ErrorCode::InternalError,
1973 format!("multipart frame parse: {e}"),
1974 )
1975 })?;
1976 let chunk_manifest = ChunkManifest {
1977 codec: header.codec,
1978 original_size: header.original_size,
1979 compressed_size: header.compressed_size,
1980 crc32c: header.crc32c,
1981 };
1982 // v0.8.15 H-h: pre-flight check on the declared
1983 // `original_size` so a forged manifest claiming a frame
1984 // that would push us past the cap is rejected before we
1985 // start decoding. Defence-in-depth alongside the
1986 // post-decode `produced` check below.
1987 if (produced as u64).saturating_add(header.original_size) > aggregate_cap as u64 {
1988 return Err(S3Error::with_message(
1989 S3ErrorCode::InternalError,
1990 format!(
1991 "multipart aggregate output exceeds cap: would reach \
1992 {produced_total} bytes after this frame, cap is {aggregate_cap}",
1993 produced_total = (produced as u64).saturating_add(header.original_size),
1994 ),
1995 ));
1996 }
1997 let decompressed = self
1998 .registry
1999 .decompress(payload, &chunk_manifest)
2000 .await
2001 .map_err(internal("multipart frame decompress"))?;
2002 produced = produced.saturating_add(decompressed.len());
2003 if produced > aggregate_cap {
2004 return Err(S3Error::with_message(
2005 S3ErrorCode::InternalError,
2006 format!(
2007 "multipart aggregate output exceeded cap: {produced} bytes \
2008 emitted, cap is {aggregate_cap}"
2009 ),
2010 ));
2011 }
2012 out.extend_from_slice(&decompressed);
2013 }
2014 Ok(out.freeze())
2015 }
2016}
2017
2018/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
2019/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
2020/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
2021/// reject the other variants for parity with AWS.
2022fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
2023 let rest = s
2024 .strip_prefix("bytes=")
2025 .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
2026 let (a, b) = rest
2027 .split_once('-')
2028 .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
2029 let first: u64 = a
2030 .parse()
2031 .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
2032 let last: u64 = b
2033 .parse()
2034 .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
2035 if last < first {
2036 return Err(format!("CopySourceRange last < first: {s:?}"));
2037 }
2038 Ok(s3s::dto::Range::Int {
2039 first,
2040 last: Some(last),
2041 })
2042}
2043
/// v0.5 #34: synthesize the backend storage key for a
/// (logical key, version-id) pair on an Enabled-versioning bucket.
///
/// The result is `<key>.__s4ver__/<version_id>`. The `.__s4ver__/`
/// infix was chosen because:
/// - it is not a substring of `.s4index` / `.s4ver` natural keys, so it
///   causes no false-positive listing-filter collisions;
/// - the directory-style separator keeps S3 console "browse by prefix"
///   UX intact (versions roll up under one virtual folder per object);
/// - it is human-readable in debug logs and `aws s3 ls`.
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST
/// filter keys containing `.__s4ver__/` from results so customers don't
/// see internal shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    [key, ".__s4ver__/", version_id].concat()
}
2060
/// Report whether `key` contains the `.__s4ver__/` marker that
/// [`versioned_shadow_key`] embeds. A plain substring scan, intended for
/// the list filters and the GET passthrough check.
fn is_versioning_shadow_key(key: &str) -> bool {
    const SHADOW_MARKER: &str = ".__s4ver__/";
    key.contains(SHADOW_MARKER)
}
2066
/// v0.6 #42: wall-clock seconds since the UNIX epoch, fed to
/// `mfa::check_mfa` so the TOTP verifier can match the client
/// authenticator app's view of "now".
///
/// Returns `0` if the system clock reads earlier than 1970, so the
/// verifier rejects instead of this function panicking.
fn current_unix_secs() -> u64 {
    let now = std::time::SystemTime::now();
    match now.duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
2078
2079/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
2080///
2081/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
2082/// (S3 spec for MFA Delete: every gating failure surfaces as
2083/// `AccessDenied`, not a separate `MFA*` code).
2084/// - `Malformed` → `400 InvalidRequest` (the request itself is
2085/// syntactically broken, not a permission issue).
2086fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
2087 match e {
2088 crate::mfa::MfaError::Missing => S3Error::with_message(
2089 S3ErrorCode::AccessDenied,
2090 "MFA token required for this operation",
2091 ),
2092 crate::mfa::MfaError::Malformed => {
2093 S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
2094 }
2095 crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
2096 S3ErrorCode::AccessDenied,
2097 "MFA serial does not match configured device",
2098 ),
2099 crate::mfa::MfaError::InvalidCode => {
2100 S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
2101 }
2102 }
2103}
2104
2105fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
2106 metadata
2107 .as_ref()
2108 .and_then(|m| m.get(META_MULTIPART))
2109 .map(|v| v == "true")
2110 .unwrap_or(false)
2111}
2112
/// Metadata key holding the codec name (written from `manifest.codec`).
const META_CODEC: &str = "s4-codec";
/// Metadata key holding the manifest's `original_size`, as a decimal string.
const META_ORIGINAL_SIZE: &str = "s4-original-size";
/// Metadata key holding the manifest's `compressed_size`, as a decimal string.
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
/// Metadata key holding the manifest's `crc32c`, as a decimal string.
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object that was written by Multipart Upload using the
/// per-part frame format. GET checks this flag to start the frame parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks a single-PUT object that was written in the S4F2
/// framed format. Older v0.1 single-PUT objects are raw compressed bytes
/// (no flag). GET checks this flag to route the object through the
/// framed path (the same `FrameIter` parse as multipart).
const META_FRAMED: &str = "s4-framed";
2124
2125fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2126 metadata
2127 .as_ref()
2128 .and_then(|m| m.get(META_FRAMED))
2129 .map(|v| v == "true")
2130 .unwrap_or(false)
2131}
2132
2133/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2134fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2135 metadata
2136 .as_ref()
2137 .and_then(|m| m.get("s4-encrypted"))
2138 .map(|v| v == "aes-256-gcm")
2139 .unwrap_or(false)
2140}
2141
2142/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2143/// contract is "all three or none" — partial sets are a 400.
2144///
2145/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2146/// no encryption), `Ok(Some(material))` on validated client key, and
2147/// `Err` for malformed or partial inputs.
2148fn extract_sse_c_material(
2149 algorithm: &Option<String>,
2150 key: &Option<String>,
2151 md5: &Option<String>,
2152) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2153 match (algorithm, key, md5) {
2154 (None, None, None) => Ok(None),
2155 (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2156 .map(Some)
2157 .map_err(sse_c_error_to_s3),
2158 _ => Err(S3Error::with_message(
2159 S3ErrorCode::InvalidRequest,
2160 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2161 )),
2162 }
2163}
2164
2165/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2166/// Returns the key-id to wrap under, falling back to the gateway default.
2167fn extract_kms_key_id(
2168 sse: &Option<ServerSideEncryption>,
2169 sse_kms_key_id: &Option<String>,
2170 gateway_default: Option<&str>,
2171) -> Option<String> {
2172 let asks_for_kms = sse
2173 .as_ref()
2174 .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2175 .unwrap_or(false);
2176 if !asks_for_kms {
2177 return None;
2178 }
2179 sse_kms_key_id
2180 .clone()
2181 .or_else(|| gateway_default.map(str::to_owned))
2182}
2183
2184/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2185/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2186/// transient KMS outage (503). Other variants are 500 InternalError.
2187fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2188 use crate::kms::KmsError as K;
2189 match e {
2190 K::KeyNotFound { key_id } => S3Error::with_message(
2191 S3ErrorCode::InvalidArgument,
2192 format!("KMS key not found: {key_id}"),
2193 ),
2194 K::BackendUnavailable { message } => S3Error::with_message(
2195 S3ErrorCode::ServiceUnavailable,
2196 format!("KMS backend unavailable: {message}"),
2197 ),
2198 other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2199 }
2200}
2201
2202/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2203/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2204/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2205fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2206 use crate::sse::SseError as E;
2207 match e {
2208 E::WrongCustomerKey => S3Error::with_message(
2209 S3ErrorCode::AccessDenied,
2210 "SSE-C key does not match the key used at PUT time",
2211 ),
2212 E::InvalidCustomerKey { reason } => {
2213 S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2214 }
2215 E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2216 S3ErrorCode::InvalidArgument,
2217 format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2218 ),
2219 E::CustomerKeyRequired => S3Error::with_message(
2220 S3ErrorCode::InvalidRequest,
2221 "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2222 ),
2223 E::CustomerKeyUnexpected => S3Error::with_message(
2224 S3ErrorCode::InvalidRequest,
2225 "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2226 ),
2227 other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2228 }
2229}
2230
2231fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2232 let m = metadata.as_ref()?;
2233 let codec = m
2234 .get(META_CODEC)
2235 .and_then(|s| s.parse::<CodecKind>().ok())?;
2236 let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2237 let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2238 let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2239 Some(ChunkManifest {
2240 codec,
2241 original_size,
2242 compressed_size,
2243 crc32c,
2244 })
2245}
2246
2247fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2248 let meta = metadata.get_or_insert_with(Default::default);
2249 meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2250 meta.insert(
2251 META_ORIGINAL_SIZE.into(),
2252 manifest.original_size.to_string(),
2253 );
2254 meta.insert(
2255 META_COMPRESSED_SIZE.into(),
2256 manifest.compressed_size.to_string(),
2257 );
2258 meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2259}
2260
2261fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2262 move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2263}
2264
2265/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2266/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2267/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2268/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2269/// message that includes the original error context.
2270fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2271 use crate::select::SelectError;
2272 match e {
2273 SelectError::Parse(msg) => S3Error::with_message(
2274 S3ErrorCode::InvalidRequest,
2275 format!("SQL parse error: {msg}"),
2276 ),
2277 SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2278 S3ErrorCode::InvalidRequest,
2279 format!("unsupported SQL feature: {msg}"),
2280 ),
2281 SelectError::RowEval(msg) => S3Error::with_message(
2282 S3ErrorCode::InvalidRequest,
2283 format!("SQL row evaluation error: {msg}"),
2284 ),
2285 SelectError::InputFormat(msg) => S3Error::with_message(
2286 S3ErrorCode::InvalidRequest,
2287 format!("{fmt} input format error: {msg}"),
2288 ),
2289 }
2290}
2291
2292/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2293/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2294/// (including missing) is treated as `false`.
2295fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2296 headers
2297 .get("x-amz-bypass-governance-retention")
2298 .and_then(|v| v.to_str().ok())
2299 .map(|s| s.eq_ignore_ascii_case("true"))
2300 .unwrap_or(false)
2301}
2302
2303/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2304/// as an RFC3339 string and re-parsing through `chrono`. The string format
2305/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2306/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2307/// or the value is outside `chrono`'s supported range.
2308fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2309 let mut buf = Vec::new();
2310 ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2311 .ok()?;
2312 let s = std::str::from_utf8(&buf).ok()?;
2313 chrono::DateTime::parse_from_rfc3339(s)
2314 .ok()
2315 .map(|dt| dt.with_timezone(&chrono::Utc))
2316}
2317
2318/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2319/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2320fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2321 // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2322 // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2323 // unexpected happens — we never produce malformed strings, so this
2324 // branch is unreachable in practice.
2325 let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2326 Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2327}
2328
2329/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2330/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2331/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2332/// the field optional but always populates it on response.
2333fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2334 set.iter()
2335 .map(|(k, v)| Tag {
2336 key: Some(k.clone()),
2337 value: Some(v.clone()),
2338 })
2339 .collect()
2340}
2341
2342/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2343/// keys / values become empty strings (mirrors AWS, which rejects
2344/// `<Key/>` with InvalidTag at the parser layer; downstream
2345/// `TagSet::validate` then enforces our size limits).
2346fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2347 let pairs = tags
2348 .iter()
2349 .map(|t| {
2350 (
2351 t.key.clone().unwrap_or_default(),
2352 t.value.clone().unwrap_or_default(),
2353 )
2354 })
2355 .collect();
2356 crate::tagging::TagSet::from_pairs(pairs)
2357}
2358
2359/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2360/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2361/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2362pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2363 if total == 0 {
2364 return Err("cannot range-get zero-length object".into());
2365 }
2366 match range {
2367 s3s::dto::Range::Int { first, last } => {
2368 let start = *first;
2369 let end_inclusive = match last {
2370 Some(l) => (*l).min(total - 1),
2371 None => total - 1,
2372 };
2373 if start > end_inclusive || start >= total {
2374 return Err(format!(
2375 "range bytes={start}-{:?} out of object size {total}",
2376 last
2377 ));
2378 }
2379 Ok((start, end_inclusive + 1))
2380 }
2381 s3s::dto::Range::Suffix { length } => {
2382 let len = (*length).min(total);
2383 Ok((total - len, total))
2384 }
2385 }
2386}
2387
2388#[async_trait::async_trait]
2389impl<B: S3> S3 for S4Service<B> {
2390 // === 圧縮を挟む path (PUT) ===
2391 #[tracing::instrument(
2392 name = "s4.put_object",
2393 skip(self, req),
2394 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2395 )]
2396 async fn put_object(
2397 &self,
2398 mut req: S3Request<PutObjectInput>,
2399 ) -> S3Result<S3Response<PutObjectOutput>> {
2400 let put_start = Instant::now();
2401 let put_bucket = req.input.bucket.clone();
2402 let put_key = req.input.key.clone();
2403 // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
2404 self.check_not_reserved_key(&put_key, ReservedKeyMode::Mutating)?;
2405 let access_preamble = self.access_log_preamble(&req);
2406 self.enforce_rate_limit(&req, &put_bucket)?;
2407 // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2408 // the IAM policy gate sees the request's tags via
2409 // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2410 // resolved from the Tagging manager (when wired) so
2411 // `s3:ExistingObjectTag/<key>` works on overwrite.
2412 let request_tags: Option<crate::tagging::TagSet> = req
2413 .input
2414 .tagging
2415 .as_deref()
2416 .map(crate::tagging::parse_tagging_header)
2417 .transpose()
2418 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2419 let existing_tags: Option<crate::tagging::TagSet> = self
2420 .tagging
2421 .as_ref()
2422 .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2423 self.enforce_policy_with_extra(
2424 &req,
2425 "s3:PutObject",
2426 &put_bucket,
2427 Some(&put_key),
2428 request_tags.as_ref(),
2429 existing_tags.as_ref(),
2430 )?;
2431 // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2432 // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2433 // bucket PUTs are exempt because they materialise a fresh
2434 // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2435 // locked version's bytes are untouched. The check mirrors the
2436 // delete path (Compliance never bypassable, Governance via the
2437 // bypass header, legal hold never).
2438 if let Some(mgr) = self.object_lock.as_ref()
2439 && let Some(state) = mgr.get(&put_bucket, &put_key)
2440 {
2441 let bucket_versioned_enabled = self
2442 .versioning
2443 .as_ref()
2444 .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2445 .unwrap_or(false);
2446 if !bucket_versioned_enabled {
2447 let bypass = parse_bypass_governance_header(&req.headers);
2448 let now = chrono::Utc::now();
2449 if !state.can_delete(now, bypass) {
2450 crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2451 return Err(S3Error::with_message(
2452 S3ErrorCode::AccessDenied,
2453 "Access Denied because object protected by object lock",
2454 ));
2455 }
2456 }
2457 }
2458 // v0.5 #30: per-PUT explicit retention / legal hold (S3
2459 // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2460 // `x-amz-object-lock-legal-hold`). Captured before the body
2461 // moves into the backend; persisted into the manager only on
2462 // backend success below.
2463 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2464 .input
2465 .object_lock_mode
2466 .as_ref()
2467 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2468 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2469 .input
2470 .object_lock_retain_until_date
2471 .as_ref()
2472 .and_then(timestamp_to_chrono_utc);
2473 let explicit_legal_hold_on: Option<bool> = req
2474 .input
2475 .object_lock_legal_hold_status
2476 .as_ref()
2477 .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2478 if let Some(blob) = req.input.body.take() {
2479 // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2480 // compress fast path、そうでなければ従来の collect-then-compress。
2481 let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2482 .await
2483 .map_err(internal("peek put sample"))?;
2484 let sample_len = sample.len().min(SAMPLE_BYTES);
2485 // v0.8 #56: pass the request's Content-Length (when present) so
2486 // the sampling dispatcher can promote large objects to a GPU
2487 // codec. Chunked transfers (no Content-Length) keep CPU.
2488 let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2489 let kind = self
2490 .dispatcher
2491 .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2492 .await;
2493
2494 // Passthrough buys nothing from S4F2 wrapping (no compression =
2495 // no per-chunk frame to skip past) and the +28-byte header
2496 // overhead breaks size-sensitive callers that expect a true
2497 // pass-through. So passthrough always uses the legacy raw-blob
2498 // path; only compressing codecs go through the framed path.
2499 //
2500 // v0.8.14 follow-up to #127 MED-B: the previous attempt
2501 // forced the buffered path whenever the client supplied
2502 // any whole-body checksum so `verify_client_body_checksums`
2503 // could run. Modern AWS SDKs auto-add an
2504 // `x-amz-checksum-crc32` trailer by default, which made
2505 // every SDK PUT lose the streaming-framed path and
2506 // therefore lose its sidecar — silent data path
2507 // regression caught by
2508 // `range_get_falls_back_to_full_when_sidecar_etag_stale`
2509 // and `upload_part_copy_propagates_source_version_id`
2510 // on the MinIO E2E job. The streaming PUT path now
2511 // passes through unchanged; client-supplied checksums on
2512 // streaming PUTs are NOT verified (same fail-open as
2513 // pre-v0.8.12). The buffered PUT branch and UploadPart
2514 // do verify, which covers the buffered upload case the
2515 // HIGH-12 audit was scoped to. True streaming verify
2516 // (tee-into-hasher on the chained input) remains the
2517 // tracked follow-up.
2518 let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2519 let (compressed, manifest, is_framed) = if use_framed {
2520 // streaming fast path: input は memory に collect しない
2521 let chained = chain_sample_with_rest(sample, rest_stream);
2522 debug!(
2523 bucket = ?req.input.bucket,
2524 key = ?req.input.key,
2525 codec = kind.as_str(),
2526 path = "streaming-framed",
2527 "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2528 );
2529 // v0.4 #16: pick the chunk size based on the request's
2530 // Content-Length when known, falling back to the 4 MiB
2531 // default for chunked transfers.
2532 let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2533 // v0.8.4 #73 M2: pass the request's Content-Length so
2534 // streaming_compress_to_frames can fail-fast on a mid-PUT
2535 // truncation (client disconnect after sending half the
2536 // body). `None` is the chunked-Transfer-Encoding case
2537 // where the upstream genuinely doesn't know the size and
2538 // the backend's framing layer is the only truncation
2539 // signal we have.
2540 let expected_input_size =
2541 req.input.content_length.and_then(|n| u64::try_from(n).ok());
2542 let (body, manifest) = streaming_compress_to_frames(
2543 chained,
2544 Arc::clone(&self.registry),
2545 kind,
2546 chunk_size,
2547 expected_input_size,
2548 )
2549 .await
2550 .map_err(|e| match e {
2551 s4_codec::CodecError::TruncatedStream { expected, got } => {
2552 // 400 IncompleteBody: client advertised N bytes
2553 // but disconnected after `got`. Mirrors AWS S3's
2554 // canonical error code for the same shape so SDK
2555 // retries kick in instead of treating the PUT as
2556 // a successful upload of a half-body.
2557 S3Error::with_message(
2558 S3ErrorCode::IncompleteBody,
2559 format!("PUT body truncated: expected {expected} bytes, got {got}"),
2560 )
2561 }
2562 // v0.8.15 M-4: 400
2563 // `RequestBodyLengthMismatch` for over-length
2564 // bodies. AWS S3 returns this when the declared
2565 // `Content-Length` is smaller than the wire body;
2566 // S4 used to silently accept the surplus bytes.
2567 // `IncompleteBody` is the closest typed variant
2568 // in the s3s enum — we widen the message so the
2569 // SDK / curl side sees the shape unambiguously.
2570 s4_codec::CodecError::OverlengthStream { expected, got } => {
2571 let code = S3ErrorCode::from_bytes(b"RequestBodyLengthMismatch")
2572 .unwrap_or(S3ErrorCode::IncompleteBody);
2573 S3Error::with_message(
2574 code,
2575 format!(
2576 "PUT body length mismatch: Content-Length declared {expected} \
2577 bytes, body carried at least {got}"
2578 ),
2579 )
2580 }
2581 other => internal("streaming framed compress")(other),
2582 })?;
2583 (body, manifest, true)
2584 } else {
2585 // GPU codec 等で streaming-aware でないものは bytes-buffered path
2586 // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
2587 let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
2588 .await
2589 .map_err(internal("collect put body (buffered path)"))?;
                // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
                // checksum algorithms against the received body on
                // the buffered path. NOTE: since the v0.8.14 follow-up
                // to #127 MED-B the streaming-framed branch above no
                // longer redirects here when a checksum header is
                // present, so client-supplied checksums are verified
                // only for PUTs that take this buffered branch.
2596 verify_client_body_checksums(
2597 &bytes,
2598 req.input.content_md5.as_deref(),
2599 req.input.checksum_crc32.as_deref(),
2600 req.input.checksum_crc32c.as_deref(),
2601 req.input.checksum_sha1.as_deref(),
2602 req.input.checksum_sha256.as_deref(),
2603 req.input.checksum_crc64nvme.as_deref(),
2604 )?;
2605 debug!(
2606 bucket = ?req.input.bucket,
2607 key = ?req.input.key,
2608 bytes = bytes.len(),
2609 codec = kind.as_str(),
2610 path = "buffered",
2611 "S4 put_object: compressing (buffered, raw blob)"
2612 );
2613 // v0.8 #55: telemetry-returning compress so we can stamp
2614 // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
2615 // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
2616 // CPU codecs come back with `gpu_seconds = None` and the
2617 // stamp helper short-circuits — no extra cost on CPU path.
2618 let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
2619 stamp_gpu_compress_telemetry(&tel);
2620 let (body, m) = compress_res.map_err(internal("registry compress"))?;
2621 (body, m, false)
2622 };
2623
2624 write_manifest(&mut req.input.metadata, &manifest);
2625 if is_framed {
2626 // v0.2 #4: framed body であることを GET 側に伝える meta flag。
2627 req.input
2628 .metadata
2629 .get_or_insert_with(Default::default)
2630 .insert(META_FRAMED.into(), "true".into());
2631 }
2632 // 重要: content_length を圧縮後サイズで更新する。
2633 // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
2634 // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
2635 req.input.content_length = Some(compressed.len() as i64);
2636 // body を書き換えたので、客側が送ってきた original body 用の
2637 // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
2638 // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
2639 // ChunkManifest.crc32c で担保している。
2640 req.input.checksum_algorithm = None;
2641 req.input.checksum_crc32 = None;
2642 req.input.checksum_crc32c = None;
2643 req.input.checksum_crc64nvme = None;
2644 req.input.checksum_sha1 = None;
2645 req.input.checksum_sha256 = None;
2646 req.input.content_md5 = None;
2647 let original_size = manifest.original_size;
2648 let compressed_size = manifest.compressed_size;
2649 let codec_label = manifest.codec.as_str();
2650 // (sidecar_index is built below, after the SSE-mode
2651 // extraction, so v0.8.12 HIGH-10 can short-circuit the
2652 // build when the on-disk bytes are about to be encrypted.)
2653 // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
2654 // Precedence:
2655 // - SSE-C headers present → per-request customer key (S4E3)
2656 // - server-managed keyring configured → active key (S4E2)
2657 // - neither → no encryption (raw compressed body)
2658 // The `s4-encrypted: aes-256-gcm` metadata flag is set in
2659 // both encrypted modes; the on-disk frame magic distinguishes
2660 // S4E1 / S4E2 / S4E3 so GET picks the right decrypt path.
2661 // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
2662 // so the encryption headers are NOT forwarded to the
2663 // backend. S4 owns the encrypt-then-store contract; if we
2664 // leave the headers in place, real S3-compat backends
2665 // (MinIO / AWS) try to apply their own SSE on top and
2666 // either reject (MinIO requires HTTPS for SSE-C) or fail
2667 // (MinIO has no KMS configured). MemoryBackend ignored
2668 // these so mock tests passed.
2669 let sse_c_alg = req.input.sse_customer_algorithm.take();
2670 let sse_c_key = req.input.sse_customer_key.take();
2671 let sse_c_md5 = req.input.sse_customer_key_md5.take();
2672 let sse_header = req.input.server_side_encryption.take();
2673 let sse_kms_key = req.input.ssekms_key_id.take();
2674 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
2675 // v0.5 #28: SSE-KMS request? Resolves to None unless the
2676 // request asks for `aws:kms` AND a key id is available
2677 // (explicit header or gateway default). When set, we'll
2678 // generate a per-object DEK below.
2679 let kms_key_id = extract_kms_key_id(
2680 &sse_header,
2681 &sse_kms_key,
2682 self.kms_default_key_id.as_deref(),
2683 );
2684 // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
2685 // pre-encrypt `compressed` body, but the bytes the
2686 // backend stores when any SSE mode is active are
2687 // *post-encrypt* (different length, different layout).
2688 // A Range GET on an SSE-encrypted object would slice the
2689 // ciphertext at the stale offsets, hand the wrong bytes
2690 // to the frame parser, and 500. Suppress the sidecar
2691 // entirely when SSE is going to be applied below;
2692 // encrypted-object Range GET falls back to the buffered
2693 // path (decrypt full body → frame parse → slice), trading
2694 // partial-fetch performance for correctness. An
2695 // encryption-aware sidecar format is a follow-up issue.
2696 let will_encrypt =
2697 sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
2698 let sidecar_index = if is_framed && !will_encrypt {
2699 s4_codec::index::build_index_from_body(&compressed).ok()
2700 } else {
2701 None
2702 };
2703 // v0.5 #32: in compliance-strict mode, every PUT must
2704 // declare SSE — either client-supplied (SSE-C), KMS, or by
2705 // virtue of a server-side keyring being configured (which
2706 // applies SSE-S4 to every PUT automatically). Requests that
2707 // would otherwise land as plain compressed bytes are
2708 // rejected with 400 InvalidRequest.
2709 if self.compliance_strict
2710 && sse_c_material.is_none()
2711 && kms_key_id.is_none()
2712 && self.sse_keyring.is_none()
2713 && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
2714 {
2715 return Err(S3Error::with_message(
2716 S3ErrorCode::InvalidRequest,
2717 "compliance-mode strict: PUT must include x-amz-server-side-encryption \
2718 (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
2719 ));
2720 }
            // SSE-C and SSE-KMS are mutually exclusive on a single PUT
            // (AWS S3 returns 400 InvalidArgument), so a request carrying
            // both is rejected outright rather than letting either one win.
2723 if sse_c_material.is_some() && kms_key_id.is_some() {
2724 return Err(S3Error::with_message(
2725 S3ErrorCode::InvalidArgument,
2726 "SSE-C and SSE-KMS cannot be used together on the same PUT",
2727 ));
2728 }
2729 // KMS path needs to call generate_dek().await before the
2730 // body_to_send branch; capture the result here.
2731 //
2732 // v0.8.1 #58: the plaintext DEK lives in three places
2733 // during one PUT:
2734 //
2735 // 1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
2736 // — wiped when the binding `dek` falls out of scope at
2737 // the end of this `if`-arm.
2738 // 2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
2739 // — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
2740 // the outer `kms_wrap` `Option` is dropped at the end
2741 // of `put_object`.
2742 // 3. AES-GCM internal key state inside the `aes-gcm`
2743 // crate during `encrypt_with_source` — out of scope
2744 // for this fix; tracked separately in v0.8.2.
2745 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
2746 if let Some(ref key_id) = kms_key_id {
2747 let kms = self.kms.as_ref().ok_or_else(|| {
2748 S3Error::with_message(
2749 S3ErrorCode::InvalidRequest,
2750 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
2751 )
2752 })?;
2753 // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
2754 // works unchanged via `Deref<Target=Vec<u8>>`.
2755 let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
2756 if dek.len() != 32 {
2757 return Err(S3Error::with_message(
2758 S3ErrorCode::InternalError,
2759 format!(
2760 "KMS backend returned a DEK of {} bytes (expected 32)",
2761 dek.len()
2762 ),
2763 ));
2764 }
2765 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
2766 zeroize::Zeroizing::new([0u8; 32]);
2767 dek_arr.copy_from_slice(&dek);
2768 // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
2769 // end of this scope, wiping the heap allocation.
2770 Some((dek_arr, wrapped))
2771 } else {
2772 None
2773 };
2774 // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
2775 // alongside `s4-encrypted` so HEAD (which doesn't fetch the
2776 // body) can echo the correct `x-amz-server-side-encryption`
2777 // value. Without this, HEAD on an SSE-KMS object would not
2778 // echo `aws:kms` because the frame magic is only available
2779 // on the body (which HEAD doesn't read).
2780 let body_to_send = if let Some(ref m) = sse_c_material {
2781 let meta = req.input.metadata.get_or_insert_with(Default::default);
2782 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2783 meta.insert("s4-sse-type".into(), "AES256".into());
2784 meta.insert(
2785 "s4-sse-c-key-md5".into(),
2786 base64::engine::general_purpose::STANDARD.encode(m.key_md5),
2787 );
2788 crate::sse::encrypt_with_source(
2789 &compressed,
2790 crate::sse::SseSource::CustomerKey {
2791 key: &m.key,
2792 key_md5: &m.key_md5,
2793 },
2794 )
2795 } else if let Some((ref dek, ref wrapped)) = kms_wrap {
2796 let meta = req.input.metadata.get_or_insert_with(Default::default);
2797 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2798 meta.insert("s4-sse-type".into(), "aws:kms".into());
2799 meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
2800 // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
2801 // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
2802 // `&T` here via `Deref<Target=T>`, so the binding picks
2803 // up the inner array reference without copying. The array
2804 // stays in the `Zeroizing` wrapper that owns it and gets
2805 // wiped when `kms_wrap` drops at the end of `put_object`.
2806 let dek_ref: &[u8; 32] = dek;
2807 crate::sse::encrypt_with_source(
2808 &compressed,
2809 crate::sse::SseSource::Kms {
2810 dek: dek_ref,
2811 wrapped,
2812 },
2813 )
2814 } else if let Some(keyring) = self.sse_keyring.as_ref() {
2815 // SSE-S4 is server-driven transparent encryption; the
2816 // client didn't ask for SSE. We stamp `s4-encrypted`
2817 // (internal flag the GET path needs) but deliberately
2818 // do NOT stamp `s4-sse-type` — that lights up the HEAD
2819 // echo of `x-amz-server-side-encryption: AES256`,
2820 // which would falsely advertise AWS-style SSE-S3
2821 // semantics the operator didn't request.
2822 let meta = req.input.metadata.get_or_insert_with(Default::default);
2823 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
2824 // v0.8 #52: when `--sse-chunk-size > 0` is configured,
2825 // emit the chunked S4E5 frame so the matching GET can
2826 // stream-decrypt instead of buffering 5 GiB before
2827 // emitting a byte. Falls back to the buffered S4E2
2828 // frame at chunk_size=0 (default) so existing
2829 // deployments are bit-for-bit unchanged.
2830 if self.sse_chunk_size > 0 {
2831 crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
2832 .map_err(|e| {
2833 S3Error::with_message(
2834 S3ErrorCode::InternalError,
2835 format!("SSE-S4 chunked encrypt failed: {e}"),
2836 )
2837 })?
2838 } else {
2839 crate::sse::encrypt_v2(&compressed, keyring)
2840 }
2841 } else {
2842 compressed.clone()
2843 };
2844 // v0.6 #40: capture the about-to-be-sent body + metadata so
2845 // the replication dispatcher (run after the source PUT
2846 // succeeds) can hand the same backend bytes to the
2847 // destination bucket. `Bytes` clone is cheap (refcounted).
2848 let replication_body = body_to_send.clone();
2849 let replication_metadata = req.input.metadata.clone();
2850 // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
2851 // makes the body longer than the post-compression bytes
2852 // (header + nonce + tag overhead). The earlier
2853 // content_length stamp at compressed.len() is now stale, so
2854 // re-stamp from the actual bytes about to be sent or the
2855 // backend (real S3 / MinIO) rejects with
2856 // `StreamLengthMismatch`. MemoryBackend never validated
2857 // this, which is why mock-only tests passed.
2858 req.input.content_length = Some(body_to_send.len() as i64);
2859 req.input.body = Some(bytes_to_blob(body_to_send));
2860 // v0.5 #34: pre-allocate a version-id when the bucket is
2861 // Enabled, then redirect the backend storage key to the
2862 // shadow path so older versions survive newer PUTs.
2863 // Suspended / Unversioned buckets keep using the plain
2864 // `<key>` (S3 spec: Suspended overwrites the same backend
2865 // object). Pre-allocation (instead of recording after PUT)
2866 // ensures the shadow key + the response's
2867 // `x-amz-version-id` use the same vid.
2868 let pending_version: Option<crate::versioning::PutOutcome> = self
2869 .versioning
2870 .as_ref()
2871 .map(|mgr| mgr.state(&put_bucket))
2872 .map(|state| match state {
2873 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
2874 version_id: crate::versioning::VersioningManager::new_version_id(),
2875 versioned_response: true,
2876 },
2877 crate::versioning::VersioningState::Suspended
2878 | crate::versioning::VersioningState::Unversioned => {
2879 crate::versioning::PutOutcome {
2880 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
2881 versioned_response: false,
2882 }
2883 }
2884 });
2885 if let Some(ref pv) = pending_version
2886 && pv.versioned_response
2887 {
2888 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
2889 }
2890 // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
2891 // the move into `req.input` is consumed by the backend call.
2892 // The sidecar's `source_compressed_size` is checked against
2893 // the live HEAD `Content-Length` on Range GET to detect a
2894 // backend-side mutation.
2895 let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2896 let mut backend_resp = self.backend.put_object(req).await;
2897 if let Some(mut idx) = sidecar_index
2898 && let Ok(ref resp) = backend_resp
2899 && idx.entries.len() > 1
2900 {
2901 // 1 chunk しかない (small object) なら sidecar は意味がない (=
2902 // partial fetch しても full body と同じ範囲) ので省略。
2903 // Sidecar は user-visible key で書く (latest version の
2904 // partial fetch path 用)。Old versions の Range GET は今 task
2905 // の scope 外 (full read fallback でも意味的には正しい)。
2906 //
                // v0.8.4 #73 H-2: stamp the version-binding fields the
                // GET path needs to detect a stale / attacker-written
                // sidecar. ETag comes from the backend's PUT response —
                // when missing (some backends don't return an ETag)
                // `source_etag` is stored as `None`; the GET HEAD will
                // see the same backend ETag (None vs None) and treat the
                // pair as consistent.
2915 let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
2916 idx.source_etag = source_etag;
2917 idx.source_compressed_size = backend_object_size;
2918 self.write_sidecar(&put_bucket, &put_key, &idx).await;
2919 }
2920 // v0.5 #34: commit the new version into the manager only on
2921 // backend success. Use the pre-allocated vid so the response
2922 // header and the chain entry agree.
2923 if let (Some(mgr), Some(pv), Ok(resp)) = (
2924 self.versioning.as_ref(),
2925 pending_version.as_ref(),
2926 backend_resp.as_mut(),
2927 ) {
2928 let etag = resp
2929 .output
2930 .e_tag
2931 .clone()
2932 .map(ETag::into_value)
2933 .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
2934 let now = chrono::Utc::now();
2935 mgr.commit_put_with_version(
2936 &put_bucket,
2937 &put_key,
2938 crate::versioning::VersionEntry {
2939 version_id: pv.version_id.clone(),
2940 etag,
2941 size: original_size,
2942 is_delete_marker: false,
2943 created_at: now,
2944 },
2945 );
2946 if pv.versioned_response {
2947 resp.output.version_id = Some(pv.version_id.clone());
2948 }
2949 }
2950 // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
2951 // so the client knows the server actually applied the
2952 // requested algorithm and which key fingerprint matched.
2953 if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
2954 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
2955 resp.output.sse_customer_key_md5 =
2956 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
2957 }
2958 // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
2959 // the backend returned (AWS KMS returns the ARN even when
2960 // the request used an alias).
2961 if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
2962 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
2963 ServerSideEncryption::AWS_KMS,
2964 ));
2965 resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
2966 }
2967 // v0.5 #30: persist any per-PUT explicit retention / legal
2968 // hold the client supplied, then auto-apply the bucket
2969 // default (no-op when state is already populated). The
2970 // explicit fields take precedence — the bucket-default
2971 // helper bails out as soon as it sees any retention.
2972 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
2973 if explicit_lock_mode.is_some()
2974 || explicit_retain_until.is_some()
2975 || explicit_legal_hold_on.is_some()
2976 {
2977 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
2978 if let Some(m) = explicit_lock_mode {
2979 state.mode = Some(m);
2980 }
2981 if let Some(u) = explicit_retain_until {
2982 state.retain_until = Some(u);
2983 }
2984 if let Some(lh) = explicit_legal_hold_on {
2985 state.legal_hold_on = lh;
2986 }
2987 mgr.set(&put_bucket, &put_key, state);
2988 }
2989 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
2990 }
2991 let _ = (original_size, compressed_size); // mute unused warnings
2992 let elapsed = put_start.elapsed();
2993 crate::metrics::record_put(
2994 codec_label,
2995 original_size,
2996 compressed_size,
2997 elapsed.as_secs_f64(),
2998 backend_resp.is_ok(),
2999 );
3000 // v0.4 #20: structured access-log entry (best-effort).
3001 self.record_access(
3002 access_preamble,
3003 "REST.PUT.OBJECT",
3004 &put_bucket,
3005 Some(&put_key),
3006 if backend_resp.is_ok() { 200 } else { 500 },
3007 compressed_size,
3008 original_size,
3009 elapsed.as_millis() as u64,
3010 backend_resp.as_ref().err().map(|e| e.code().as_str()),
3011 )
3012 .await;
3013 info!(
3014 op = "put_object",
3015 bucket = %put_bucket,
3016 key = %put_key,
3017 codec = codec_label,
3018 bytes_in = original_size,
3019 bytes_out = compressed_size,
3020 ratio = format!(
3021 "{:.3}",
3022 if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
3023 ),
3024 latency_ms = elapsed.as_millis() as u64,
3025 ok = backend_resp.is_ok(),
3026 "S4 put completed"
3027 );
3028 // v0.6 #35: fire bucket-notification destinations (best-effort,
3029 // detached). Skipped when no manager is attached or when the
3030 // bucket has no rule matching `s3:ObjectCreated:Put` for this
3031 // key.
3032 if backend_resp.is_ok()
3033 && let Some(mgr) = self.notifications.as_ref()
3034 {
3035 let dests = mgr.match_destinations(
3036 &put_bucket,
3037 &crate::notifications::EventType::ObjectCreatedPut,
3038 &put_key,
3039 );
3040 if !dests.is_empty() {
3041 let etag = backend_resp
3042 .as_ref()
3043 .ok()
3044 .and_then(|r| r.output.e_tag.clone())
3045 .map(ETag::into_value);
3046 let version_id = pending_version
3047 .as_ref()
3048 .filter(|pv| pv.versioned_response)
3049 .map(|pv| pv.version_id.clone());
3050 tokio::spawn(crate::notifications::dispatch_event(
3051 Arc::clone(mgr),
3052 put_bucket.clone(),
3053 put_key.clone(),
3054 crate::notifications::EventType::ObjectCreatedPut,
3055 Some(original_size),
3056 etag,
3057 version_id,
3058 format!("S4-{}", uuid::Uuid::new_v4()),
3059 ));
3060 }
3061 }
3062 // v0.6 #39: persist parsed `x-amz-tagging` tags into the
3063 // tagging manager on a successful PUT. AWS PutObject's
3064 // tagging is a full-replace operation (not a merge), so
3065 // any pre-existing entry for `(bucket, key)` is overwritten.
3066 if backend_resp.is_ok()
3067 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3068 {
3069 mgr.put_object_tags(&put_bucket, &put_key, tags);
3070 }
3071 // v0.6 #40: cross-bucket replication fire-point. On
3072 // successful source PUT, consult the replication manager;
3073 // when an enabled rule matches, mark the source key
3074 // `Pending` and spawn a detached task that PUTs the same
3075 // backend bytes + metadata to the rule's destination
3076 // bucket. The dispatcher itself records `Completed` /
3077 // `Failed` and bumps the drop counter on retry-budget
3078 // exhaustion.
3079 self.spawn_replication_if_matched(
3080 &put_bucket,
3081 &put_key,
3082 &request_tags,
3083 &replication_body,
3084 &replication_metadata,
3085 backend_resp.is_ok(),
3086 pending_version.as_ref(),
3087 );
3088 return backend_resp;
3089 }
3090 // Body-less PUT (rare: zero-length object). Mirror the body-full
3091 // versioning hooks so list_object_versions / GET-by-version still see
3092 // empty-body objects in the chain.
3093 let pending_version: Option<crate::versioning::PutOutcome> = self
3094 .versioning
3095 .as_ref()
3096 .map(|mgr| mgr.state(&put_bucket))
3097 .map(|state| match state {
3098 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3099 version_id: crate::versioning::VersioningManager::new_version_id(),
3100 versioned_response: true,
3101 },
3102 _ => crate::versioning::PutOutcome {
3103 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3104 versioned_response: false,
3105 },
3106 });
3107 if let Some(ref pv) = pending_version
3108 && pv.versioned_response
3109 {
3110 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3111 }
3112 let mut backend_resp = self.backend.put_object(req).await;
3113 if let (Some(mgr), Some(pv), Ok(resp)) = (
3114 self.versioning.as_ref(),
3115 pending_version.as_ref(),
3116 backend_resp.as_mut(),
3117 ) {
3118 let etag = resp
3119 .output
3120 .e_tag
3121 .clone()
3122 .map(ETag::into_value)
3123 .unwrap_or_default();
3124 let now = chrono::Utc::now();
3125 mgr.commit_put_with_version(
3126 &put_bucket,
3127 &put_key,
3128 crate::versioning::VersionEntry {
3129 version_id: pv.version_id.clone(),
3130 etag,
3131 size: 0,
3132 is_delete_marker: false,
3133 created_at: now,
3134 },
3135 );
3136 if pv.versioned_response {
3137 resp.output.version_id = Some(pv.version_id.clone());
3138 }
3139 }
3140 // v0.5 #30: same explicit-then-default lock-state commit as the
3141 // body-bearing branch above, so a zero-length PUT also picks up
3142 // bucket-default retention.
3143 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3144 if explicit_lock_mode.is_some()
3145 || explicit_retain_until.is_some()
3146 || explicit_legal_hold_on.is_some()
3147 {
3148 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3149 if let Some(m) = explicit_lock_mode {
3150 state.mode = Some(m);
3151 }
3152 if let Some(u) = explicit_retain_until {
3153 state.retain_until = Some(u);
3154 }
3155 if let Some(lh) = explicit_legal_hold_on {
3156 state.legal_hold_on = lh;
3157 }
3158 mgr.set(&put_bucket, &put_key, state);
3159 }
3160 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3161 }
3162 // v0.6 #35: same notification fire-point as the body-bearing PUT
3163 // branch above (zero-length objects still match `ObjectCreated:Put`
3164 // rules per the AWS event taxonomy).
3165 if backend_resp.is_ok()
3166 && let Some(mgr) = self.notifications.as_ref()
3167 {
3168 let dests = mgr.match_destinations(
3169 &put_bucket,
3170 &crate::notifications::EventType::ObjectCreatedPut,
3171 &put_key,
3172 );
3173 if !dests.is_empty() {
3174 let etag = backend_resp
3175 .as_ref()
3176 .ok()
3177 .and_then(|r| r.output.e_tag.clone())
3178 .map(ETag::into_value);
3179 let version_id = pending_version
3180 .as_ref()
3181 .filter(|pv| pv.versioned_response)
3182 .map(|pv| pv.version_id.clone());
3183 tokio::spawn(crate::notifications::dispatch_event(
3184 Arc::clone(mgr),
3185 put_bucket.clone(),
3186 put_key.clone(),
3187 crate::notifications::EventType::ObjectCreatedPut,
3188 Some(0),
3189 etag,
3190 version_id,
3191 format!("S4-{}", uuid::Uuid::new_v4()),
3192 ));
3193 }
3194 }
3195 // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3196 // (zero-length) PUT branch too — same shape as the body-bearing
3197 // branch above.
3198 if backend_resp.is_ok()
3199 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3200 {
3201 mgr.put_object_tags(&put_bucket, &put_key, tags);
3202 }
3203 // v0.6 #40: cross-bucket replication for the zero-length PUT
3204 // branch — same shape as the body-bearing branch above.
3205 // v0.8.2 #61: pass `pending_version` so a versioned source's
3206 // destination receives the same shadow-key path.
3207 self.spawn_replication_if_matched(
3208 &put_bucket,
3209 &put_key,
3210 &request_tags,
3211 &bytes::Bytes::new(),
3212 &None,
3213 backend_resp.is_ok(),
3214 pending_version.as_ref(),
3215 );
3216 backend_resp
3217 }
3218
3219 // === 圧縮を解く path (GET) ===
3220 #[tracing::instrument(
3221 name = "s4.get_object",
3222 skip(self, req),
3223 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3224 )]
3225 async fn get_object(
3226 &self,
3227 mut req: S3Request<GetObjectInput>,
3228 ) -> S3Result<S3Response<GetObjectOutput>> {
3229 let get_start = Instant::now();
3230 let get_bucket = req.input.bucket.clone();
3231 let get_key = req.input.key.clone();
3232 // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
3233 self.check_not_reserved_key(&get_key, ReservedKeyMode::Read)?;
3234 self.enforce_rate_limit(&req, &get_bucket)?;
3235 self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3236 // Range request の事前検出 (decompress 後 slice する path に使う)。
3237 let range_request = req.input.range.take();
3238 // v0.5 #27: pull SSE-C material from the input headers before
3239 // the request is moved into the backend. A header parse error
3240 // fails fast (no body fetch). The material is consumed below
3241 // when decrypting an S4E3-framed body; the SSE-C headers on
3242 // `req.input` are cleared so the backend doesn't see them.
3243 let sse_c_alg = req.input.sse_customer_algorithm.take();
3244 let sse_c_key = req.input.sse_customer_key.take();
3245 let sse_c_md5 = req.input.sse_customer_key_md5.take();
3246 let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3247
3248 // v0.5 #34: route the GET through the VersioningManager when
3249 // attached AND the bucket is in a versioning-aware state.
3250 // Resolves which version to fetch (explicit `?versionId=` query
3251 // param vs. chain latest), translates a delete-marker into 404
3252 // NoSuchKey, and rewrites the backend storage key to the shadow
3253 // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3254 // versions. `resolved_version_id` is stamped onto the response
3255 // so clients see a coherent `x-amz-version-id` header.
3256 //
3257 // When the bucket is Unversioned (or no manager attached), the
3258 // chain-resolution step is skipped and the request flows
3259 // through the existing single-key path unchanged.
3260 let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3261 Some(mgr)
3262 if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3263 {
3264 let req_vid = req.input.version_id.take();
3265 let entry = match req_vid.as_deref() {
3266 Some(vid) => {
3267 mgr.lookup_version(&get_bucket, &get_key, vid)
3268 .ok_or_else(|| {
3269 S3Error::with_message(
3270 S3ErrorCode::NoSuchVersion,
3271 format!("no such version: {vid}"),
3272 )
3273 })?
3274 }
3275 None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3276 S3Error::with_message(
3277 S3ErrorCode::NoSuchKey,
3278 format!("no such key: {get_key}"),
3279 )
3280 })?,
3281 };
3282 if entry.is_delete_marker {
3283 // S3 spec: GET without versionId on a
3284 // delete-marker latest → 404 NoSuchKey + the
3285 // response carries `x-amz-delete-marker: true`.
3286 // GET with explicit versionId pointing at a delete
3287 // marker → 405 MethodNotAllowed; we surface
3288 // NoSuchKey here for both since s3s collapses them
3289 // into the same not-found error path.
3290 return Err(S3Error::with_message(
3291 S3ErrorCode::NoSuchKey,
3292 format!("delete marker is the current version of {get_key}"),
3293 ));
3294 }
3295 if entry.version_id != crate::versioning::NULL_VERSION_ID {
3296 req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3297 }
3298 Some(entry.version_id)
3299 }
3300 _ => None,
3301 };
3302
3303 // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3304 // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3305 // 必要 frame だけを backend に Range GET し帯域節約する。
3306 //
3307 // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3308 // verify the source object hasn't been overwritten / mutated since
3309 // the sidecar was stamped. The sidecar carries the backend ETag
3310 // captured at PUT time (`source_etag`); a HEAD against the current
3311 // backend object tells us the live ETag. If they disagree we treat
3312 // the sidecar as stale and fall through to the full-GET path —
3313 // returning the wrong frames for a Range request would surface as
3314 // a CRC mismatch deeper in the stack but would also potentially
3315 // disclose unrelated frames if a hostile operator wrote the
3316 // sidecar themselves. Fail-open to "full read" is the safe default.
3317 //
3318 // Legacy v1 sidecars (no `source_etag` populated) keep the old
3319 // best-effort behaviour so existing on-disk indexes don't suddenly
3320 // start missing the partial-fetch path.
3321 if let Some(ref r) = range_request
3322 && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3323 && self
3324 .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3325 .await
3326 {
3327 let total = index.total_original_size();
3328 let (start, end_exclusive) = match resolve_range(r, total) {
3329 Ok(v) => v,
3330 Err(e) => {
3331 return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3332 }
3333 };
3334 if let Some(plan) = index.lookup_range(start, end_exclusive) {
3335 return self
3336 .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3337 .await;
3338 }
3339 }
3340 let mut resp = self.backend.get_object(req).await?;
3341 // v0.5 #34: stamp the resolved version-id so the client sees a
3342 // coherent `x-amz-version-id` header (only for chains owned by
3343 // the manager — Unversioned buckets / no-manager paths never
3344 // set this).
3345 if let Some(ref vid) = resolved_version_id {
3346 resp.output.version_id = Some(vid.clone());
3347 }
3348 let is_multipart = is_multipart_object(&resp.output.metadata);
3349 let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3350 // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3351 // multipart と同じ path に流す。
3352 let needs_frame_parse = is_multipart || is_framed_v2;
3353 let manifest_opt = extract_manifest(&resp.output.metadata);
3354
3355 if !needs_frame_parse && manifest_opt.is_none() {
3356 // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3357 debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3358 return Ok(resp);
3359 }
3360
3361 if let Some(blob) = resp.output.body.take() {
3362 // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3363 // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3364 // before any frame parse / streaming decompress. Encrypted
3365 // bodies are opaque to the codec; this also forces the
3366 // buffered path because AES-GCM needs the full body for tag
3367 // verify. SSE-C uses the per-request customer key, SSE-S4
3368 // falls back to the configured keyring.
3369 let blob = if is_sse_encrypted(&resp.output.metadata) {
3370 let body = collect_blob(blob, self.max_body_bytes)
3371 .await
3372 .map_err(internal("collect SSE-encrypted body"))?;
3373 // v0.5 #28: peek the frame magic to route the right
3374 // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3375 // through the KMS backend (async). S4E1/E2/E3 take
3376 // the sync path (keyring or customer key).
3377 //
3378 // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3379 // SSE-S4 frames take the *streaming* path — we hand
3380 // the response body a per-chunk verify-and-emit
3381 // Stream so the client sees chunk 0 plaintext after
3382 // one chunk-worth of AES-GCM verify (vs. waiting
3383 // for the whole body's tag), and the gateway no
3384 // longer needs to materialize the full plaintext
3385 // in memory before responding. SSE-C is out of
3386 // scope for the chunked path (chunked S4E3 is a
3387 // follow-up), so this branch requires the SSE-S4
3388 // keyring to be wired and `get_sse_c_material` to
3389 // be absent — otherwise we surface a clear
3390 // misconfiguration error instead of silently
3391 // falling through to the buffered chunked path.
3392 // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3393 // only correct when the decrypted body IS the user's
3394 // plaintext as-stored. If the object went through the
3395 // codec (compressed) or carries S4F2 frames, returning
3396 // the decrypt stream directly hands the client
3397 // compressed / framed bytes. Restrict the early-return
3398 // to codec=Passthrough + non-framed objects; everything
3399 // else falls through to the buffered path, which
3400 // decrypt-buffers S4E5/S4E6 via
3401 // `decrypt_chunked_buffered_default` and then runs the
3402 // existing decompress pipeline.
3403 let chunked_streaming_safe = !needs_frame_parse
3404 && manifest_opt
3405 .as_ref()
3406 .map(|m| m.codec == CodecKind::Passthrough)
3407 .unwrap_or(false);
3408 if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3409 && get_sse_c_material.is_none()
3410 && chunked_streaming_safe
3411 {
3412 let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3413 S3Error::with_message(
3414 S3ErrorCode::InvalidRequest,
3415 "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
3416 )
3417 })?;
3418 let body_len = body.len() as u64;
3419 let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
3420 // Stream is `'static` (the keyring borrow is
3421 // consumed up front; the cipher lives inside
3422 // the stream state — see decrypt_chunked_stream
3423 // doc), so we can move it straight into a
3424 // StreamingBlob without lifetime gymnastics.
3425 use futures::StreamExt;
3426 let mapped = stream.map(|r| {
3427 r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
3428 });
3429 use s3s::dto::StreamingBlob;
3430 resp.output.body = Some(StreamingBlob::wrap(mapped));
3431 // Plaintext content_length is unknown until all
3432 // chunks have been verified; null it out so the
3433 // ByteStream wrapper reports `unknown` to the
3434 // HTTP layer (which then emits chunked transfer-
3435 // encoding) rather than lying about the size.
3436 resp.output.content_length = None;
3437 // The backend's checksums + ETag describe the
3438 // encrypted body (S4E5/S4E6 wire format), not
3439 // the plaintext we're about to stream — clear them
3440 // so the AWS SDK doesn't fail the GET with a
3441 // ChecksumMismatch on a successful round-trip.
3442 // Mirrors the streaming-zstd path at L1180-1185.
3443 resp.output.checksum_crc32 = None;
3444 resp.output.checksum_crc32c = None;
3445 resp.output.checksum_crc64nvme = None;
3446 resp.output.checksum_sha1 = None;
3447 resp.output.checksum_sha256 = None;
3448 resp.output.e_tag = None;
3449 let elapsed = get_start.elapsed();
3450 crate::metrics::record_get(
3451 "sse-s4-chunked",
3452 body_len,
3453 body_len,
3454 elapsed.as_secs_f64(),
3455 true,
3456 );
3457 return Ok(resp);
3458 }
3459 let plain = match crate::sse::peek_magic(&body) {
3460 Some("S4E4") => {
3461 let kms = self.kms.as_ref().ok_or_else(|| {
3462 S3Error::with_message(
3463 S3ErrorCode::InvalidRequest,
3464 "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3465 )
3466 })?;
3467 let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
3468 crate::sse::decrypt_with_kms(&body, kms_ref)
3469 .await
3470 .map_err(|e| match e {
3471 crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
3472 other => S3Error::with_message(
3473 S3ErrorCode::InternalError,
3474 format!("SSE-KMS decrypt failed: {other}"),
3475 ),
3476 })?
3477 }
3478 _ => {
3479 if let Some(ref m) = get_sse_c_material {
3480 crate::sse::decrypt(
3481 &body,
3482 crate::sse::SseSource::CustomerKey {
3483 key: &m.key,
3484 key_md5: &m.key_md5,
3485 },
3486 )
3487 .map_err(sse_c_error_to_s3)?
3488 } else {
3489 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
3490 S3Error::with_message(
3491 S3ErrorCode::InvalidRequest,
3492 "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
3493 )
3494 })?;
3495 crate::sse::decrypt(&body, keyring).map_err(|e| {
3496 S3Error::with_message(
3497 S3ErrorCode::InternalError,
3498 format!("SSE-S4 decrypt failed: {e}"),
3499 )
3500 })?
3501 }
3502 }
3503 };
3504 // v0.5 #28: parse out the on-disk wrapped DEK's key id
3505 // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
3506 if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
3507 && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
3508 {
3509 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3510 ServerSideEncryption::AWS_KMS,
3511 ));
3512 resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
3513 }
3514 bytes_to_blob(plain)
3515 } else if let Some(ref m) = get_sse_c_material {
3516 // Client sent SSE-C headers for an unencrypted object —
3517 // mirror AWS S3's 400 InvalidRequest.
3518 let _ = m;
3519 return Err(sse_c_error_to_s3(
3520 crate::sse::SseError::CustomerKeyUnexpected,
3521 ));
3522 } else {
3523 blob
3524 };
3525 // v0.5 #27: SSE-C echo on success — algorithm + key MD5
3526 // tell the client that the supplied key was the one used.
3527 if let Some(ref m) = get_sse_c_material {
3528 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3529 resp.output.sse_customer_key_md5 =
3530 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3531 }
3532 // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
3533 // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
3534 // codec が streaming-aware なら body を chunk-by-chunk で decompress して
3535 // 即座に client に流す。
3536 //
3537 // ただし Range request 時は streaming できない (slice するため total bytes
3538 // が必要) → buffered path に fall through。
3539 if range_request.is_none()
3540 && !needs_frame_parse
3541 && let Some(ref m) = manifest_opt
3542 && supports_streaming_decompress(m.codec)
3543 && m.codec == CodecKind::CpuZstd
3544 {
3545 // v0.8.4 #73 H-1: wrap the decompressor output in a
3546 // rolling-CRC32C verifier so a tampered ciphertext (or a
3547 // backend-side corruption that the zstd decoder happens
3548 // to "successfully" decode into wrong bytes) surfaces as
3549 // a streaming error tail at EOF instead of silently
3550 // delivering corrupt plaintext to the client. The wrap
3551 // is a pure pass-through during the body — no extra
3552 // buffering, TTFB unaffected — and the integrity
3553 // decision lands at the last chunk.
3554 let decompressed_blob = cpu_zstd_decompress_stream(blob);
3555 let verified_reader = Crc32cVerifyingReader::new(
3556 blob_to_async_read(decompressed_blob),
3557 m.crc32c,
3558 m.original_size,
3559 );
3560 let verified_blob = async_read_to_blob(verified_reader);
3561 resp.output.content_length = Some(m.original_size as i64);
3562 resp.output.checksum_crc32 = None;
3563 resp.output.checksum_crc32c = None;
3564 resp.output.checksum_crc64nvme = None;
3565 resp.output.checksum_sha1 = None;
3566 resp.output.checksum_sha256 = None;
3567 resp.output.e_tag = None;
3568 resp.output.body = Some(verified_blob);
3569 let elapsed = get_start.elapsed();
3570 crate::metrics::record_get(
3571 m.codec.as_str(),
3572 m.compressed_size,
3573 m.original_size,
3574 elapsed.as_secs_f64(),
3575 true,
3576 );
3577 info!(
3578 op = "get_object",
3579 bucket = %get_bucket,
3580 key = %get_key,
3581 codec = m.codec.as_str(),
3582 bytes_in = m.compressed_size,
3583 bytes_out = m.original_size,
3584 path = "streaming",
3585 setup_latency_ms = elapsed.as_millis() as u64,
3586 "S4 get started (streaming)"
3587 );
3588 return Ok(resp);
3589 }
3590 // Passthrough: そのまま流す (Range なしの場合のみ streaming)
3591 if range_request.is_none()
3592 && !needs_frame_parse
3593 && let Some(ref m) = manifest_opt
3594 && m.codec == CodecKind::Passthrough
3595 {
3596 resp.output.content_length = Some(m.original_size as i64);
3597 resp.output.checksum_crc32 = None;
3598 resp.output.checksum_crc32c = None;
3599 resp.output.checksum_crc64nvme = None;
3600 resp.output.checksum_sha1 = None;
3601 resp.output.checksum_sha256 = None;
3602 resp.output.e_tag = None;
3603 resp.output.body = Some(blob);
3604 debug!("S4 get_object: passthrough streaming");
3605 return Ok(resp);
3606 }
3607
3608 // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
3609 let bytes = collect_blob(blob, self.max_body_bytes)
3610 .await
3611 .map_err(internal("collect get body"))?;
3612
3613 let decompressed = if needs_frame_parse {
3614 // multipart objects と framed-v2 single-PUT objects は同じ
3615 // S4F2 frame 列なので decompress_multipart で統一処理
3616 self.decompress_multipart(bytes).await?
3617 } else {
3618 let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
3619 self.registry
3620 .decompress(bytes, manifest)
3621 .await
3622 .map_err(internal("registry decompress"))?
3623 };
3624
3625 // Range request があれば slice。なければ full body を返す。
3626 let total_size = decompressed.len() as u64;
3627 let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
3628 let (start, end) = resolve_range(r, total_size)
3629 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
3630 let sliced = decompressed.slice(start as usize..end as usize);
3631 resp.output.content_range = Some(format!(
3632 "bytes {start}-{}/{total_size}",
3633 end.saturating_sub(1)
3634 ));
3635 (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
3636 } else {
3637 (decompressed, None)
3638 };
3639 // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
3640 // 圧縮 size のままだと downstream が body を途中で切ってしまう)
3641 resp.output.content_length = Some(final_bytes.len() as i64);
3642 // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
3643 // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
3644 // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
3645 // (manifest 内 / frame 内) で integrity を保証する設計にする。
3646 resp.output.checksum_crc32 = None;
3647 resp.output.checksum_crc32c = None;
3648 resp.output.checksum_crc64nvme = None;
3649 resp.output.checksum_sha1 = None;
3650 resp.output.checksum_sha256 = None;
3651 resp.output.e_tag = None;
3652 let returned_size = final_bytes.len() as u64;
3653 let codec_label = manifest_opt
3654 .as_ref()
3655 .map(|m| m.codec.as_str())
3656 .unwrap_or("multipart");
3657 resp.output.body = Some(bytes_to_blob(final_bytes));
3658 if let Some(status) = status_override {
3659 resp.status = Some(status);
3660 }
3661 let elapsed = get_start.elapsed();
3662 crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
3663 info!(
3664 op = "get_object",
3665 bucket = %get_bucket,
3666 key = %get_key,
3667 codec = codec_label,
3668 bytes_out = returned_size,
3669 total_object_size = total_size,
3670 range = range_request.is_some(),
3671 path = "buffered",
3672 latency_ms = elapsed.as_millis() as u64,
3673 "S4 get completed (buffered)"
3674 );
3675 }
3676 // v0.6 #40: echo the recorded `x-amz-replication-status` so
3677 // consumers can poll progress (PENDING / COMPLETED / FAILED).
3678 if let Some(mgr) = self.replication.as_ref()
3679 && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
3680 {
3681 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3682 status.as_aws_str().to_owned(),
3683 ));
3684 }
3685 Ok(resp)
3686 }
3687
3688 // === passthrough delegations ===
3689 async fn head_bucket(
3690 &self,
3691 req: S3Request<HeadBucketInput>,
3692 ) -> S3Result<S3Response<HeadBucketOutput>> {
3693 self.backend.head_bucket(req).await
3694 }
3695 async fn list_buckets(
3696 &self,
3697 req: S3Request<ListBucketsInput>,
3698 ) -> S3Result<S3Response<ListBucketsOutput>> {
3699 self.backend.list_buckets(req).await
3700 }
3701 async fn create_bucket(
3702 &self,
3703 req: S3Request<CreateBucketInput>,
3704 ) -> S3Result<S3Response<CreateBucketOutput>> {
3705 self.backend.create_bucket(req).await
3706 }
3707 async fn delete_bucket(
3708 &self,
3709 req: S3Request<DeleteBucketInput>,
3710 ) -> S3Result<S3Response<DeleteBucketOutput>> {
3711 self.backend.delete_bucket(req).await
3712 }
3713 async fn head_object(
3714 &self,
3715 req: S3Request<HeadObjectInput>,
3716 ) -> S3Result<S3Response<HeadObjectOutput>> {
3717 // v0.6 #40: capture bucket/key before req is consumed so the
3718 // replication-status echo can look the entry up.
3719 let head_bucket = req.input.bucket.clone();
3720 let head_key = req.input.key.clone();
3721 // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
3722 self.check_not_reserved_key(&head_key, ReservedKeyMode::Read)?;
3723 let mut resp = self.backend.head_object(req).await?;
3724 if let Some(manifest) = extract_manifest(&resp.output.metadata) {
3725 // 客側には decompress 後の意味のある content_length / checksum を返す。
3726 // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
3727 // (S4 は manifest 内の crc32c で integrity を担保する)。
3728 resp.output.content_length = Some(manifest.original_size as i64);
3729 resp.output.checksum_crc32 = None;
3730 resp.output.checksum_crc32c = None;
3731 resp.output.checksum_crc64nvme = None;
3732 resp.output.checksum_sha1 = None;
3733 resp.output.checksum_sha256 = None;
3734 resp.output.e_tag = None;
3735 }
3736 // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
3737 // / FAILED) so consumers can poll progress without a GET.
3738 if let Some(mgr) = self.replication.as_ref()
3739 && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
3740 {
3741 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
3742 status.as_aws_str().to_owned(),
3743 ));
3744 }
3745 // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
3746 // and pipelines see the same posture they got on PUT. The PUT
3747 // path stamps `s4-sse-type` metadata for exactly this — HEAD
3748 // doesn't fetch the body, so it can't peek frame magic.
3749 if let Some(meta) = resp.output.metadata.as_ref()
3750 && let Some(sse_type) = meta.get("s4-sse-type")
3751 {
3752 {
3753 match sse_type.as_str() {
3754 "aws:kms" => {
3755 resp.output.server_side_encryption = Some(
3756 ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
3757 );
3758 if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
3759 resp.output.ssekms_key_id = Some(key_id.clone());
3760 }
3761 }
3762 _ => {
3763 resp.output.server_side_encryption = Some(
3764 ServerSideEncryption::from_static(ServerSideEncryption::AES256),
3765 );
3766 if let Some(md5) = meta.get("s4-sse-c-key-md5") {
3767 resp.output.sse_customer_algorithm =
3768 Some(crate::sse::SSE_C_ALGORITHM.into());
3769 resp.output.sse_customer_key_md5 = Some(md5.clone());
3770 }
3771 }
3772 }
3773 }
3774 }
3775 Ok(resp)
3776 }
3777 async fn delete_object(
3778 &self,
3779 mut req: S3Request<DeleteObjectInput>,
3780 ) -> S3Result<S3Response<DeleteObjectOutput>> {
3781 let bucket = req.input.bucket.clone();
3782 let key = req.input.key.clone();
3783 // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
3784 // The S4 internal sidecar cleanup path
3785 // (`write_sidecar` and friends) talks to
3786 // `self.backend.delete_object(...)` directly, NOT through
3787 // this trait method, so the guard doesn't break
3788 // legitimate sidecar cleanup.
3789 self.check_not_reserved_key(&key, ReservedKeyMode::Mutating)?;
3790 self.enforce_rate_limit(&req, &bucket)?;
3791 self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
3792 // v0.6 #42: MFA Delete enforcement. When the bucket has
3793 // MFA-Delete = Enabled, every DELETE / DELETE-version /
3794 // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
3795 // 6-digit TOTP). Runs *before* the WORM / versioning routers so
3796 // a missing token is denied for free regardless of which delete
3797 // path the request would otherwise take.
3798 if let Some(mgr) = self.mfa_delete.as_ref()
3799 && mgr.is_enabled(&bucket)
3800 {
3801 let header = req.input.mfa.as_deref();
3802 if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
3803 crate::metrics::record_mfa_delete_denial(&bucket);
3804 return Err(mfa_error_to_s3(e));
3805 }
3806 }
3807 // v0.5 #30: refuse the delete while a WORM lock is in effect.
3808 // Compliance can never be bypassed; Governance can be overridden
3809 // via `x-amz-bypass-governance-retention: true`; legal hold
3810 // never. The check happens before the versioning router so a
3811 // locked object can't be soft-deleted (delete-marker push) on an
3812 // Enabled bucket either — S3 spec says lock applies to all
3813 // delete forms.
3814 if let Some(mgr) = self.object_lock.as_ref()
3815 && let Some(state) = mgr.get(&bucket, &key)
3816 {
3817 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
3818 // v0.8.12 HIGH-7 fix: the bypass header alone used to be
3819 // enough to override Governance retention. AWS spec
3820 // requires the caller hold `s3:BypassGovernanceRetention`
3821 // for the target ARN; without that, the header is
3822 // silently ignored (not an error — it lines up with how
3823 // AWS' canonical behaviour treats unprivileged callers).
3824 let bypass_allowed = if bypass_header {
3825 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
3826 .is_ok()
3827 } else {
3828 false
3829 };
3830 let now = chrono::Utc::now();
3831 if !state.can_delete(now, bypass_allowed) {
3832 crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
3833 return Err(S3Error::with_message(
3834 S3ErrorCode::AccessDenied,
3835 "Access Denied because object protected by object lock",
3836 ));
3837 }
3838 }
3839 // v0.5 #34: route DELETE through the VersioningManager when the
3840 // bucket is in a versioning-aware state.
3841 //
3842 // - Enabled bucket, no version_id → push a delete marker into
3843 // the chain. NO backend object is touched (older versions
3844 // stay reachable via specific-version GET).
3845 // - Enabled / Suspended bucket, with version_id → physical
3846 // delete. Backend bytes at the shadow key (or `<key>` for
3847 // `null`) are removed; chain entry is dropped. If the deleted
3848 // entry was a delete marker, no backend bytes exist for it
3849 // (record-only).
3850 // - Suspended bucket, no version_id → push a "null" delete
3851 // marker (S3 spec); backend bytes at `<key>` are physically
3852 // removed (same as legacy).
3853 // - Unversioned bucket → fall through to legacy passthrough.
3854 if let Some(mgr) = self.versioning.as_ref() {
3855 let state = mgr.state(&bucket);
3856 if state != crate::versioning::VersioningState::Unversioned {
3857 let req_vid = req.input.version_id.take();
3858 if let Some(vid) = req_vid {
3859 // Specific-version DELETE: touch backend bytes only
3860 // when the entry was a real version (not a delete
3861 // marker, which has no backend bytes).
3862 let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
3863 let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
3864 key.clone()
3865 } else {
3866 versioned_shadow_key(&key, &vid)
3867 };
3868 let was_real_version = outcome
3869 .as_ref()
3870 .map(|o| !o.is_delete_marker)
3871 .unwrap_or(false);
3872 if was_real_version {
3873 // Best-effort backend cleanup; missing bytes
3874 // are not an error (e.g. shadow key already
3875 // GC'd).
3876 let backend_input = DeleteObjectInput {
3877 bucket: bucket.clone(),
3878 key: backend_target,
3879 ..Default::default()
3880 };
3881 let backend_req = S3Request {
3882 input: backend_input,
3883 method: http::Method::DELETE,
3884 uri: req.uri.clone(),
3885 headers: req.headers.clone(),
3886 extensions: http::Extensions::new(),
3887 credentials: req.credentials.clone(),
3888 region: req.region.clone(),
3889 service: req.service.clone(),
3890 trailing_headers: None,
3891 };
3892 let _ = self.backend.delete_object(backend_req).await;
3893 }
3894 let mut output = DeleteObjectOutput {
3895 version_id: Some(vid.clone()),
3896 ..Default::default()
3897 };
3898 if let Some(o) = outcome.as_ref()
3899 && o.is_delete_marker
3900 {
3901 output.delete_marker = Some(true);
3902 }
3903 // v0.6 #35: specific-version DELETE always counts as
3904 // a hard `ObjectRemoved:Delete` event (the chain
3905 // entry, marker or not, is gone after this call).
3906 self.fire_delete_notification(
3907 &bucket,
3908 &key,
3909 crate::notifications::EventType::ObjectRemovedDelete,
3910 Some(vid.clone()),
3911 );
3912 return Ok(S3Response::new(output));
3913 }
3914 // No version_id: record a delete marker (state-aware).
3915 let outcome = mgr.record_delete(&bucket, &key);
3916 if state == crate::versioning::VersioningState::Suspended {
3917 // Suspended buckets also evict the prior `<key>`
3918 // bytes (the previous null version is gone too).
3919 let backend_input = DeleteObjectInput {
3920 bucket: bucket.clone(),
3921 key: key.clone(),
3922 ..Default::default()
3923 };
3924 let backend_req = S3Request {
3925 input: backend_input,
3926 method: http::Method::DELETE,
3927 uri: req.uri.clone(),
3928 headers: req.headers.clone(),
3929 extensions: http::Extensions::new(),
3930 credentials: req.credentials.clone(),
3931 region: req.region.clone(),
3932 service: req.service.clone(),
3933 trailing_headers: None,
3934 };
3935 let _ = self.backend.delete_object(backend_req).await;
3936 }
3937 let output = DeleteObjectOutput {
3938 delete_marker: Some(true),
3939 version_id: outcome.version_id.clone(),
3940 ..Default::default()
3941 };
3942 // v0.6 #35: versioned bucket DELETE without a version-id
3943 // creates a delete marker — the dedicated AWS event
3944 // taxonomy entry. Suspended-state buckets also push a
3945 // (null) marker, so the same event fires there.
3946 self.fire_delete_notification(
3947 &bucket,
3948 &key,
3949 crate::notifications::EventType::ObjectRemovedDeleteMarker,
3950 outcome.version_id,
3951 );
3952 return Ok(S3Response::new(output));
3953 }
3954 }
3955 // Legacy / Unversioned path: physical delete on the backend +
3956 // best-effort sidecar cleanup (mirrors v0.4 behaviour).
3957 let resp = self.backend.delete_object(req).await?;
3958 // v0.5 #30: drop any per-object lock state once the delete has
3959 // succeeded so the freed key can be re-armed by a future PUT
3960 // under the bucket default. Reaching here implies the lock had
3961 // already passed `can_delete` above, so this is purely cleanup.
3962 if let Some(mgr) = self.object_lock.as_ref() {
3963 mgr.clear(&bucket, &key);
3964 }
3965 // v0.6 #39: drop any object-level tag set on physical delete —
3966 // the freed key starts a fresh tag history if a future PUT
3967 // re-creates it. (Versioned-delete branches above return early
3968 // and do NOT touch tags, mirroring AWS where tag state is
3969 // attached to the logical key, not the version chain.)
3970 if let Some(mgr) = self.tagging.as_ref() {
3971 mgr.delete_object_tags(&bucket, &key);
3972 }
3973 let sidecar = sidecar_key(&key);
3974 // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
3975 // can't be encoded into a request URI — the primary delete
3976 // already succeeded and a stale sidecar is harmless (Range GET
3977 // re-validates the underlying object on next read).
3978 if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
3979 let sidecar_input = DeleteObjectInput {
3980 bucket: bucket.clone(),
3981 key: sidecar,
3982 ..Default::default()
3983 };
3984 let sidecar_req = S3Request {
3985 input: sidecar_input,
3986 method: http::Method::DELETE,
3987 uri,
3988 headers: http::HeaderMap::new(),
3989 extensions: http::Extensions::new(),
3990 credentials: None,
3991 region: None,
3992 service: None,
3993 trailing_headers: None,
3994 };
3995 let _ = self.backend.delete_object(sidecar_req).await;
3996 }
3997 // v0.6 #35: legacy unversioned-bucket hard delete fires the
3998 // canonical `ObjectRemoved:Delete` event.
3999 self.fire_delete_notification(
4000 &bucket,
4001 &key,
4002 crate::notifications::EventType::ObjectRemovedDelete,
4003 None,
4004 );
4005 Ok(resp)
4006 }
4007 async fn delete_objects(
4008 &self,
4009 req: S3Request<DeleteObjectsInput>,
4010 ) -> S3Result<S3Response<DeleteObjectsOutput>> {
4011 // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
4012 // when MFA-Delete is on the bucket, a missing / invalid token
4013 // fails the entire DeleteObjects request, not per-object).
4014 if let Some(mgr) = self.mfa_delete.as_ref()
4015 && mgr.is_enabled(&req.input.bucket)
4016 {
4017 let header = req.input.mfa.as_deref();
4018 if let Err(e) =
4019 crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
4020 {
4021 crate::metrics::record_mfa_delete_denial(&req.input.bucket);
4022 return Err(mfa_error_to_s3(e));
4023 }
4024 }
4025 // v0.8.11 CRIT-3 fix: route every entry through the gated
4026 // per-object `delete_object` path so Object Lock, IAM policy,
4027 // versioning, tagging, sidecar cleanup and notification fan-
4028 // out all fire for batch DELETE. The previous
4029 // `self.backend.delete_objects(req).await` straight-through
4030 // bypassed every gate, so a `legal_hold=on` key listed inside
4031 // a DeleteObjects XML was happily removed.
4032 //
4033 // S3 spec note: DeleteObjects is "best-effort per object" —
4034 // a failure on one key surfaces as an `Errors` entry without
4035 // aborting the rest of the batch. Quiet-mode suppresses the
4036 // `Deleted` list (errors are still reported). We honour both.
4037 let bucket = req.input.bucket.clone();
4038 let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
4039 let mfa_header = req.input.mfa.clone();
4040 let quiet = req.input.delete.quiet.unwrap_or(false);
4041 let mut deleted: Vec<DeletedObject> = Vec::new();
4042 let mut errors: Vec<s3s::dto::Error> = Vec::new();
4043 for ident in req.input.delete.objects.iter() {
4044 let key = ident.key.clone();
4045 let version_id = ident.version_id.clone();
4046 let per_input = DeleteObjectInput {
4047 bucket: bucket.clone(),
4048 key: key.clone(),
4049 version_id: version_id.clone(),
4050 bypass_governance_retention: Some(bypass_governance),
4051 mfa: mfa_header.clone(),
4052 ..Default::default()
4053 };
4054 let per_uri = match safe_object_uri(&bucket, &key) {
4055 Ok(u) => u,
4056 Err(_) => {
4057 errors.push(s3s::dto::Error {
4058 code: Some("InvalidArgument".to_owned()),
4059 key: Some(key),
4060 message: Some("object key is not URI-encodable".to_owned()),
4061 version_id,
4062 });
4063 continue;
4064 }
4065 };
4066 let per_req = S3Request {
4067 input: per_input,
4068 method: http::Method::DELETE,
4069 uri: per_uri,
4070 headers: req.headers.clone(),
4071 extensions: http::Extensions::new(),
4072 credentials: req.credentials.clone(),
4073 region: req.region.clone(),
4074 service: req.service.clone(),
4075 trailing_headers: None,
4076 };
4077 match self.delete_object(per_req).await {
4078 Ok(resp) => {
4079 let out = resp.output;
4080 // DeleteObjectOutput doesn't surface a separate
4081 // `delete_marker_version_id`; the marker's version
4082 // id is whatever `version_id` carries (when the
4083 // versioning manager pushed a delete-marker, that
4084 // field already holds the marker's vid).
4085 let vid = out.version_id.clone().or(version_id);
4086 deleted.push(DeletedObject {
4087 key: Some(key),
4088 version_id: vid.clone(),
4089 delete_marker: out.delete_marker,
4090 delete_marker_version_id: vid,
4091 });
4092 }
4093 Err(e) => {
4094 let code_str = e.code().as_str().to_owned();
4095 let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
4096 errors.push(s3s::dto::Error {
4097 code: Some(code_str),
4098 key: Some(key),
4099 message: Some(msg),
4100 version_id,
4101 });
4102 }
4103 }
4104 }
4105 let output = DeleteObjectsOutput {
4106 deleted: if quiet || deleted.is_empty() {
4107 None
4108 } else {
4109 Some(deleted)
4110 },
4111 errors: if errors.is_empty() {
4112 None
4113 } else {
4114 Some(errors)
4115 },
4116 ..Default::default()
4117 };
4118 Ok(S3Response::new(output))
4119 }
4120 async fn copy_object(
4121 &self,
4122 mut req: S3Request<CopyObjectInput>,
4123 ) -> S3Result<S3Response<CopyObjectOutput>> {
4124 // copy is conceptually "GetObject src + PutObject dst" — enforce both.
4125 let dst_bucket = req.input.bucket.clone();
4126 let dst_key = req.input.key.clone();
4127 // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
4128 self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
4129 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
4130 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4131 // v0.8.17 G-2: source `<key>.s4index` would let
4132 // CopyObject expose the raw sidecar (frame layout +
4133 // source ETag) into a writable destination, bypassing
4134 // the F-13 GET reject. Same guard, Read mode (returns
4135 // NoSuchKey to match listing semantics).
4136 self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
4137 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
4138 }
4139 // S4-aware copy: source object に s4-* metadata がある場合、それを
4140 // destination に確実に preserve する。
4141 //
4142 // - MetadataDirective::COPY (default): backend が source metadata を
4143 // そのまま copy するので S4 metadata も自動で渡る。介入不要
4144 // - MetadataDirective::REPLACE: 客が指定した metadata で source を
4145 // 上書き → s4-* metadata が消えると destination は decompress 不能に
4146 // なる (silent corruption)。S4 が source metadata を HEAD で取得し、
4147 // s4-* fields を input.metadata に強制 merge する
4148 let needs_merge = req
4149 .input
4150 .metadata_directive
4151 .as_ref()
4152 .map(|d| d.as_str() == MetadataDirective::REPLACE)
4153 .unwrap_or(false);
4154 if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4155 // v0.8.16 F-8: strip the client-supplied `s4-*` keys
4156 // *unconditionally* — the v0.8.15 M-2 fix only ran the
4157 // strip inside the `if let Ok(head) = ...` block, so a
4158 // backend HEAD failure (transient 5xx, NoSuchKey on a
4159 // racing delete) left attacker-injected `s4-*` /
4160 // `S4-*` metadata intact on the destination. Now we
4161 // strip first, then re-populate from the source HEAD
4162 // when available — HEAD failure simply means the
4163 // destination loses the codec markers (correct: a
4164 // CopyObject without the source's codec metadata
4165 // produces an unreadable object, but doesn't allow
4166 // injection).
4167 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4168 dest_meta.retain(|k, _| !k.to_ascii_lowercase().starts_with("s4-"));
4169 let head_input = HeadObjectInput {
4170 bucket: bucket.to_string(),
4171 key: key.to_string(),
4172 ..Default::default()
4173 };
4174 let head_req = S3Request {
4175 input: head_input,
4176 method: req.method.clone(),
4177 uri: req.uri.clone(),
4178 headers: req.headers.clone(),
4179 extensions: http::Extensions::new(),
4180 credentials: req.credentials.clone(),
4181 region: req.region.clone(),
4182 service: req.service.clone(),
4183 trailing_headers: None,
4184 };
4185 if let Ok(head) = self.backend.head_object(head_req).await
4186 && let Some(src_meta) = head.output.metadata.as_ref()
4187 {
4188 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4189 for key in [
4190 META_CODEC,
4191 META_ORIGINAL_SIZE,
4192 META_COMPRESSED_SIZE,
4193 META_CRC32C,
4194 META_MULTIPART,
4195 META_FRAMED,
4196 ] {
4197 if let Some(v) = src_meta.get(key) {
4198 dest_meta.insert(key.to_string(), v.clone());
4199 }
4200 }
4201 // SSE markers are equally reserved — propagate any
4202 // source flags so a copy of an encrypted object stays
4203 // marked as encrypted at the destination.
4204 for sse_key in [
4205 "s4-encrypted",
4206 "s4-sse-type",
4207 "s4-sse-c-key-md5",
4208 "s4-sse-kms-key-id",
4209 ] {
4210 if let Some(v) = src_meta.get(sse_key) {
4211 dest_meta.insert(sse_key.to_string(), v.clone());
4212 }
4213 }
4214 debug!(
4215 src_bucket = %bucket,
4216 src_key = %key,
4217 "S4 copy_object: replaced client s4-* metadata with source values across REPLACE directive (v0.8.15 M-2)"
4218 );
4219 }
4220 }
4221 self.backend.copy_object(req).await
4222 }
4223 async fn list_objects(
4224 &self,
4225 req: S3Request<ListObjectsInput>,
4226 ) -> S3Result<S3Response<ListObjectsOutput>> {
4227 self.enforce_rate_limit(&req, &req.input.bucket)?;
4228 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4229 let mut resp = self.backend.list_objects(req).await?;
4230 // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4231 // — v0.5 #34) を顧客から隠す。
4232 if let Some(contents) = resp.output.contents.as_mut() {
4233 contents.retain(|o| {
4234 o.key
4235 .as_ref()
4236 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4237 .unwrap_or(true)
4238 });
4239 }
4240 Ok(resp)
4241 }
4242 async fn list_objects_v2(
4243 &self,
4244 req: S3Request<ListObjectsV2Input>,
4245 ) -> S3Result<S3Response<ListObjectsV2Output>> {
4246 self.enforce_rate_limit(&req, &req.input.bucket)?;
4247 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4248 let mut resp = self.backend.list_objects_v2(req).await?;
4249 if let Some(contents) = resp.output.contents.as_mut() {
4250 let before = contents.len();
4251 contents.retain(|o| {
4252 o.key
4253 .as_ref()
4254 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4255 .unwrap_or(true)
4256 });
4257 // key_count も補正 (S3 spec compliance)
4258 if let Some(kc) = resp.output.key_count.as_mut() {
4259 *kc -= (before - contents.len()) as i32;
4260 }
4261 }
4262 Ok(resp)
4263 }
4264 /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4265 /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4266 /// attached AND the bucket is in a versioning-aware state, build
4267 /// the `Versions` / `DeleteMarkers` arrays directly from the
4268 /// in-memory chain (paginated + ordered the S3 way: key asc,
4269 /// version newest-first inside each key). Otherwise fall back to
4270 /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4271 async fn list_object_versions(
4272 &self,
4273 req: S3Request<ListObjectVersionsInput>,
4274 ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4275 self.enforce_rate_limit(&req, &req.input.bucket)?;
4276 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4277 // v0.5 #34: VersioningManager-owned path.
4278 if let Some(mgr) = self.versioning.as_ref()
4279 && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4280 {
4281 let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4282 let page = mgr.list_versions(
4283 &req.input.bucket,
4284 req.input.prefix.as_deref(),
4285 req.input.key_marker.as_deref(),
4286 req.input.version_id_marker.as_deref(),
4287 max_keys,
4288 );
4289 let versions: Vec<ObjectVersion> = page
4290 .versions
4291 .into_iter()
4292 .map(|e| ObjectVersion {
4293 key: Some(e.key),
4294 version_id: Some(e.version_id),
4295 is_latest: Some(e.is_latest),
4296 e_tag: Some(ETag::Strong(e.etag)),
4297 size: Some(e.size as i64),
4298 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4299 ..Default::default()
4300 })
4301 .collect();
4302 let delete_markers: Vec<DeleteMarkerEntry> = page
4303 .delete_markers
4304 .into_iter()
4305 .map(|e| DeleteMarkerEntry {
4306 key: Some(e.key),
4307 version_id: Some(e.version_id),
4308 is_latest: Some(e.is_latest),
4309 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4310 ..Default::default()
4311 })
4312 .collect();
4313 let output = ListObjectVersionsOutput {
4314 name: Some(req.input.bucket.clone()),
4315 prefix: req.input.prefix.clone(),
4316 key_marker: req.input.key_marker.clone(),
4317 version_id_marker: req.input.version_id_marker.clone(),
4318 max_keys: req.input.max_keys,
4319 versions: if versions.is_empty() {
4320 None
4321 } else {
4322 Some(versions)
4323 },
4324 delete_markers: if delete_markers.is_empty() {
4325 None
4326 } else {
4327 Some(delete_markers)
4328 },
4329 is_truncated: Some(page.is_truncated),
4330 next_key_marker: page.next_key_marker,
4331 next_version_id_marker: page.next_version_id_marker,
4332 ..Default::default()
4333 };
4334 return Ok(S3Response::new(output));
4335 }
4336 // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4337 let mut resp = self.backend.list_object_versions(req).await?;
4338 if let Some(versions) = resp.output.versions.as_mut() {
4339 versions.retain(|v| {
4340 v.key
4341 .as_ref()
4342 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4343 .unwrap_or(true)
4344 });
4345 }
4346 if let Some(markers) = resp.output.delete_markers.as_mut() {
4347 markers.retain(|m| {
4348 m.key
4349 .as_ref()
4350 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4351 .unwrap_or(true)
4352 });
4353 }
4354 Ok(resp)
4355 }
4356
4357 async fn create_multipart_upload(
4358 &self,
4359 mut req: S3Request<CreateMultipartUploadInput>,
4360 ) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
4361 // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
4362 // the destination is conceptually about to host a new object,
4363 // matching what `put_object` enforces L2078. Without this, a
4364 // bucket policy denying `s3:PutObject` was bypassable simply
4365 // by switching the client to the multipart wire path.
4366 let mp_bucket = req.input.bucket.clone();
4367 let mp_key = req.input.key.clone();
4368 // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
4369 self.check_not_reserved_key(&mp_key, ReservedKeyMode::Mutating)?;
4370 self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
4371 self.enforce_rate_limit(&req, &mp_bucket)?;
4372 // Multipart object は per-part 圧縮 + frame 形式で書く。GET 時に
4373 // frame parse を起動するため、object metadata に flag を立てる。
4374 // codec は dispatcher の default kind を採用 (per-part 別 codec は Phase 2)。
4375 let codec_kind = self.registry.default_kind();
4376 let meta = req.input.metadata.get_or_insert_with(Default::default);
4377 meta.insert(META_MULTIPART.into(), "true".into());
4378 meta.insert(META_CODEC.into(), codec_kind.as_str().into());
4379 // v0.8 #54 BUG-10 fix: take() the SSE request fields off
4380 // `req.input` so they are NOT forwarded to the backend on
4381 // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
4382 // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
4383 // SSE-KMS with "KMS not configured" when the headers reach it.
4384 // S4 owns the encrypt-then-store contract; we capture the
4385 // recipe in `multipart_state` here and apply it on Complete.
4386 let sse_c_alg = req.input.sse_customer_algorithm.take();
4387 let sse_c_key = req.input.sse_customer_key.take();
4388 let sse_c_md5 = req.input.sse_customer_key_md5.take();
4389 let sse_header = req.input.server_side_encryption.take();
4390 let sse_kms_key = req.input.ssekms_key_id.take();
4391 // Strip the encryption-context too — leaving it would make
4392 // MinIO try to validate it against a non-existent KMS key.
4393 let _ = req.input.ssekms_encryption_context.take();
4394 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
4395 let kms_key_id = extract_kms_key_id(
4396 &sse_header,
4397 &sse_kms_key,
4398 self.kms_default_key_id.as_deref(),
4399 );
4400 // SSE-C / SSE-KMS exclusivity (mirrors put_object L1870).
4401 if sse_c_material.is_some() && kms_key_id.is_some() {
4402 return Err(S3Error::with_message(
4403 S3ErrorCode::InvalidArgument,
4404 "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
4405 ));
4406 }
4407 let sse_mode = if let Some(ref m) = sse_c_material {
4408 // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
4409 // 32-byte key in `Zeroizing` so abandoned uploads (or
4410 // normal Complete/Abort) wipe the key bytes on drop. The
4411 // `key_md5` is the public fingerprint and stays as a
4412 // bare `[u8; 16]`.
4413 crate::multipart_state::MultipartSseMode::SseC {
4414 key: zeroize::Zeroizing::new(m.key),
4415 key_md5: m.key_md5,
4416 }
4417 } else if let Some(ref kid) = kms_key_id {
4418 // KMS pre-flight: fail at Create rather than at Complete if
4419 // the gateway has no KMS backend wired (mirrors the
4420 // put_object L1879 check).
4421 if self.kms.is_none() {
4422 return Err(S3Error::with_message(
4423 S3ErrorCode::InvalidRequest,
4424 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4425 ));
4426 }
4427 crate::multipart_state::MultipartSseMode::SseKms {
4428 key_id: kid.clone(),
4429 }
4430 } else if self.sse_keyring.is_some() {
4431 // SSE-S4: server-driven transparent encryption. Activates
4432 // whenever the gateway has a keyring configured AND the
4433 // client didn't pick a different SSE mode.
4434 crate::multipart_state::MultipartSseMode::SseS4
4435 } else {
4436 crate::multipart_state::MultipartSseMode::None
4437 };
4438 // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
4439 // single-PUT path does this on PutObject; the multipart path
4440 // captures it now and commits via TagManager on Complete.
4441 let request_tags: Option<crate::tagging::TagSet> = req
4442 .input
4443 .tagging
4444 .as_deref()
4445 .map(crate::tagging::parse_tagging_header)
4446 .transpose()
4447 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
4448 // Strip the `Tagging` field off the input so the backend
4449 // doesn't try to apply it (no-op on MinIO but keeps the wire
4450 // clean).
4451 let _ = req.input.tagging.take();
4452 // Object Lock recipe (BUG-7 — captured here, applied on Complete).
4453 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
4454 .input
4455 .object_lock_mode
4456 .as_ref()
4457 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
4458 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
4459 .input
4460 .object_lock_retain_until_date
4461 .as_ref()
4462 .and_then(timestamp_to_chrono_utc);
4463 let explicit_legal_hold_on: bool = req
4464 .input
4465 .object_lock_legal_hold_status
4466 .as_ref()
4467 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
4468 .unwrap_or(false);
4469 let bucket = req.input.bucket.clone();
4470 let key = req.input.key.clone();
4471 debug!(
4472 bucket = %bucket,
4473 key = %key,
4474 codec = codec_kind.as_str(),
4475 sse = ?sse_mode,
4476 "S4 create_multipart_upload: marking object for per-part compression"
4477 );
4478 let mut resp = self.backend.create_multipart_upload(req).await?;
4479 // Stash the per-upload context only after the backend handed
4480 // us an upload_id (failed Creates leave nothing in the store).
4481 if let Some(upload_id) = resp.output.upload_id.as_ref() {
4482 self.multipart_state.put(
4483 upload_id,
4484 crate::multipart_state::MultipartUploadContext {
4485 bucket,
4486 key,
4487 sse: sse_mode.clone(),
4488 tags: request_tags,
4489 object_lock_mode: explicit_lock_mode,
4490 object_lock_retain_until: explicit_retain_until,
4491 object_lock_legal_hold: explicit_legal_hold_on,
4492 },
4493 );
4494 }
4495 // SSE-C / SSE-KMS response echo (mirrors put_object L2036-L2050).
4496 match &sse_mode {
4497 crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
4498 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
4499 resp.output.sse_customer_key_md5 =
4500 Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
4501 }
4502 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
4503 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
4504 ServerSideEncryption::AWS_KMS,
4505 ));
4506 resp.output.ssekms_key_id = Some(key_id.clone());
4507 }
4508 _ => {}
4509 }
4510 Ok(resp)
4511 }
4512
4513 async fn upload_part(
4514 &self,
4515 mut req: S3Request<UploadPartInput>,
4516 ) -> S3Result<S3Response<UploadPartOutput>> {
4517 // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
4518 // `put_object` / `create_multipart_upload`. Even though
4519 // Create already passed the gate, a bucket policy that
4520 // *revokes* `s3:PutObject` mid-flight should stop further
4521 // parts (e.g. legal hold drops, retention shortened).
4522 let part_bucket = req.input.bucket.clone();
4523 let part_key = req.input.key.clone();
4524 self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
4525 self.enforce_rate_limit(&req, &part_bucket)?;
4526 // 各 part を圧縮して frame header 付きで forward。GET 時に
4527 // `decompress_multipart` が frame iter で順に解凍する。
4528 // **per-part codec dispatch**: dispatcher が body 先頭 sample から
4529 // codec を選ぶので、parquet 風の mixed-content multipart で part ごとに
4530 // 最適 codec を使える (整数列 part → Bitcomp、text 列 part → zstd 等)。
4531 //
4532 // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
4533 // context captured by `create_multipart_upload` and (a) strip
4534 // any SSE-C request headers off `req.input` so the backend
4535 // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
4536 // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
4537 // observe that an upload context exists for `upload_id`. The
4538 // actual encrypt happens once at `complete_multipart_upload`
4539 // time on the assembled body (the per-part-encrypt approach
4540 // would require a matching multi-segment decrypt path on GET;
4541 // encrypting the whole assembled body keeps the GET path's
4542 // `is_sse_encrypted` branch in get_object L2429 working
4543 // unchanged).
4544 let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
4545 // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
4546 // The AWS S3 spec requires the same SSE-C key headers on
4547 // every UploadPart and rejects mismatches with 400. Prior to
4548 // #62 we silently stripped the headers (BUG-10 fix) without
4549 // validating them, allowing a client to send part 1 under
4550 // key-A and part 2 under key-B; both got stored, then
4551 // re-encrypted with key-A on Complete — the client thinks
4552 // part 2 is under key-B but a GET with key-B would in fact
4553 // hit the part-1 ciphertext that was actually encrypted with
4554 // key-A. That would either decrypt successfully (silent
4555 // corruption: client lost track of which key encrypts what)
4556 // or fail in a confusing way. Validate the per-part headers
4557 // now and reject with 400 InvalidArgument on mismatch /
4558 // omission / partial supply, matching real-S3 behaviour.
4559 if let Some(ref ctx) = sse_ctx {
4560 if let crate::multipart_state::MultipartSseMode::SseC {
4561 key_md5: ctx_md5, ..
4562 } = &ctx.sse
4563 {
4564 let alg = req.input.sse_customer_algorithm.take();
4565 let key_b64 = req.input.sse_customer_key.take();
4566 let md5_b64 = req.input.sse_customer_key_md5.take();
4567 match (alg, key_b64, md5_b64) {
4568 (Some(a), Some(k), Some(m)) => {
4569 // Parse + validate; if the per-part headers
4570 // are themselves malformed (algorithm not
4571 // AES256, MD5 mismatch, key not 32 bytes)
4572 // surface the same 400 the single-PUT path
4573 // would. Then compare the parsed MD5 to the
4574 // upload-context's MD5; mismatch is a
4575 // different-key UploadPart and must reject.
4576 let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
4577 .map_err(sse_c_error_to_s3)?;
4578 if part_material.key_md5 != *ctx_md5 {
4579 return Err(S3Error::with_message(
4580 S3ErrorCode::InvalidArgument,
4581 "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
4582 ));
4583 }
4584 // OK — same key as Create. Headers are
4585 // already taken off `req.input` so the
4586 // backend never sees them.
4587 }
4588 (None, None, None) => {
4589 // AWS S3 spec: SSE-C headers MUST be replayed
4590 // on every UploadPart of an SSE-C multipart.
4591 // Real-S3 returns 400 InvalidRequest in this
4592 // case; mirror that.
4593 return Err(S3Error::with_message(
4594 S3ErrorCode::InvalidRequest,
4595 "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
4596 ));
4597 }
4598 _ => {
4599 // Partial header set (e.g. algorithm + key
4600 // but no MD5) — same handling as the
4601 // single-PUT `extract_sse_c_material` helper.
4602 return Err(S3Error::with_message(
4603 S3ErrorCode::InvalidRequest,
4604 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
4605 ));
4606 }
4607 }
4608 } else {
4609 // CreateMultipartUpload was non-SSE-C (None / SseS4 /
4610 // SseKms). A part that arrives carrying SSE-C headers
4611 // is either a confused client or an attempt to
4612 // smuggle SSE-C around the gateway-internal SSE
4613 // recipe. Reject with 400 InvalidRequest rather than
4614 // silently strip — the strip would let the client
4615 // believe the part was encrypted under their key
4616 // when in fact the upload's encryption recipe is
4617 // whatever the Create captured.
4618 if req.input.sse_customer_algorithm.is_some()
4619 || req.input.sse_customer_key.is_some()
4620 || req.input.sse_customer_key_md5.is_some()
4621 {
4622 return Err(S3Error::with_message(
4623 S3ErrorCode::InvalidRequest,
4624 "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
4625 ));
4626 }
4627 }
4628 } else {
4629 // No upload context registered (gateway crashed between
4630 // Create and Part, or pre-#62 abandoned-upload restore).
4631 // We can't check key consistency in this case — strip
4632 // the headers and let the request through unchanged so
4633 // the backend's `NoSuchUpload` reply (or whatever it
4634 // chooses to do) flows back to the client.
4635 let _ = req.input.sse_customer_algorithm.take();
4636 let _ = req.input.sse_customer_key.take();
4637 let _ = req.input.sse_customer_key_md5.take();
4638 }
4639 let _sse_ctx = sse_ctx;
4640 if let Some(blob) = req.input.body.take() {
4641 let bytes = collect_blob(blob, self.max_body_bytes)
4642 .await
4643 .map_err(internal("collect upload_part body"))?;
4644 // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
4645 // checksum algorithms against the received part body.
4646 verify_client_body_checksums(
4647 &bytes,
4648 req.input.content_md5.as_deref(),
4649 req.input.checksum_crc32.as_deref(),
4650 req.input.checksum_crc32c.as_deref(),
4651 req.input.checksum_sha1.as_deref(),
4652 req.input.checksum_sha256.as_deref(),
4653 req.input.checksum_crc64nvme.as_deref(),
4654 )?;
4655 let sample_len = bytes.len().min(SAMPLE_BYTES);
4656 // v0.8 #56: full part body is already in memory here; use its
4657 // length as the size hint so the dispatcher can promote to GPU
4658 // if it's big enough.
4659 let codec_kind = self
4660 .dispatcher
4661 .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
4662 .await;
4663 let original_size = bytes.len() as u64;
4664 // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
4665 let (compress_res, tel) = self
4666 .registry
4667 .compress_with_telemetry(bytes, codec_kind)
4668 .await;
4669 stamp_gpu_compress_telemetry(&tel);
4670 let (compressed, manifest) =
4671 compress_res.map_err(internal("registry compress part"))?;
4672 let header = FrameHeader {
4673 codec: codec_kind,
4674 original_size,
4675 compressed_size: compressed.len() as u64,
4676 crc32c: manifest.crc32c,
4677 };
4678 let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
4679 write_frame(&mut framed, header, &compressed);
4680 // v0.2 #5: heuristic-based padding skip for likely-final parts.
4681 //
4682 // AWS SDK / aws-cli / boto3 always send the final (and only the
4683 // final) part below the configured part_size. So if the raw user
4684 // part is already smaller than S3's 5 MiB multipart minimum, this
4685 // is overwhelmingly likely to be the final part — and the final
4686 // part is exempt from S3's size constraint. Skipping padding here
4687 // saves up to ~5 MiB per object on highly compressible workloads.
4688 //
4689 // If a misbehaving client sends a tiny **non-final** part, S3
4690 // itself rejects with EntityTooSmall at CompleteMultipartUpload —
4691 // identical outcome to a vanilla S3 PUT, just earlier than
4692 // padding-then-complete would catch it.
4693 let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
4694 if !likely_final {
4695 pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
4696 }
4697 let framed_bytes = framed.freeze();
4698 let new_len = framed_bytes.len() as i64;
4699 // 同じ wire 互換問題が multipart にもある (content-length / checksum)
4700 req.input.content_length = Some(new_len);
4701 req.input.checksum_algorithm = None;
4702 req.input.checksum_crc32 = None;
4703 req.input.checksum_crc32c = None;
4704 req.input.checksum_crc64nvme = None;
4705 req.input.checksum_sha1 = None;
4706 req.input.checksum_sha256 = None;
4707 req.input.content_md5 = None;
4708 req.input.body = Some(bytes_to_blob(framed_bytes));
4709 debug!(
4710 part_number = ?req.input.part_number,
4711 upload_id = ?req.input.upload_id,
4712 original_size,
4713 framed_size = new_len,
4714 "S4 upload_part: framed compressed payload"
4715 );
4716 }
4717 self.backend.upload_part(req).await
4718 }
4719 async fn complete_multipart_upload(
4720 &self,
4721 mut req: S3Request<CompleteMultipartUploadInput>,
4722 ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
4723 let bucket = req.input.bucket.clone();
4724 let key = req.input.key.clone();
4725 let upload_id = req.input.upload_id.clone();
4726 // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
4727 // commit point for the multipart-assembled object).
4728 self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
4729 self.enforce_rate_limit(&req, &bucket)?;
4730 // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
4731 // at Complete time. Without this an attacker with PutObject
4732 // permission could `CreateMultipartUpload` against a key
4733 // that's currently under retention / legal hold and silently
4734 // overwrite it on Complete (the single-PUT path runs the
4735 // same check at L2007). Compliance retention is never
4736 // bypassable; Governance only with explicit IAM permission
4737 // (HIGH-7 gate below).
4738 if let Some(mgr) = self.object_lock.as_ref()
4739 && let Some(state) = mgr.get(&bucket, &key)
4740 {
4741 // CompleteMultipartUpload doesn't carry the bypass header
4742 // (the s3s DTO matches AWS' wire schema). A locked key
4743 // therefore cannot be overwritten by Complete regardless
4744 // of caller permission — operators who need to break a
4745 // Governance lock do it via PutObjectRetention before
4746 // calling Complete.
4747 let now = chrono::Utc::now();
4748 if !state.can_delete(now, false) {
4749 crate::metrics::record_policy_denial("s3:PutObject", &bucket);
4750 return Err(S3Error::with_message(
4751 S3ErrorCode::AccessDenied,
4752 "Access Denied because target key is protected by object lock",
4753 ));
4754 }
4755 }
4756 // v0.8.1 #59: serialise concurrent Complete invocations on the
4757 // same `(bucket, key)`. The race window the lock closes is the
4758 // GET-assembled-body → encrypt → PUT-encrypted-body triple
4759 // below (BUG-5 fix); without serialisation, two Completes for
4760 // different `upload_id` but the same logical key could each
4761 // read the other's plaintext assembled body and overwrite the
4762 // peer's encrypted result. The guard is held to function exit
4763 // (drop on `Ok` / `Err`), covering version-id mint, object-
4764 // lock apply, tagging persist, and replication enqueue too.
4765 let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
4766 let _completion_guard = completion_lock.lock().await;
4767 // v0.8 #54 — fetch the per-upload context captured on Create.
4768 // `None` means an abandoned / unknown upload_id (gateway
4769 // crashed between Create and Complete, or pre-v0.8 state
4770 // restore); we still let the backend do its thing for
4771 // transparency, but we can't apply any SSE / version / lock /
4772 // tag / replication post-processing because we never captured
4773 // the recipe.
4774 let ctx = self.multipart_state.get(upload_id.as_str());
4775 // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
4776 // — some clients (boto3 / aws-sdk-cpp older versions) replay
4777 // the SSE-C triple on Complete too, and MinIO will choke if
4778 // they reach the backend.
4779 let _ = req.input.sse_customer_algorithm.take();
4780 let _ = req.input.sse_customer_key.take();
4781 let _ = req.input.sse_customer_key_md5.take();
4782 let mut resp = self.backend.complete_multipart_upload(req).await?;
4783 // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
4784 // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
4785 // partial fetch path が利用可能になる (Range request の帯域節約)。
4786 // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
4787 // できれば爆速になるので 1 回の cost は payback される
4788 //
4789 // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
4790 // the SSE encrypt re-PUT + versioning shadow-key rewrite +
4791 // replication source-bytes capture, so we GET once and reuse
4792 // the bytes for every post-processing step.
4793 let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
4794 let get_input = GetObjectInput {
4795 bucket: bucket.clone(),
4796 key: key.clone(),
4797 ..Default::default()
4798 };
4799 let get_req = S3Request {
4800 input: get_input,
4801 method: http::Method::GET,
4802 uri,
4803 headers: http::HeaderMap::new(),
4804 extensions: http::Extensions::new(),
4805 credentials: None,
4806 region: None,
4807 service: None,
4808 trailing_headers: None,
4809 };
4810 match self.backend.get_object(get_req).await {
4811 Ok(get_resp) => match get_resp.output.body {
4812 Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
4813 None => None,
4814 },
4815 Err(e) => {
4816 // v0.8.4 #71 (C-1 audit fix): a silent
4817 // `Err(_) => None` here is a SSE plaintext
4818 // leak. The post-processing block below only
4819 // runs the SSE re-encrypt branch when
4820 // `assembled_body.is_some()`, so swallowing a
4821 // backend error skipped the encrypt step and
4822 // left the multipart object on disk as
4823 // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
4824 // configured buckets. Same root-cause family
4825 // as v0.8 BUG-5; this branch closes the
4826 // remaining read-side window.
4827 //
4828 // We distinguish two cases:
4829 // - `NoSuchKey`: the object is genuinely
4830 // missing post-Complete. This is rare and
4831 // typically races with a concurrent
4832 // DeleteObject; there is nothing to re-
4833 // encrypt and no SSE markers to honour, so
4834 // falling through to the legacy
4835 // `assembled_body = None` path is safe.
4836 // - everything else (5xx, network, auth,
4837 // etc.): we must FAIL the Complete so the
4838 // client can retry. Returning Ok with
4839 // `assembled_body = None` would silently
4840 // skip the SSE re-encrypt and leave the
4841 // backend bytes plaintext.
4842 if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
4843 tracing::warn!(
4844 bucket = %bucket,
4845 key = %key,
4846 "multipart Complete: backend GET returned NoSuchKey; \
4847 skipping post-processing (object likely raced with DeleteObject)"
4848 );
4849 None
4850 } else {
4851 tracing::error!(
4852 bucket = %bucket,
4853 key = %key,
4854 error = %e,
4855 "multipart Complete: backend GET failed; failing the Complete \
4856 so the client retries (silent fall-through would skip SSE \
4857 re-encrypt and store plaintext)"
4858 );
4859 return Err(internal("multipart Complete: backend body fetch failed")(e));
4860 }
4861 }
4862 }
4863 } else {
4864 None
4865 };
4866 // Sidecar build (existing behaviour, gated on assembled body).
4867 //
4868 // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
4869 // going to SSE-encrypt the assembled body before re-PUT (the
4870 // single-PUT path applies the same suppression at L2271).
4871 // Stale offsets into the pre-encrypt body would break Range
4872 // GET on the encrypted on-disk bytes. `ctx.sse != None`
4873 // covers all three SSE modes captured at Create time.
4874 let mp_will_encrypt = ctx
4875 .as_ref()
4876 .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
4877 .unwrap_or(false);
4878 // v0.8.16 F-7: versioned multipart writes the assembled body
4879 // under `versioned_shadow_key(&key, vid)` *after* this
4880 // sidecar block, then deletes the original `<key>`. Stamping
4881 // the sidecar against the to-be-deleted `<key>` (which is
4882 // what H-g did) leaves an orphan `<key>.s4index` whose
4883 // source-ETag binding can never match the live shadow body
4884 // — the Range GET fast-path's stale-sidecar check then
4885 // falls through to a full read on every request, silently
4886 // disabling partial fetch. Skip the sidecar build entirely
4887 // for versioned buckets; a follow-up issue tracks writing
4888 // the sidecar under the shadow key with the shadow's ETag.
4889 let mp_skip_sidecar_for_versioning = self
4890 .versioning
4891 .as_ref()
4892 .map(|mgr| mgr.state(&bucket))
4893 .map(|state| state == crate::versioning::VersioningState::Enabled)
4894 .unwrap_or(false);
4895 if let Some(ref body) = assembled_body
4896 && !mp_will_encrypt
4897 && !mp_skip_sidecar_for_versioning
4898 && let Ok(mut index) = build_index_from_body(body)
4899 {
4900 // v0.8.15 H-g: stamp the source-ETag / source-compressed-size
4901 // binding on the multipart sidecar. The single-PUT path
4902 // does this at L2519-L2521 via the backend's PUT response,
4903 // but Complete returns its own ETag (an opaque manifest
4904 // hash) so we have to HEAD the freshly-completed object
4905 // to pick up what backend actually wrote, then bind the
4906 // sidecar to those values. Without the binding, a
4907 // subsequent backend-side mutation (lifecycle rewrite,
4908 // out-of-band CopyObject) wouldn't trip the staleness
4909 // check on the next Range GET — the GET would happily
4910 // slice the new bytes at the old sidecar offsets, with
4911 // silent data corruption.
4912 if let Ok(uri) = safe_object_uri(&bucket, &key) {
4913 let head_req = S3Request {
4914 input: HeadObjectInput {
4915 bucket: bucket.clone(),
4916 key: key.clone(),
4917 ..Default::default()
4918 },
4919 method: http::Method::HEAD,
4920 uri,
4921 headers: http::HeaderMap::new(),
4922 extensions: http::Extensions::new(),
4923 credentials: None,
4924 region: None,
4925 service: None,
4926 trailing_headers: None,
4927 };
4928 if let Ok(head) = self.backend.head_object(head_req).await {
4929 index.source_etag = head.output.e_tag.as_ref().map(|t| t.value().to_string());
4930 index.source_compressed_size = head
4931 .output
4932 .content_length
4933 .and_then(|n| u64::try_from(n).ok());
4934 }
4935 // HEAD failure is non-fatal — the sidecar still works
4936 // as a v1-style best-effort fast path; the Range GET
4937 // simply falls back to a full read on any consistency
4938 // signal.
4939 }
4940 self.write_sidecar(&bucket, &key, &index).await;
4941 }
4942 // From here on, post-processing depends on the context —
4943 // short-circuit when the upload had no captured recipe
4944 // (legacy / crashed-Create / pre-v0.8 state restore).
4945 if let Some(ctx) = ctx {
4946 // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
4947 // is versioning-Enabled. The single-PUT path does this in
4948 // `put_object` ~L1968; multipart was the missing branch.
4949 // We mint here (post-Complete, before any re-PUT) so the
4950 // same vid threads into both the shadow-key rewrite and
4951 // the VersionEntry the manager records.
4952 let pending_version: Option<crate::versioning::PutOutcome> = self
4953 .versioning
4954 .as_ref()
4955 .map(|mgr| mgr.state(&bucket))
4956 .map(|state| match state {
4957 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
4958 version_id: crate::versioning::VersioningManager::new_version_id(),
4959 versioned_response: true,
4960 },
4961 crate::versioning::VersioningState::Suspended
4962 | crate::versioning::VersioningState::Unversioned => {
4963 crate::versioning::PutOutcome {
4964 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
4965 versioned_response: false,
4966 }
4967 }
4968 });
4969 // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
4970 // and re-PUT it to the backend so the on-disk bytes are
4971 // SSE-encrypted. The single-PUT path does this body-by-
4972 // body inside `put_object` (L1907-L1942); for multipart,
4973 // encrypt-per-part would require a multi-segment decrypt
4974 // path on GET — we instead do a single encrypt over the
4975 // assembled framed body so the existing GET decrypt
4976 // branch (`is_sse_encrypted` → `decrypt(body, source)` →
4977 // FrameIter) handles it unchanged.
4978 //
4979 // The cost is one extra round-trip per Complete for SSE-
4980 // enabled multipart (already-paid for the sidecar build).
4981 // For single-instance gateways pointing at a co-located
4982 // backend this is negligible; cross-region operators
4983 // would benefit from per-part encrypt + multi-segment
4984 // decrypt as a follow-up.
4985 let needs_re_put = matches!(
4986 ctx.sse,
4987 crate::multipart_state::MultipartSseMode::SseS4
4988 | crate::multipart_state::MultipartSseMode::SseC { .. }
4989 | crate::multipart_state::MultipartSseMode::SseKms { .. }
4990 ) || pending_version
4991 .as_ref()
4992 .map(|pv| pv.versioned_response)
4993 .unwrap_or(false);
4994 // v0.8.11 CRIT-2 fix: seed the replication body with the
4995 // pre-encrypt assembled bytes, but overwrite it with the
4996 // post-encrypt `new_body` once the re-PUT branch lands.
4997 // The previous "snapshot in advance" pattern shipped the
4998 // *plaintext* framed body to the destination bucket even
4999 // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
5000 // the destination would then fail to decrypt (or, worse,
5001 // succeed in handing out plaintext that the source had
5002 // promised was encrypted at rest). When `needs_re_put`
5003 // is false (no SSE, no versioning), the backend still
5004 // holds the original plaintext-framed bytes, and the
5005 // seed value is what the destination should receive.
5006 let mut replication_body = assembled_body.clone();
5007 let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
5008 if needs_re_put && let Some(body) = assembled_body {
5009 // v0.8.1 #58: same Zeroizing pattern as put_object's
5010 // single-PUT KMS branch — DEK plaintext lives in
5011 // `Zeroizing<[u8; 32]>` for the lifetime of this
5012 // Complete handler, then is wiped on drop.
5013 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
5014 if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
5015 {
5016 let kms = self.kms.as_ref().ok_or_else(|| {
5017 S3Error::with_message(
5018 S3ErrorCode::InvalidRequest,
5019 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
5020 )
5021 })?;
5022 let (dek, wrapped) =
5023 kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
5024 if dek.len() != 32 {
5025 return Err(S3Error::with_message(
5026 S3ErrorCode::InternalError,
5027 format!(
5028 "KMS backend returned a DEK of {} bytes (expected 32)",
5029 dek.len()
5030 ),
5031 ));
5032 }
5033 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
5034 zeroize::Zeroizing::new([0u8; 32]);
5035 dek_arr.copy_from_slice(&dek);
5036 // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
5037 Some((dek_arr, wrapped))
5038 } else {
5039 None
5040 };
5041 // Build the new metadata map: re-fetch via HEAD so
5042 // the multipart / codec markers the backend stamped
5043 // on Create flow through unchanged, then layer the
5044 // SSE markers on top.
5045 let head_req = S3Request {
5046 input: HeadObjectInput {
5047 bucket: bucket.clone(),
5048 key: key.clone(),
5049 ..Default::default()
5050 },
5051 method: http::Method::HEAD,
5052 uri: safe_object_uri(&bucket, &key)?,
5053 headers: http::HeaderMap::new(),
5054 extensions: http::Extensions::new(),
5055 credentials: None,
5056 region: None,
5057 service: None,
5058 trailing_headers: None,
5059 };
5060 let mut new_metadata: std::collections::HashMap<String, String> =
5061 match self.backend.head_object(head_req).await {
5062 Ok(h) => h.output.metadata.unwrap_or_default(),
5063 Err(_) => std::collections::HashMap::new(),
5064 };
5065 let new_body = match &ctx.sse {
5066 crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
5067 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5068 new_metadata.insert("s4-sse-type".into(), "AES256".into());
5069 new_metadata.insert(
5070 "s4-sse-c-key-md5".into(),
5071 base64::engine::general_purpose::STANDARD.encode(key_md5),
5072 );
5073 // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
5074 // auto-deref through one explicit binding so
5075 // `SseSource::CustomerKey` gets the `&[u8; 32]`
5076 // it expects (mirrors the SSE-KMS DEK shape
5077 // a few lines down).
5078 let key_ref: &[u8; 32] = key;
5079 crate::sse::encrypt_with_source(
5080 &body,
5081 crate::sse::SseSource::CustomerKey {
5082 key: key_ref,
5083 key_md5,
5084 },
5085 )
5086 }
5087 crate::multipart_state::MultipartSseMode::SseKms { .. } => {
5088 let (dek, wrapped) = kms_wrap
5089 .as_ref()
5090 .expect("SseKms branch implies kms_wrap is Some");
5091 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5092 new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
5093 new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
5094 // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
5095 // to `&[u8; 32]` (same shape as the put_object
5096 // single-PUT branch).
5097 let dek_ref: &[u8; 32] = dek;
5098 crate::sse::encrypt_with_source(
5099 &body,
5100 crate::sse::SseSource::Kms {
5101 dek: dek_ref,
5102 wrapped,
5103 },
5104 )
5105 }
5106 crate::multipart_state::MultipartSseMode::SseS4 => {
5107 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
5108 S3Error::with_message(
5109 S3ErrorCode::InternalError,
5110 "SSE-S4 captured at Create but keyring missing at Complete",
5111 )
5112 })?;
5113 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5114 // SSE-S4 deliberately omits `s4-sse-type` so
5115 // HEAD doesn't falsely advertise AWS-style
5116 // SSE-S3 (matches the put_object L1929-L1939
5117 // comment).
5118 // v0.8 #52: same chunk_size dispatch as the
5119 // single-PUT branch — multipart Complete
5120 // re-encrypts the assembled body, so honoring
5121 // the chunked path here is required to keep
5122 // GET streaming on multipart-uploaded objects.
5123 if self.sse_chunk_size > 0 {
5124 crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
5125 .map_err(|e| {
5126 S3Error::with_message(
5127 S3ErrorCode::InternalError,
5128 format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
5129 )
5130 })?
5131 } else {
5132 crate::sse::encrypt_v2(&body, keyring)
5133 }
5134 }
5135 crate::multipart_state::MultipartSseMode::None => body.clone(),
5136 };
5137 // v0.8 #54 BUG-6 fix: write the re-PUT under the
5138 // shadow key so the version chain doesn't overwrite
5139 // the previous version on a versioned bucket. The
5140 // original (unshadowed) key was assembled by the
5141 // backend on Complete; we delete it after the shadow
5142 // PUT lands.
5143 let put_target_key = if let Some(pv) = pending_version.as_ref() {
5144 if pv.versioned_response {
5145 versioned_shadow_key(&key, &pv.version_id)
5146 } else {
5147 key.clone()
5148 }
5149 } else {
5150 key.clone()
5151 };
5152 let new_body_len = new_body.len() as i64;
5153 let put_req = S3Request {
5154 input: PutObjectInput {
5155 bucket: bucket.clone(),
5156 key: put_target_key.clone(),
5157 body: Some(bytes_to_blob(new_body.clone())),
5158 metadata: Some(new_metadata.clone()),
5159 content_length: Some(new_body_len),
5160 ..Default::default()
5161 },
5162 method: http::Method::PUT,
5163 uri: safe_object_uri(&bucket, &put_target_key)?,
5164 headers: http::HeaderMap::new(),
5165 extensions: http::Extensions::new(),
5166 credentials: None,
5167 region: None,
5168 service: None,
5169 trailing_headers: None,
5170 };
5171 self.backend.put_object(put_req).await?;
5172 // v0.8.11 CRIT-2 fix: refresh the replication snapshot
5173 // with the bytes that were actually persisted to the
5174 // backend (post-SSE-encrypt for SSE modes; identical to
5175 // `body` for `MultipartSseMode::None` + versioning-only
5176 // re-PUT). The destination then sees the same on-disk
5177 // shape the source does, and a destination GET decrypts
5178 // correctly when SSE is on.
5179 replication_body = Some(new_body.clone());
5180 // If we rewrote the storage key (versioning shadow),
5181 // we must drop the original (unshadowed) Complete-
5182 // assembled bytes so subsequent listings don't see a
5183 // duplicate.
5184 if put_target_key != key {
5185 let del_req = S3Request {
5186 input: DeleteObjectInput {
5187 bucket: bucket.clone(),
5188 key: key.clone(),
5189 ..Default::default()
5190 },
5191 method: http::Method::DELETE,
5192 uri: safe_object_uri(&bucket, &key)?,
5193 headers: http::HeaderMap::new(),
5194 extensions: http::Extensions::new(),
5195 credentials: None,
5196 region: None,
5197 service: None,
5198 trailing_headers: None,
5199 };
5200 let _ = self.backend.delete_object(del_req).await;
5201 }
5202 applied_metadata = Some(new_metadata);
5203 }
5204 // v0.8 #54 BUG-6 commit: register the new version with
5205 // the VersioningManager so list_object_versions /
5206 // GET ?versionId= see it.
5207 if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
5208 let etag = resp
5209 .output
5210 .e_tag
5211 .clone()
5212 .map(ETag::into_value)
5213 .unwrap_or_default();
5214 let now = chrono::Utc::now();
5215 mgr.commit_put_with_version(
5216 &bucket,
5217 &key,
5218 crate::versioning::VersionEntry {
5219 version_id: pv.version_id.clone(),
5220 etag,
5221 size: replication_body
5222 .as_ref()
5223 .map(|b| b.len() as u64)
5224 .unwrap_or(0),
5225 is_delete_marker: false,
5226 created_at: now,
5227 },
5228 );
5229 if pv.versioned_response {
5230 resp.output.version_id = Some(pv.version_id.clone());
5231 }
5232 }
5233 // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
5234 // recipe + auto-apply the bucket default. Mirrors the
5235 // put_object L2057-L2074 block.
5236 if let Some(mgr) = self.object_lock.as_ref() {
5237 if ctx.object_lock_mode.is_some()
5238 || ctx.object_lock_retain_until.is_some()
5239 || ctx.object_lock_legal_hold
5240 {
5241 let mut state = mgr.get(&bucket, &key).unwrap_or_default();
5242 if let Some(m) = ctx.object_lock_mode {
5243 state.mode = Some(m);
5244 }
5245 if let Some(u) = ctx.object_lock_retain_until {
5246 state.retain_until = Some(u);
5247 }
5248 if ctx.object_lock_legal_hold {
5249 state.legal_hold_on = true;
5250 }
5251 mgr.set(&bucket, &key, state);
5252 }
5253 mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5254 }
5255 // v0.8 #54 BUG-9 fix: persist the captured tags via the
5256 // TagManager so GetObjectTagging returns them.
5257 if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5258 mgr.put_object_tags(&bucket, &key, tags.clone());
5259 }
5260 // SSE-C / SSE-KMS response echo. The
5261 // CompleteMultipartUploadOutput only exposes
5262 // `server_side_encryption` + `ssekms_key_id` (no
5263 // sse_customer_* — those round-tripped on Create / parts).
5264 match &ctx.sse {
5265 crate::multipart_state::MultipartSseMode::SseC { .. } => {
5266 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5267 ServerSideEncryption::AES256,
5268 ));
5269 }
5270 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5271 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5272 ServerSideEncryption::AWS_KMS,
5273 ));
5274 resp.output.ssekms_key_id = Some(key_id.clone());
5275 }
5276 _ => {}
5277 }
5278 // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5279 // like put_object L2165 does. We hand the dispatcher the
5280 // assembled body bytes (post-encrypt where applicable, so
5281 // the destination ends up byte-identical to the source's
5282 // on-disk shape) plus the metadata that was actually
5283 // committed.
5284 let replication_body_bytes = replication_body.unwrap_or_default();
5285 // v0.8.2 #61: thread the multipart-Complete `pending_version`
5286 // through so a versioning-Enabled source's destination
5287 // receives the same shadow-key path (mirror of the
5288 // single-PUT branch above).
5289 self.spawn_replication_if_matched(
5290 &bucket,
5291 &key,
5292 &ctx.tags,
5293 &replication_body_bytes,
5294 &applied_metadata,
5295 true,
5296 pending_version.as_ref(),
5297 );
5298 self.multipart_state.remove(upload_id.as_str());
5299 }
5300 // v0.8.1 #59 janitor: best-effort sweep of stale completion
5301 // locks while we are still on the critical path of a single
5302 // Complete (so steady-state workloads of unique keys don't
5303 // accumulate `DashMap` entries). The sweep only retires
5304 // entries whose `Arc::strong_count == 1`, so any other in-
5305 // flight Complete on a different key keeps its lock alive.
5306 // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5307 // alive across this call; it's reaped on the next Complete or
5308 // the next caller-driven prune.
5309 self.multipart_state.prune_completion_locks();
5310 Ok(resp)
5311 }
5312 async fn abort_multipart_upload(
5313 &self,
5314 req: S3Request<AbortMultipartUploadInput>,
5315 ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5316 // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5317 // — the AWS-spec action verb for this operation. Without the
5318 // gate, anyone who could guess an upload_id could throw away
5319 // someone else's in-flight multipart upload.
5320 let abort_bucket = req.input.bucket.clone();
5321 let abort_key = req.input.key.clone();
5322 self.enforce_policy(
5323 &req,
5324 "s3:AbortMultipartUpload",
5325 &abort_bucket,
5326 Some(&abort_key),
5327 )?;
5328 // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5329 // set) promptly so an aborted upload doesn't leak the
5330 // customer's key into a long-running gateway's RSS.
5331 //
5332 // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5333 // FIRST, then drop in-process state ONLY on success. The
5334 // previous order ("remove → call backend") meant a transient
5335 // backend abort failure (5xx, network) wiped the SSE-C key
5336 // bytes locally while leaving the parts on the backend, so a
5337 // client retry would have to re-validate the SSE-C key against
5338 // a context the gateway no longer has — and the retried abort
5339 // would still hit the unaborted backend parts. Calling the
5340 // backend first lets the failure propagate to the client with
5341 // state intact for a clean retry; only on success do we wipe
5342 // the local state.
5343 let upload_id = req.input.upload_id.as_str().to_owned();
5344 let resp = self.backend.abort_multipart_upload(req).await?;
5345 self.multipart_state.remove(&upload_id);
5346 Ok(resp)
5347 }
5348 async fn list_multipart_uploads(
5349 &self,
5350 req: S3Request<ListMultipartUploadsInput>,
5351 ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5352 self.backend.list_multipart_uploads(req).await
5353 }
5354 async fn list_parts(
5355 &self,
5356 req: S3Request<ListPartsInput>,
5357 ) -> S3Result<S3Response<ListPartsOutput>> {
5358 self.backend.list_parts(req).await
5359 }
5360
5361 // =========================================================================
5362 // Phase 2 — pure passthrough delegations。S4 はこれらに対して圧縮 hook を
5363 // 持たないので、backend (= AWS S3) の動作と完全に同一。
5364 //
5365 // 既知の制限事項:
5366 // - copy_object / upload_part_copy: source object が S4-compressed の場合、
5367 // backend が bytes を copy するだけなので metadata (s4-codec etc) も一緒に
5368 // copied される (AWS S3 default = MetadataDirective COPY)。GET は manifest
5369 // 経由で正しく decompress できる。MetadataDirective REPLACE で上書き
5370 // されると圧縮 metadata が消えて壊れる — 顧客側の運用で注意
5371 // - list_object_versions: versioning enabled bucket では各 version も S4
5372 // metadata を維持する。古い version も S4 経由で正しく GET できる。
5373 // =========================================================================
5374
5375 // ---- Object ACL / tagging / attributes ----
5376 async fn get_object_acl(
5377 &self,
5378 req: S3Request<GetObjectAclInput>,
5379 ) -> S3Result<S3Response<GetObjectAclOutput>> {
5380 // v0.8.17 G-2: reserved-name guard. Without it a hostile
5381 // client can `GetObjectAcl(<key>.s4index)` to confirm the
5382 // sidecar exists, an information leak the F-13 GET reject
5383 // closed for the same object.
5384 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5385 self.backend.get_object_acl(req).await
5386 }
5387 async fn put_object_acl(
5388 &self,
5389 req: S3Request<PutObjectAclInput>,
5390 ) -> S3Result<S3Response<PutObjectAclOutput>> {
5391 // v0.8.17 G-2: reserved-name guard. `put-object-acl
5392 // --acl public-read` against `<key>.s4index` would grant
5393 // external read access to the internal sidecar, bypassing
5394 // the F-13 GET reject via the backend's public-URL path.
5395 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5396 self.backend.put_object_acl(req).await
5397 }
5398 // v0.6 #39: object tagging — when a `TagManager` is attached the
5399 // configuration / per-(bucket, key) state lives in the manager and
5400 // these handlers serve directly from it; when no manager is
5401 // attached they fall back to the backend (legacy passthrough so
5402 // v0.5 deployments are unaffected).
5403 async fn get_object_tagging(
5404 &self,
5405 req: S3Request<GetObjectTaggingInput>,
5406 ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5407 // v0.8.17 G-2: reserved-name guard.
5408 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5409 let Some(mgr) = self.tagging.as_ref() else {
5410 return self.backend.get_object_tagging(req).await;
5411 };
5412 let tags = mgr
5413 .get_object_tags(&req.input.bucket, &req.input.key)
5414 .unwrap_or_default();
5415 Ok(S3Response::new(GetObjectTaggingOutput {
5416 tag_set: tagset_to_aws(&tags),
5417 ..Default::default()
5418 }))
5419 }
5420 async fn put_object_tagging(
5421 &self,
5422 req: S3Request<PutObjectTaggingInput>,
5423 ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
5424 // v0.8.17 G-2: reserved-name guard.
5425 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5426 let Some(mgr) = self.tagging.as_ref() else {
5427 return self.backend.put_object_tagging(req).await;
5428 };
5429 let bucket = req.input.bucket.clone();
5430 let key = req.input.key.clone();
5431 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
5432 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
5433 // v0.6 #39: gate via IAM policy with both the request tags
5434 // (`s3:RequestObjectTag/<key>`) and any existing tags on the
5435 // target object (`s3:ExistingObjectTag/<key>`).
5436 let existing = mgr.get_object_tags(&bucket, &key);
5437 self.enforce_policy_with_extra(
5438 &req,
5439 "s3:PutObjectTagging",
5440 &bucket,
5441 Some(&key),
5442 Some(&parsed),
5443 existing.as_ref(),
5444 )?;
5445 mgr.put_object_tags(&bucket, &key, parsed);
5446 Ok(S3Response::new(PutObjectTaggingOutput::default()))
5447 }
5448 async fn delete_object_tagging(
5449 &self,
5450 req: S3Request<DeleteObjectTaggingInput>,
5451 ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
5452 // v0.8.17 G-2: reserved-name guard.
5453 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5454 let Some(mgr) = self.tagging.as_ref() else {
5455 return self.backend.delete_object_tagging(req).await;
5456 };
5457 let bucket = req.input.bucket.clone();
5458 let key = req.input.key.clone();
5459 let existing = mgr.get_object_tags(&bucket, &key);
5460 self.enforce_policy_with_extra(
5461 &req,
5462 "s3:DeleteObjectTagging",
5463 &bucket,
5464 Some(&key),
5465 None,
5466 existing.as_ref(),
5467 )?;
5468 mgr.delete_object_tags(&bucket, &key);
5469 Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
5470 }
/// Returns an object's attributes by delegating to the backend.
///
/// The key is first run through the reserved-name check in
/// `ReservedKeyMode::Read`; if that check fails its error is returned and
/// the backend is never contacted.
async fn get_object_attributes(
    &self,
    req: S3Request<GetObjectAttributesInput>,
) -> S3Result<S3Response<GetObjectAttributesOutput>> {
    // v0.8.17 G-2: reserved-name guard. Attributes leak the
    // sidecar's size + ETag, same shape as F-13's GET concern.
    self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
    self.backend.get_object_attributes(req).await
}
/// Forwards a `RestoreObject` request to the backend.
///
/// The key is first run through the reserved-name check in
/// `ReservedKeyMode::Mutating`; if that check fails its error is returned
/// and the backend is never contacted.
async fn restore_object(
    &self,
    req: S3Request<RestoreObjectInput>,
) -> S3Result<S3Response<RestoreObjectOutput>> {
    // v0.8.17 G-2: reserved-name guard.
    self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
    self.backend.restore_object(req).await
}
/// Copies (a byte range of) an existing object into one part of a
/// multipart upload.
///
/// Flow:
/// 1. Reserved-name guards, policy checks (`s3:PutObject` on the
///    destination, `s3:GetObject` on the source) and the rate limit.
/// 2. A backend `head_object` on the source decides whether the source is
///    S4-framed. If it is not (or the probe fails, or the copy source is
///    not a `CopySource::Bucket`), the request is passed straight through
///    to the backend's `upload_part_copy`.
/// 3. Otherwise the source is read through this service's own
///    `get_object`, re-compressed and re-framed as a fresh part, and sent
///    to the backend as a regular `upload_part`.
///
/// On the S4-aware path the response carries only the ETag returned by
/// the backend `upload_part`.
///
/// # Errors
///
/// Returns the guard / policy / rate-limit error, `InvalidRange` when
/// `copy_source_range` cannot be parsed, an internal error when the source
/// body is missing, cannot be collected, or cannot be compressed, and any
/// error surfaced by the source GET or the backend calls.
async fn upload_part_copy(
    &self,
    req: S3Request<UploadPartCopyInput>,
) -> S3Result<S3Response<UploadPartCopyOutput>> {
    // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
    // destination PUT + source GET.
    let dst_bucket = req.input.bucket.clone();
    let dst_key = req.input.key.clone();
    // v0.8.17 G-2: reserved-name guard on both destination
    // and source. Mirrors what `copy_object` enforces.
    self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
    if let CopySource::Bucket { key, .. } = &req.input.copy_source {
        self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
    }
    self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
    if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
        self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
    }
    self.enforce_rate_limit(&req, &dst_bucket)?;
    // v0.2 #6: byte-range aware copy when the source is S4-framed.
    //
    // For a framed source (multipart upload OR single-PUT framed-v2),
    // a naive byte-range passthrough would copy compressed bytes that
    // don't align with S4 frame boundaries — silently corrupting the
    // result. Instead we GET the source through S4 (which handles
    // decompression + Range), re-compress + re-frame as a new part,
    // and forward as upload_part. For non-framed sources (S4-untouched
    // raw objects), passthrough is correct and we keep the original
    // (cheaper) code path.
    // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
    // copy-source header. Without this, a versioned source bucket
    // copy that pins a specific old version would silently fall
    // back to "latest", assembling wrong bytes into the destination
    // multipart object (silent data corruption).
    let CopySource::Bucket {
        bucket: src_bucket,
        key: src_key,
        version_id: src_version_id,
    } = &req.input.copy_source
    else {
        // Any other copy-source form is handed to the backend untouched.
        return self.backend.upload_part_copy(req).await;
    };
    // Own the source coordinates so `req` can be moved / re-borrowed below.
    let src_bucket = src_bucket.to_string();
    let src_key = src_key.to_string();
    let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);

    // Probe metadata to decide whether the source needs S4-aware copy.
    let head_input = HeadObjectInput {
        bucket: src_bucket.clone(),
        key: src_key.clone(),
        version_id: src_version_id.clone(),
        ..Default::default()
    };
    let head_req = S3Request {
        input: head_input,
        method: http::Method::HEAD,
        uri: req.uri.clone(),
        headers: req.headers.clone(),
        extensions: http::Extensions::new(),
        credentials: req.credentials.clone(),
        region: req.region.clone(),
        service: req.service.clone(),
        trailing_headers: None,
    };
    let needs_s4_copy = match self.backend.head_object(head_req).await {
        Ok(h) => {
            is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
        }
        // A failed probe falls back to the passthrough path below, which
        // lets the backend report its own error for the copy.
        Err(_) => false,
    };
    if !needs_s4_copy {
        return self.backend.upload_part_copy(req).await;
    }

    // Resolve the optional source byte range to pass to GET.
    let source_range = req
        .input
        .copy_source_range
        .as_ref()
        .map(|r| parse_copy_source_range(r))
        .transpose()
        .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;

    // GET source via S4 (handles decompression + sidecar partial fetch
    // when range is present). The result is the requested user-visible
    // byte range, fully decompressed. version_id is propagated so
    // pinned-version copies fetch the exact version requested.
    // NOTE(review): only bucket / key / version / range are set on the
    // source GET; any other options carried by the copy request are not
    // applied on this path — confirm that is intended.
    let mut get_input = GetObjectInput {
        bucket: src_bucket.clone(),
        key: src_key.clone(),
        version_id: src_version_id.clone(),
        ..Default::default()
    };
    get_input.range = source_range;
    let get_req = S3Request {
        input: get_input,
        method: http::Method::GET,
        uri: req.uri.clone(),
        headers: req.headers.clone(),
        extensions: http::Extensions::new(),
        credentials: req.credentials.clone(),
        region: req.region.clone(),
        service: req.service.clone(),
        trailing_headers: None,
    };
    let get_resp = self.get_object(get_req).await?;
    let blob = get_resp.output.body.ok_or_else(|| {
        S3Error::with_message(
            S3ErrorCode::InternalError,
            "upload_part_copy: empty body from source GET",
        )
    })?;
    // The whole source range is buffered, bounded by `max_body_bytes`.
    let bytes = collect_blob(blob, self.max_body_bytes)
        .await
        .map_err(internal("collect upload_part_copy source body"))?;

    // Compress + frame as a fresh part (mirrors upload_part path).
    let sample_len = bytes.len().min(SAMPLE_BYTES);
    // v0.8 #56: same size-hint promotion as the upload_part path.
    let codec_kind = self
        .dispatcher
        .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
        .await;
    // Captured before `bytes` is moved into the compressor.
    let original_size = bytes.len() as u64;
    // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
    let (compress_res, tel) = self
        .registry
        .compress_with_telemetry(bytes, codec_kind)
        .await;
    stamp_gpu_compress_telemetry(&tel);
    let (compressed, manifest) =
        compress_res.map_err(internal("registry compress upload_part_copy"))?;
    let header = FrameHeader {
        codec: codec_kind,
        original_size,
        compressed_size: compressed.len() as u64,
        crc32c: manifest.crc32c,
    };
    let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
    write_frame(&mut framed, header, &compressed);
    // A source smaller than the multipart minimum is treated as the
    // (likely) final part and left unpadded; everything else is padded up
    // to `S3_MULTIPART_MIN_PART_BYTES`.
    let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
    if !likely_final {
        pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
    }
    let framed_bytes = framed.freeze();
    let framed_len = framed_bytes.len() as i64;

    // Forward as upload_part to the destination multipart upload.
    let part_input = UploadPartInput {
        bucket: req.input.bucket.clone(),
        key: req.input.key.clone(),
        part_number: req.input.part_number,
        upload_id: req.input.upload_id.clone(),
        body: Some(bytes_to_blob(framed_bytes)),
        content_length: Some(framed_len),
        ..Default::default()
    };
    let part_req = S3Request {
        input: part_input,
        method: http::Method::PUT,
        uri: req.uri.clone(),
        headers: req.headers.clone(),
        extensions: http::Extensions::new(),
        credentials: req.credentials.clone(),
        region: req.region.clone(),
        service: req.service.clone(),
        trailing_headers: None,
    };
    let upload_resp = self.backend.upload_part(part_req).await?;

    // Only the ETag from the backend upload is reported back; every other
    // field of the copy result is left at its default.
    let copy_output = UploadPartCopyOutput {
        copy_part_result: Some(CopyPartResult {
            e_tag: upload_resp.output.e_tag.clone(),
            ..Default::default()
        }),
        ..Default::default()
    };
    Ok(S3Response::new(copy_output))
}
5667
5668 // ---- Object lock / retention / legal hold (v0.5 #30) ----
5669 //
5670 // When an `ObjectLockManager` is attached the configuration / per-object
5671 // state lives in the manager and these handlers serve directly from it;
5672 // when no manager is attached they fall back to the backend (legacy
5673 // passthrough so v0.4 deployments are unaffected).
5674 async fn get_object_lock_configuration(
5675 &self,
5676 req: S3Request<GetObjectLockConfigurationInput>,
5677 ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
5678 self.enforce_policy(
5679 &req,
5680 "s3:GetBucketObjectLockConfiguration",
5681 &req.input.bucket,
5682 None,
5683 )?;
5684 if let Some(mgr) = self.object_lock.as_ref() {
5685 let cfg = mgr
5686 .bucket_default(&req.input.bucket)
5687 .map(|d| ObjectLockConfiguration {
5688 object_lock_enabled: Some(ObjectLockEnabled::from_static(
5689 ObjectLockEnabled::ENABLED,
5690 )),
5691 rule: Some(ObjectLockRule {
5692 default_retention: Some(DefaultRetention {
5693 days: Some(d.retention_days as i32),
5694 mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
5695 crate::object_lock::LockMode::Governance => {
5696 ObjectLockRetentionMode::GOVERNANCE
5697 }
5698 crate::object_lock::LockMode::Compliance => {
5699 ObjectLockRetentionMode::COMPLIANCE
5700 }
5701 })),
5702 years: None,
5703 }),
5704 }),
5705 });
5706 let output = GetObjectLockConfigurationOutput {
5707 object_lock_configuration: cfg,
5708 };
5709 return Ok(S3Response::new(output));
5710 }
5711 self.backend.get_object_lock_configuration(req).await
5712 }
5713 async fn put_object_lock_configuration(
5714 &self,
5715 req: S3Request<PutObjectLockConfigurationInput>,
5716 ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
5717 self.enforce_policy(
5718 &req,
5719 "s3:PutBucketObjectLockConfiguration",
5720 &req.input.bucket,
5721 None,
5722 )?;
5723 if let Some(mgr) = self.object_lock.as_ref() {
5724 let bucket = req.input.bucket.clone();
5725 if let Some(cfg) = req.input.object_lock_configuration.as_ref()
5726 && let Some(rule) = cfg.rule.as_ref()
5727 && let Some(d) = rule.default_retention.as_ref()
5728 {
5729 let mode = d
5730 .mode
5731 .as_ref()
5732 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
5733 .ok_or_else(|| {
5734 S3Error::with_message(
5735 S3ErrorCode::InvalidRequest,
5736 "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
5737 )
5738 })?;
5739 // S3 spec: exactly one of Days / Years (we accept Days
5740 // outright and convert Years → Days for storage; Years
5741 // is just a UX shorthand on the wire).
5742 let days: u32 = match (d.days, d.years) {
5743 (Some(d), None) if d > 0 => d as u32,
5744 (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
5745 _ => {
5746 return Err(S3Error::with_message(
5747 S3ErrorCode::InvalidRequest,
5748 "Object Lock default retention requires exactly one of Days or Years (positive integer)",
5749 ));
5750 }
5751 };
5752 mgr.set_bucket_default(
5753 &bucket,
5754 crate::object_lock::BucketObjectLockDefault {
5755 mode,
5756 retention_days: days,
5757 },
5758 );
5759 }
5760 return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
5761 }
5762 self.backend.put_object_lock_configuration(req).await
5763 }
5764 async fn get_object_legal_hold(
5765 &self,
5766 req: S3Request<GetObjectLegalHoldInput>,
5767 ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
5768 let key = req.input.key.clone();
5769 self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
5770 if let Some(mgr) = self.object_lock.as_ref() {
5771 let on = mgr
5772 .get(&req.input.bucket, &req.input.key)
5773 .map(|s| s.legal_hold_on)
5774 .unwrap_or(false);
5775 let status = ObjectLockLegalHoldStatus::from_static(if on {
5776 ObjectLockLegalHoldStatus::ON
5777 } else {
5778 ObjectLockLegalHoldStatus::OFF
5779 });
5780 let output = GetObjectLegalHoldOutput {
5781 legal_hold: Some(ObjectLockLegalHold {
5782 status: Some(status),
5783 }),
5784 };
5785 return Ok(S3Response::new(output));
5786 }
5787 self.backend.get_object_legal_hold(req).await
5788 }
5789 async fn put_object_legal_hold(
5790 &self,
5791 req: S3Request<PutObjectLegalHoldInput>,
5792 ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
5793 let key = req.input.key.clone();
5794 self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
5795 if let Some(mgr) = self.object_lock.as_ref() {
5796 let on = req
5797 .input
5798 .legal_hold
5799 .as_ref()
5800 .and_then(|h| h.status.as_ref())
5801 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
5802 .unwrap_or(false);
5803 mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
5804 return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
5805 }
5806 self.backend.put_object_legal_hold(req).await
5807 }
5808 async fn get_object_retention(
5809 &self,
5810 req: S3Request<GetObjectRetentionInput>,
5811 ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
5812 let key = req.input.key.clone();
5813 self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
5814 if let Some(mgr) = self.object_lock.as_ref() {
5815 let retention = mgr
5816 .get(&req.input.bucket, &req.input.key)
5817 .filter(|s| s.mode.is_some() || s.retain_until.is_some())
5818 .map(|s| {
5819 let mode = s.mode.map(|m| {
5820 ObjectLockRetentionMode::from_static(match m {
5821 crate::object_lock::LockMode::Governance => {
5822 ObjectLockRetentionMode::GOVERNANCE
5823 }
5824 crate::object_lock::LockMode::Compliance => {
5825 ObjectLockRetentionMode::COMPLIANCE
5826 }
5827 })
5828 });
5829 let until = s.retain_until.map(chrono_utc_to_timestamp);
5830 ObjectLockRetention {
5831 mode,
5832 retain_until_date: until,
5833 }
5834 });
5835 let output = GetObjectRetentionOutput { retention };
5836 return Ok(S3Response::new(output));
5837 }
5838 self.backend.get_object_retention(req).await
5839 }
5840 async fn put_object_retention(
5841 &self,
5842 req: S3Request<PutObjectRetentionInput>,
5843 ) -> S3Result<S3Response<PutObjectRetentionOutput>> {
5844 let key = req.input.key.clone();
5845 self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
5846 if let Some(mgr) = self.object_lock.as_ref() {
5847 let bucket = req.input.bucket.clone();
5848 let key = req.input.key.clone();
5849 // v0.8.12 HIGH-7 fix: the bypass header gates Governance
5850 // shortening only when the caller has the matching IAM
5851 // action explicitly allowed; otherwise it's silently
5852 // dropped to `false` and the "shortening Governance
5853 // requires bypass" branch below rejects.
5854 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
5855 let bypass = if bypass_header {
5856 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
5857 .is_ok()
5858 } else {
5859 false
5860 };
5861 let retention = req.input.retention.as_ref().ok_or_else(|| {
5862 S3Error::with_message(
5863 S3ErrorCode::InvalidRequest,
5864 "PutObjectRetention requires a Retention element",
5865 )
5866 })?;
5867 let new_mode = retention
5868 .mode
5869 .as_ref()
5870 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
5871 let new_until = retention
5872 .retain_until_date
5873 .as_ref()
5874 .map(timestamp_to_chrono_utc)
5875 .unwrap_or(None);
5876 let now = chrono::Utc::now();
5877 let existing = mgr.get(&bucket, &key).unwrap_or_default();
5878 // S3 immutability rules:
5879 // - Compliance is one-way: once set, mode cannot move to
5880 // Governance, and retain-until cannot be shortened.
5881 // - Governance can be lengthened freely; shortened only
5882 // with bypass=true.
5883 if let Some(existing_mode) = existing.mode
5884 && existing_mode == crate::object_lock::LockMode::Compliance
5885 && existing.is_locked(now)
5886 {
5887 if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
5888 return Err(S3Error::with_message(
5889 S3ErrorCode::AccessDenied,
5890 "Cannot downgrade Compliance retention to Governance while lock is active",
5891 ));
5892 }
5893 if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5894 && next < prev
5895 {
5896 return Err(S3Error::with_message(
5897 S3ErrorCode::AccessDenied,
5898 "Cannot shorten Compliance retention while lock is active",
5899 ));
5900 }
5901 }
5902 if let Some(existing_mode) = existing.mode
5903 && existing_mode == crate::object_lock::LockMode::Governance
5904 && existing.is_locked(now)
5905 && !bypass
5906 && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
5907 && next < prev
5908 {
5909 return Err(S3Error::with_message(
5910 S3ErrorCode::AccessDenied,
5911 "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
5912 ));
5913 }
5914 let mut state = existing;
5915 if new_mode.is_some() {
5916 state.mode = new_mode;
5917 }
5918 if new_until.is_some() {
5919 state.retain_until = new_until;
5920 }
5921 mgr.set(&bucket, &key, state);
5922 return Ok(S3Response::new(PutObjectRetentionOutput::default()));
5923 }
5924 self.backend.put_object_retention(req).await
5925 }
5926
5927 // ---- Versioning ----
5928 // list_object_versions is implemented above in the compression-hook
5929 // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
5930 // VersioningManager is attached (v0.5 #34), serves chains directly
5931 // from the in-memory index.
5932 async fn get_bucket_versioning(
5933 &self,
5934 req: S3Request<GetBucketVersioningInput>,
5935 ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
5936 // v0.5 #34: when a VersioningManager is attached, the bucket's
5937 // versioning state lives in the manager (= S4-server's
5938 // authoritative source). Pass-through hits the backend only
5939 // when no manager is configured (legacy v0.4 behaviour).
5940 if let Some(mgr) = self.versioning.as_ref() {
5941 let output = match mgr.state(&req.input.bucket).as_aws_status() {
5942 Some(s) => GetBucketVersioningOutput {
5943 status: Some(BucketVersioningStatus::from(s.to_owned())),
5944 ..Default::default()
5945 },
5946 None => GetBucketVersioningOutput::default(),
5947 };
5948 return Ok(S3Response::new(output));
5949 }
5950 self.backend.get_bucket_versioning(req).await
5951 }
5952 async fn put_bucket_versioning(
5953 &self,
5954 req: S3Request<PutBucketVersioningInput>,
5955 ) -> S3Result<S3Response<PutBucketVersioningOutput>> {
5956 // v0.6 #42: MFA gating on the `PutBucketVersioning` request
5957 // itself. S3 spec: when the request body carries an
5958 // `MfaDelete` element (either `Enabled` or `Disabled`), the
5959 // request must include a valid `x-amz-mfa` token — both for
5960 // the *first* enable (so the operator can't quietly side-step
5961 // the gate by never enabling it) and for any subsequent
5962 // change (so a leaked credential alone can't disable MFA
5963 // Delete to bypass it on subsequent DELETEs). Requests that
5964 // omit the `MfaDelete` element entirely (i.e. they flip only
5965 // `Status`) skip this gate, matching AWS.
5966 if let Some(mgr) = self.mfa_delete.as_ref()
5967 && let Some(target_enabled) = req
5968 .input
5969 .versioning_configuration
5970 .mfa_delete
5971 .as_ref()
5972 .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
5973 {
5974 let bucket = req.input.bucket.clone();
5975 let header = req.input.mfa.as_deref();
5976 let secret = mgr.lookup_secret(&bucket);
5977 let verified = match (header, secret.as_ref()) {
5978 (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
5979 Ok((serial, code)) => {
5980 serial == s.serial
5981 && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
5982 }
5983 Err(_) => false,
5984 },
5985 _ => false,
5986 };
5987 if !verified {
5988 crate::metrics::record_mfa_delete_denial(&bucket);
5989 let err = if header.is_none() {
5990 crate::mfa::MfaError::Missing
5991 } else {
5992 crate::mfa::MfaError::InvalidCode
5993 };
5994 return Err(mfa_error_to_s3(err));
5995 }
5996 mgr.set_bucket_state(&bucket, target_enabled);
5997 }
5998 // v0.5 #34: stash the new state in the manager, then forward to
5999 // the backend so any downstream that *also* tracks state
6000 // (e.g. a real S3 backend) stays in sync. Manager-attached but
6001 // backend rejection is treated as a soft-fail (state is still
6002 // owned by the manager).
6003 if let Some(mgr) = self.versioning.as_ref() {
6004 let new_state = match req
6005 .input
6006 .versioning_configuration
6007 .status
6008 .as_ref()
6009 .map(|s| s.as_str())
6010 {
6011 Some(s) if s.eq_ignore_ascii_case("Enabled") => {
6012 crate::versioning::VersioningState::Enabled
6013 }
6014 Some(s) if s.eq_ignore_ascii_case("Suspended") => {
6015 crate::versioning::VersioningState::Suspended
6016 }
6017 _ => crate::versioning::VersioningState::Unversioned,
6018 };
6019 mgr.set_state(&req.input.bucket, new_state);
6020 return Ok(S3Response::new(PutBucketVersioningOutput::default()));
6021 }
6022 self.backend.put_bucket_versioning(req).await
6023 }
6024
6025 // ---- Bucket location ----
/// Pure delegation: forwards `GetBucketLocation` to the backend unchanged
/// and returns its result.
async fn get_bucket_location(
    &self,
    req: S3Request<GetBucketLocationInput>,
) -> S3Result<S3Response<GetBucketLocationOutput>> {
    self.backend.get_bucket_location(req).await
}
6032
6033 // ---- Bucket policy ----
/// Pure delegation: forwards `GetBucketPolicy` to the backend unchanged
/// and returns its result.
async fn get_bucket_policy(
    &self,
    req: S3Request<GetBucketPolicyInput>,
) -> S3Result<S3Response<GetBucketPolicyOutput>> {
    self.backend.get_bucket_policy(req).await
}
/// Pure delegation: forwards `PutBucketPolicy` to the backend unchanged
/// and returns its result.
async fn put_bucket_policy(
    &self,
    req: S3Request<PutBucketPolicyInput>,
) -> S3Result<S3Response<PutBucketPolicyOutput>> {
    self.backend.put_bucket_policy(req).await
}
/// Pure delegation: forwards `DeleteBucketPolicy` to the backend unchanged
/// and returns its result.
async fn delete_bucket_policy(
    &self,
    req: S3Request<DeleteBucketPolicyInput>,
) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
    self.backend.delete_bucket_policy(req).await
}
/// Pure delegation: forwards `GetBucketPolicyStatus` to the backend
/// unchanged and returns its result.
async fn get_bucket_policy_status(
    &self,
    req: S3Request<GetBucketPolicyStatusInput>,
) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
    self.backend.get_bucket_policy_status(req).await
}
6058
6059 // ---- Bucket ACL ----
/// Pure delegation: forwards `GetBucketAcl` to the backend unchanged and
/// returns its result.
async fn get_bucket_acl(
    &self,
    req: S3Request<GetBucketAclInput>,
) -> S3Result<S3Response<GetBucketAclOutput>> {
    self.backend.get_bucket_acl(req).await
}
/// Pure delegation: forwards `PutBucketAcl` to the backend unchanged and
/// returns its result.
async fn put_bucket_acl(
    &self,
    req: S3Request<PutBucketAclInput>,
) -> S3Result<S3Response<PutBucketAclOutput>> {
    self.backend.put_bucket_acl(req).await
}
6072
6073 // ---- Bucket CORS (v0.6 #38) ----
6074 async fn get_bucket_cors(
6075 &self,
6076 req: S3Request<GetBucketCorsInput>,
6077 ) -> S3Result<S3Response<GetBucketCorsOutput>> {
6078 if let Some(mgr) = self.cors.as_ref() {
6079 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6080 S3Error::with_message(
6081 S3ErrorCode::NoSuchCORSConfiguration,
6082 "The CORS configuration does not exist".to_string(),
6083 )
6084 })?;
6085 let rules: Vec<CORSRule> = cfg
6086 .rules
6087 .into_iter()
6088 .map(|r| CORSRule {
6089 allowed_headers: if r.allowed_headers.is_empty() {
6090 None
6091 } else {
6092 Some(r.allowed_headers)
6093 },
6094 allowed_methods: r.allowed_methods,
6095 allowed_origins: r.allowed_origins,
6096 expose_headers: if r.expose_headers.is_empty() {
6097 None
6098 } else {
6099 Some(r.expose_headers)
6100 },
6101 id: r.id,
6102 max_age_seconds: r.max_age_seconds.map(|s| s as i32),
6103 })
6104 .collect();
6105 return Ok(S3Response::new(GetBucketCorsOutput {
6106 cors_rules: Some(rules),
6107 }));
6108 }
6109 self.backend.get_bucket_cors(req).await
6110 }
6111 async fn put_bucket_cors(
6112 &self,
6113 req: S3Request<PutBucketCorsInput>,
6114 ) -> S3Result<S3Response<PutBucketCorsOutput>> {
6115 if let Some(mgr) = self.cors.as_ref() {
6116 let cfg = crate::cors::CorsConfig {
6117 rules: req
6118 .input
6119 .cors_configuration
6120 .cors_rules
6121 .into_iter()
6122 .map(|r| crate::cors::CorsRule {
6123 allowed_origins: r.allowed_origins,
6124 allowed_methods: r.allowed_methods,
6125 allowed_headers: r.allowed_headers.unwrap_or_default(),
6126 expose_headers: r.expose_headers.unwrap_or_default(),
6127 max_age_seconds: r
6128 .max_age_seconds
6129 .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
6130 id: r.id,
6131 })
6132 .collect(),
6133 };
6134 // v0.8.15 M-3: AWS S3 rejects `AllowedMethods` outside
6135 // the canonical {GET,PUT,POST,DELETE,HEAD} set (including
6136 // the `*` wildcard). Validate at PutBucketCors time so
6137 // operators see the misconfiguration in the API response
6138 // instead of having silently-broken preflights at the
6139 // browser later.
6140 if let Err(e) = crate::cors::CorsManager::validate(&cfg) {
6141 return Err(S3Error::with_message(
6142 S3ErrorCode::InvalidArgument,
6143 e.to_string(),
6144 ));
6145 }
6146 mgr.put(&req.input.bucket, cfg);
6147 return Ok(S3Response::new(PutBucketCorsOutput::default()));
6148 }
6149 self.backend.put_bucket_cors(req).await
6150 }
6151 async fn delete_bucket_cors(
6152 &self,
6153 req: S3Request<DeleteBucketCorsInput>,
6154 ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
6155 if let Some(mgr) = self.cors.as_ref() {
6156 mgr.delete(&req.input.bucket);
6157 return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
6158 }
6159 self.backend.delete_bucket_cors(req).await
6160 }
6161
6162 // ---- Bucket lifecycle (v0.6 #37) ----
6163 async fn get_bucket_lifecycle_configuration(
6164 &self,
6165 req: S3Request<GetBucketLifecycleConfigurationInput>,
6166 ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
6167 if let Some(mgr) = self.lifecycle.as_ref() {
6168 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6169 S3Error::with_message(
6170 S3ErrorCode::NoSuchLifecycleConfiguration,
6171 "The lifecycle configuration does not exist".to_string(),
6172 )
6173 })?;
6174 let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
6175 return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
6176 rules: Some(rules),
6177 transition_default_minimum_object_size: None,
6178 }));
6179 }
6180 self.backend.get_bucket_lifecycle_configuration(req).await
6181 }
6182 async fn put_bucket_lifecycle_configuration(
6183 &self,
6184 req: S3Request<PutBucketLifecycleConfigurationInput>,
6185 ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
6186 if let Some(mgr) = self.lifecycle.as_ref() {
6187 let bucket = req.input.bucket.clone();
6188 let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
6189 let cfg = dto_lifecycle_to_internal(&dto_cfg);
6190 mgr.put(&bucket, cfg);
6191 return Ok(S3Response::new(
6192 PutBucketLifecycleConfigurationOutput::default(),
6193 ));
6194 }
6195 self.backend.put_bucket_lifecycle_configuration(req).await
6196 }
6197 async fn delete_bucket_lifecycle(
6198 &self,
6199 req: S3Request<DeleteBucketLifecycleInput>,
6200 ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
6201 if let Some(mgr) = self.lifecycle.as_ref() {
6202 mgr.delete(&req.input.bucket);
6203 return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
6204 }
6205 self.backend.delete_bucket_lifecycle(req).await
6206 }
6207
6208 // ---- Bucket tagging (v0.6 #39) ----
6209 async fn get_bucket_tagging(
6210 &self,
6211 req: S3Request<GetBucketTaggingInput>,
6212 ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
6213 let Some(mgr) = self.tagging.as_ref() else {
6214 return self.backend.get_bucket_tagging(req).await;
6215 };
6216 let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
6217 Ok(S3Response::new(GetBucketTaggingOutput {
6218 tag_set: tagset_to_aws(&tags),
6219 }))
6220 }
6221 async fn put_bucket_tagging(
6222 &self,
6223 req: S3Request<PutBucketTaggingInput>,
6224 ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
6225 let Some(mgr) = self.tagging.as_ref() else {
6226 return self.backend.put_bucket_tagging(req).await;
6227 };
6228 let bucket = req.input.bucket.clone();
6229 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6230 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6231 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6232 mgr.put_bucket_tags(&bucket, parsed);
6233 Ok(S3Response::new(PutBucketTaggingOutput::default()))
6234 }
6235 async fn delete_bucket_tagging(
6236 &self,
6237 req: S3Request<DeleteBucketTaggingInput>,
6238 ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
6239 let Some(mgr) = self.tagging.as_ref() else {
6240 return self.backend.delete_bucket_tagging(req).await;
6241 };
6242 let bucket = req.input.bucket.clone();
6243 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6244 mgr.delete_bucket_tags(&bucket);
6245 Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
6246 }
6247
6248 // ---- Bucket encryption ----
/// Pure delegation: forwards `GetBucketEncryption` to the backend
/// unchanged and returns its result.
async fn get_bucket_encryption(
    &self,
    req: S3Request<GetBucketEncryptionInput>,
) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
    self.backend.get_bucket_encryption(req).await
}
/// Pure delegation: forwards `PutBucketEncryption` to the backend
/// unchanged and returns its result.
async fn put_bucket_encryption(
    &self,
    req: S3Request<PutBucketEncryptionInput>,
) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
    self.backend.put_bucket_encryption(req).await
}
/// Pure delegation: forwards `DeleteBucketEncryption` to the backend
/// unchanged and returns its result.
async fn delete_bucket_encryption(
    &self,
    req: S3Request<DeleteBucketEncryptionInput>,
) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
    self.backend.delete_bucket_encryption(req).await
}
6267
6268 // ---- Bucket logging ----
/// Pure delegation: forwards `GetBucketLogging` to the backend unchanged
/// and returns its result.
async fn get_bucket_logging(
    &self,
    req: S3Request<GetBucketLoggingInput>,
) -> S3Result<S3Response<GetBucketLoggingOutput>> {
    self.backend.get_bucket_logging(req).await
}
/// Pure delegation: forwards `PutBucketLogging` to the backend unchanged
/// and returns its result.
async fn put_bucket_logging(
    &self,
    req: S3Request<PutBucketLoggingInput>,
) -> S3Result<S3Response<PutBucketLoggingOutput>> {
    self.backend.put_bucket_logging(req).await
}
6281
6282 // ---- Bucket notification (v0.6 #35) ----
6283 //
6284 // When a `NotificationManager` is attached, S4 itself owns per-bucket
6285 // notification configurations and the PUT / GET handlers route through
6286 // the manager. The wire DTO's queue / topic configurations map onto
6287 // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6288 // EventBridge configurations are accepted on PUT but silently dropped
6289 // (out of scope for v0.6 #35). When no manager is attached the legacy
6290 // backend-passthrough behaviour applies.
6291 async fn get_bucket_notification_configuration(
6292 &self,
6293 req: S3Request<GetBucketNotificationConfigurationInput>,
6294 ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6295 if let Some(mgr) = self.notifications.as_ref() {
6296 let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6297 let dto = notif_to_dto(&cfg);
6298 return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6299 event_bridge_configuration: dto.event_bridge_configuration,
6300 lambda_function_configurations: dto.lambda_function_configurations,
6301 queue_configurations: dto.queue_configurations,
6302 topic_configurations: dto.topic_configurations,
6303 }));
6304 }
6305 self.backend
6306 .get_bucket_notification_configuration(req)
6307 .await
6308 }
6309 async fn put_bucket_notification_configuration(
6310 &self,
6311 req: S3Request<PutBucketNotificationConfigurationInput>,
6312 ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6313 if let Some(mgr) = self.notifications.as_ref() {
6314 let cfg = notif_from_dto(&req.input.notification_configuration);
6315 mgr.put(&req.input.bucket, cfg);
6316 return Ok(S3Response::new(
6317 PutBucketNotificationConfigurationOutput::default(),
6318 ));
6319 }
6320 self.backend
6321 .put_bucket_notification_configuration(req)
6322 .await
6323 }
6324
6325 // ---- Bucket request payment ----
/// Handles `GetBucketRequestPayment` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn get_bucket_request_payment(
    &self,
    req: S3Request<GetBucketRequestPaymentInput>,
) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
    self.backend.get_bucket_request_payment(req).await
}
/// Handles `PutBucketRequestPayment` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn put_bucket_request_payment(
    &self,
    req: S3Request<PutBucketRequestPaymentInput>,
) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
    self.backend.put_bucket_request_payment(req).await
}
6338
6339 // ---- Bucket website ----
/// Handles `GetBucketWebsite` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn get_bucket_website(
    &self,
    req: S3Request<GetBucketWebsiteInput>,
) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
    self.backend.get_bucket_website(req).await
}
/// Handles `PutBucketWebsite` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn put_bucket_website(
    &self,
    req: S3Request<PutBucketWebsiteInput>,
) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
    self.backend.put_bucket_website(req).await
}
/// Handles `DeleteBucketWebsite` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn delete_bucket_website(
    &self,
    req: S3Request<DeleteBucketWebsiteInput>,
) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
    self.backend.delete_bucket_website(req).await
}
6358
6359 // ---- Bucket replication (v0.6 #40) ----
6360 async fn get_bucket_replication(
6361 &self,
6362 req: S3Request<GetBucketReplicationInput>,
6363 ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6364 if let Some(mgr) = self.replication.as_ref() {
6365 return match mgr.get(&req.input.bucket) {
6366 Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6367 replication_configuration: Some(replication_to_dto(&cfg)),
6368 })),
6369 None => Err(S3Error::with_message(
6370 S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6371 format!(
6372 "no replication configuration on bucket {}",
6373 req.input.bucket
6374 ),
6375 )),
6376 };
6377 }
6378 self.backend.get_bucket_replication(req).await
6379 }
6380 async fn put_bucket_replication(
6381 &self,
6382 req: S3Request<PutBucketReplicationInput>,
6383 ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6384 if let Some(mgr) = self.replication.as_ref() {
6385 let cfg = replication_from_dto(&req.input.replication_configuration);
6386 mgr.put(&req.input.bucket, cfg);
6387 return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6388 }
6389 self.backend.put_bucket_replication(req).await
6390 }
6391 async fn delete_bucket_replication(
6392 &self,
6393 req: S3Request<DeleteBucketReplicationInput>,
6394 ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6395 if let Some(mgr) = self.replication.as_ref() {
6396 mgr.delete(&req.input.bucket);
6397 return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6398 }
6399 self.backend.delete_bucket_replication(req).await
6400 }
6401
6402 // ---- Bucket accelerate ----
/// Handles `GetBucketAccelerateConfiguration` by delegating to the
/// wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn get_bucket_accelerate_configuration(
    &self,
    req: S3Request<GetBucketAccelerateConfigurationInput>,
) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
    self.backend.get_bucket_accelerate_configuration(req).await
}
/// Handles `PutBucketAccelerateConfiguration` by delegating to the
/// wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn put_bucket_accelerate_configuration(
    &self,
    req: S3Request<PutBucketAccelerateConfigurationInput>,
) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
    self.backend.put_bucket_accelerate_configuration(req).await
}
6415
6416 // ---- Bucket ownership controls ----
/// Handles `GetBucketOwnershipControls` by delegating to the wrapped
/// backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn get_bucket_ownership_controls(
    &self,
    req: S3Request<GetBucketOwnershipControlsInput>,
) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
    self.backend.get_bucket_ownership_controls(req).await
}
/// Handles `PutBucketOwnershipControls` by delegating to the wrapped
/// backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn put_bucket_ownership_controls(
    &self,
    req: S3Request<PutBucketOwnershipControlsInput>,
) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
    self.backend.put_bucket_ownership_controls(req).await
}
/// Handles `DeleteBucketOwnershipControls` by delegating to the wrapped
/// backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn delete_bucket_ownership_controls(
    &self,
    req: S3Request<DeleteBucketOwnershipControlsInput>,
) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
    self.backend.delete_bucket_ownership_controls(req).await
}
6435
6436 // ---- Public access block ----
/// Handles `GetPublicAccessBlock` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn get_public_access_block(
    &self,
    req: S3Request<GetPublicAccessBlockInput>,
) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
    self.backend.get_public_access_block(req).await
}
/// Handles `PutPublicAccessBlock` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn put_public_access_block(
    &self,
    req: S3Request<PutPublicAccessBlockInput>,
) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
    self.backend.put_public_access_block(req).await
}
/// Handles `DeletePublicAccessBlock` by delegating to the wrapped backend.
///
/// The request is forwarded as received and the backend's response (or
/// error) is returned unchanged.
async fn delete_public_access_block(
    &self,
    req: S3Request<DeletePublicAccessBlockInput>,
) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
    self.backend.delete_public_access_block(req).await
}
6455
6456 // ====================================================================
6457 // v0.6 #41: S3 Select — server-side SQL filter on object body.
6458 //
6459 // Fetch the object via the regular `get_object` path (so SSE-C /
6460 // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
6461 // run a small SQL subset (CSV + JSON Lines, equality / inequality /
6462 // LIKE / AND / OR / NOT) over the in-memory body, and stream the
6463 // matched rows back as AWS event-stream `Records` + `Stats` + `End`
6464 // frames.
6465 //
6466 // Limitations (deliberate, documented):
6467 // - Parquet input is rejected with NotImplemented.
6468 // - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
6469 // parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
6470 // domain-specific `InvalidSqlExpression` code).
6471 // - The body is fully buffered before SQL evaluation (S3 Select
6472 // streaming-during-evaluation is v0.7 scope).
6473 // - GPU-accelerated WHERE evaluation is stubbed out (always None).
6474 async fn select_object_content(
6475 &self,
6476 req: S3Request<SelectObjectContentInput>,
6477 ) -> S3Result<S3Response<SelectObjectContentOutput>> {
6478 use crate::select::{
6479 EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
6480 run_select_jsonlines,
6481 };
6482
6483 let select_bucket = req.input.bucket.clone();
6484 let select_key = req.input.key.clone();
6485 self.enforce_rate_limit(&req, &select_bucket)?;
6486 self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
6487
6488 let request = req.input.request;
6489 let sql = request.expression.clone();
6490 if request.expression_type.as_str() != "SQL" {
6491 return Err(S3Error::with_message(
6492 S3ErrorCode::InvalidExpressionType,
6493 format!(
6494 "ExpressionType must be SQL, got: {}",
6495 request.expression_type.as_str()
6496 ),
6497 ));
6498 }
6499
6500 let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
6501 SelectInputFormat::JsonLines
6502 } else if let Some(csv) = request.input_serialization.csv.as_ref() {
6503 let has_header = csv
6504 .file_header_info
6505 .as_ref()
6506 .map(|h| {
6507 let s = h.as_str();
6508 s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
6509 })
6510 .unwrap_or(false);
6511 let delim = csv
6512 .field_delimiter
6513 .as_deref()
6514 .and_then(|s| s.chars().next())
6515 .unwrap_or(',');
6516 SelectInputFormat::Csv {
6517 has_header,
6518 delimiter: delim,
6519 }
6520 } else if request.input_serialization.parquet.is_some() {
6521 return Err(S3Error::with_message(
6522 S3ErrorCode::NotImplemented,
6523 "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
6524 ));
6525 } else {
6526 return Err(S3Error::with_message(
6527 S3ErrorCode::InvalidRequest,
6528 "InputSerialization requires exactly one of CSV / JSON / Parquet",
6529 ));
6530 };
6531 if let Some(ct) = request.input_serialization.compression_type.as_ref()
6532 && !ct.as_str().eq_ignore_ascii_case("NONE")
6533 {
6534 return Err(S3Error::with_message(
6535 S3ErrorCode::NotImplemented,
6536 format!(
6537 "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
6538 ct.as_str()
6539 ),
6540 ));
6541 }
6542
6543 let output_format = if request.output_serialization.json.is_some() {
6544 SelectOutputFormat::Json
6545 } else if request.output_serialization.csv.is_some() {
6546 SelectOutputFormat::Csv
6547 } else {
6548 return Err(S3Error::with_message(
6549 S3ErrorCode::InvalidRequest,
6550 "OutputSerialization requires exactly one of CSV / JSON",
6551 ));
6552 };
6553
6554 let get_input = GetObjectInput {
6555 bucket: select_bucket.clone(),
6556 key: select_key.clone(),
6557 sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
6558 sse_customer_key: req.input.sse_customer_key.clone(),
6559 sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
6560 ..Default::default()
6561 };
6562 let get_req = S3Request {
6563 input: get_input,
6564 method: http::Method::GET,
6565 uri: format!("/{}/{}", select_bucket, select_key)
6566 .parse()
6567 .map_err(|e| {
6568 S3Error::with_message(
6569 S3ErrorCode::InternalError,
6570 format!("constructing inner GET URI: {e}"),
6571 )
6572 })?,
6573 headers: http::HeaderMap::new(),
6574 extensions: http::Extensions::new(),
6575 credentials: req.credentials.clone(),
6576 region: req.region.clone(),
6577 service: req.service.clone(),
6578 trailing_headers: None,
6579 };
6580 let mut get_resp = self.get_object(get_req).await?;
6581 let blob = get_resp.output.body.take().ok_or_else(|| {
6582 S3Error::with_message(
6583 S3ErrorCode::InternalError,
6584 "Select: object body was empty after GET",
6585 )
6586 })?;
6587 let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
6588 .await
6589 .map_err(internal("collect Select body"))?;
6590 let scanned = body_bytes.len() as u64;
6591
6592 let matched_payload = match input_format {
6593 SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
6594 .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
6595 SelectInputFormat::Csv { .. } => {
6596 run_select_csv(&sql, &body_bytes, input_format, output_format)
6597 .map_err(|e| select_error_to_s3(e, "CSV"))?
6598 }
6599 };
6600
6601 let returned = matched_payload.len() as u64;
6602 let processed = scanned;
6603 let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
6604 if !matched_payload.is_empty() {
6605 events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
6606 payload: Some(bytes::Bytes::from(matched_payload)),
6607 })));
6608 }
6609 events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
6610 details: Some(Stats {
6611 bytes_scanned: Some(scanned as i64),
6612 bytes_processed: Some(processed as i64),
6613 bytes_returned: Some(returned as i64),
6614 }),
6615 })));
6616 events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
6617 // Touch EventStreamWriter so the public API stays linked into the
6618 // build (the actual wire framing is delegated to s3s).
6619 let _writer = EventStreamWriter::new();
6620
6621 let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
6622 let output = SelectObjectContentOutput {
6623 payload: Some(stream),
6624 };
6625 Ok(S3Response::new(output))
6626 }
6627
6628 // ---- Bucket Inventory configuration (v0.6 #36) ----
6629 //
6630 // When an `InventoryManager` is attached, S4-server owns the
6631 // configuration store and these handlers no longer pass through to
6632 // the backend. The mapping between the s3s-typed
6633 // `InventoryConfiguration` and the inventory module's internal
6634 // `InventoryConfig` is intentionally lossy: only the fields S4
6635 // actually uses for periodic CSV emission survive the round trip
6636 // (id, source bucket, destination bucket / prefix, format, included
// versions, schedule frequency). Optional fields, encryption, and
// filter prefixes are accepted on PUT but not stored; GET renders
// them in their default (absent) shape, so the response is
// well-formed but does not echo those fields back.
6641 async fn put_bucket_inventory_configuration(
6642 &self,
6643 req: S3Request<PutBucketInventoryConfigurationInput>,
6644 ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
6645 if let Some(mgr) = self.inventory.as_ref() {
6646 let cfg = inv_from_dto(
6647 &req.input.bucket,
6648 &req.input.id,
6649 &req.input.inventory_configuration,
6650 );
6651 mgr.put(cfg);
6652 return Ok(S3Response::new(
6653 PutBucketInventoryConfigurationOutput::default(),
6654 ));
6655 }
6656 self.backend.put_bucket_inventory_configuration(req).await
6657 }
6658
6659 async fn get_bucket_inventory_configuration(
6660 &self,
6661 req: S3Request<GetBucketInventoryConfigurationInput>,
6662 ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
6663 if let Some(mgr) = self.inventory.as_ref() {
6664 let cfg = mgr.get(&req.input.bucket, &req.input.id);
6665 if let Some(cfg) = cfg {
6666 let out = GetBucketInventoryConfigurationOutput {
6667 inventory_configuration: Some(inv_to_dto(&cfg)),
6668 };
6669 return Ok(S3Response::new(out));
6670 }
6671 // AWS returns `NoSuchConfiguration` (404) when the id has no
6672 // matching inventory configuration on the bucket. The
6673 // generated `S3ErrorCode` enum doesn't expose a typed variant
6674 // for this code, so we round-trip through `from_bytes` which
6675 // wraps unknown codes as `Custom(...)` (= the AWS-canonical
6676 // error-code string survives into the XML response envelope).
6677 let code =
6678 S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
6679 return Err(S3Error::with_message(
6680 code,
6681 format!(
6682 "no inventory configuration with id={} on bucket={}",
6683 req.input.id, req.input.bucket
6684 ),
6685 ));
6686 }
6687 self.backend.get_bucket_inventory_configuration(req).await
6688 }
6689
6690 async fn list_bucket_inventory_configurations(
6691 &self,
6692 req: S3Request<ListBucketInventoryConfigurationsInput>,
6693 ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
6694 if let Some(mgr) = self.inventory.as_ref() {
6695 let list = mgr.list_for_bucket(&req.input.bucket);
6696 let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
6697 let out = ListBucketInventoryConfigurationsOutput {
6698 continuation_token: req.input.continuation_token.clone(),
6699 inventory_configuration_list: if dto_list.is_empty() {
6700 None
6701 } else {
6702 Some(dto_list)
6703 },
6704 is_truncated: Some(false),
6705 next_continuation_token: None,
6706 };
6707 return Ok(S3Response::new(out));
6708 }
6709 self.backend.list_bucket_inventory_configurations(req).await
6710 }
6711
6712 async fn delete_bucket_inventory_configuration(
6713 &self,
6714 req: S3Request<DeleteBucketInventoryConfigurationInput>,
6715 ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
6716 if let Some(mgr) = self.inventory.as_ref() {
6717 mgr.delete(&req.input.bucket, &req.input.id);
6718 return Ok(S3Response::new(
6719 DeleteBucketInventoryConfigurationOutput::default(),
6720 ));
6721 }
6722 self.backend
6723 .delete_bucket_inventory_configuration(req)
6724 .await
6725 }
6726}
6727
6728// ---------------------------------------------------------------------------
6729// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
6730// surface) and our internal `crate::inventory::InventoryConfig`. Only the
6731// fields S4 actually uses for CSV emission survive the round trip; the
6732// missing fields (filter prefix, optional fields, encryption) are dropped on
6733// PUT and re-rendered as the AWS-default shape on GET so the client sees a
6734// well-formed `InventoryConfiguration`.
6735// ---------------------------------------------------------------------------
6736
6737fn inv_from_dto(
6738 bucket: &str,
6739 id: &str,
6740 dto: &InventoryConfiguration,
6741) -> crate::inventory::InventoryConfig {
6742 let frequency_hours = match dto.schedule.frequency.as_str() {
6743 "Weekly" => 24 * 7,
6744 // Daily is the default; anything S4 doesn't recognise (incl.
6745 // empty, which is the s3s-default) maps to Daily so the
6746 // operator's PUT doesn't silently turn into a no-op cadence.
6747 _ => 24,
6748 };
6749 // Parquet/ORC are not supported (issue #36 scope); we still accept
6750 // the PUT so callers don't fail-loud, but we record CSV and rely on
6751 // the operator catching the discrepancy on GET.
6752 let format = crate::inventory::InventoryFormat::Csv;
6753 crate::inventory::InventoryConfig {
6754 id: id.to_owned(),
6755 bucket: bucket.to_owned(),
6756 destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
6757 destination_prefix: dto
6758 .destination
6759 .s3_bucket_destination
6760 .prefix
6761 .clone()
6762 .unwrap_or_default(),
6763 frequency_hours,
6764 format,
6765 included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
6766 dto.included_object_versions.as_str(),
6767 ),
6768 }
6769}
6770
6771fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
6772 InventoryConfiguration {
6773 id: cfg.id.clone(),
6774 is_enabled: true,
6775 included_object_versions: InventoryIncludedObjectVersions::from(
6776 cfg.included_object_versions.as_aws_str().to_owned(),
6777 ),
6778 destination: InventoryDestination {
6779 s3_bucket_destination: InventoryS3BucketDestination {
6780 account_id: None,
6781 bucket: cfg.destination_bucket.clone(),
6782 encryption: None,
6783 format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
6784 prefix: if cfg.destination_prefix.is_empty() {
6785 None
6786 } else {
6787 Some(cfg.destination_prefix.clone())
6788 },
6789 },
6790 },
6791 schedule: InventorySchedule {
6792 // `frequency_hours == 168` -> Weekly; everything else maps to
6793 // Daily for the wire response (the manager keeps the precise
6794 // hour count internally for due-checking).
6795 frequency: InventoryFrequency::from(
6796 if cfg.frequency_hours == 24 * 7 {
6797 "Weekly"
6798 } else {
6799 "Daily"
6800 }
6801 .to_owned(),
6802 ),
6803 },
6804 filter: None,
6805 optional_fields: None,
6806 }
6807}
6808
6809// ---------------------------------------------------------------------------
6810// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
6811// wire surface) and our internal `crate::notifications::NotificationConfig`.
6812//
6813// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
6814// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
6815// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
6816// surfaces topic / queue rules.
6817//
6818// The webhook destination has no AWS-native wire form: operators configure
6819// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
6820// poking `NotificationManager::put` directly from a custom binary. This
6821// keeps the wire surface AWS-compatible while still letting the always-
6822// available `Webhook` destination be reachable.
6823// ---------------------------------------------------------------------------
6824
6825fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
6826 let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
6827 if let Some(topics) = dto.topic_configurations.as_ref() {
6828 for (idx, t) in topics.iter().enumerate() {
6829 let events = events_from_dto(&t.events);
6830 let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
6831 rules.push(crate::notifications::NotificationRule {
6832 id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
6833 events,
6834 destination: crate::notifications::Destination::Sns {
6835 topic_arn: t.topic_arn.clone(),
6836 },
6837 filter_prefix: prefix,
6838 filter_suffix: suffix,
6839 });
6840 }
6841 }
6842 if let Some(queues) = dto.queue_configurations.as_ref() {
6843 for (idx, q) in queues.iter().enumerate() {
6844 let events = events_from_dto(&q.events);
6845 let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
6846 rules.push(crate::notifications::NotificationRule {
6847 id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
6848 events,
6849 destination: crate::notifications::Destination::Sqs {
6850 queue_arn: q.queue_arn.clone(),
6851 },
6852 filter_prefix: prefix,
6853 filter_suffix: suffix,
6854 });
6855 }
6856 }
6857 crate::notifications::NotificationConfig { rules }
6858}
6859
6860fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
6861 let mut topics: Vec<TopicConfiguration> = Vec::new();
6862 let mut queues: Vec<QueueConfiguration> = Vec::new();
6863 for rule in &cfg.rules {
6864 let events: Vec<Event> = rule
6865 .events
6866 .iter()
6867 .map(|e| Event::from(e.as_aws_str().to_owned()))
6868 .collect();
6869 let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
6870 match &rule.destination {
6871 crate::notifications::Destination::Sns { topic_arn } => {
6872 topics.push(TopicConfiguration {
6873 events,
6874 filter,
6875 id: Some(rule.id.clone()),
6876 topic_arn: topic_arn.clone(),
6877 });
6878 }
6879 crate::notifications::Destination::Sqs { queue_arn } => {
6880 queues.push(QueueConfiguration {
6881 events,
6882 filter,
6883 id: Some(rule.id.clone()),
6884 queue_arn: queue_arn.clone(),
6885 });
6886 }
6887 // Webhook destinations have no AWS wire equivalent — they
6888 // round-trip through the JSON snapshot only. Skip them on the
6889 // GET surface (an SDK consumer wouldn't know what to do with
6890 // them anyway).
6891 crate::notifications::Destination::Webhook { .. } => {}
6892 }
6893 }
6894 NotificationConfiguration {
6895 event_bridge_configuration: None,
6896 lambda_function_configurations: None,
6897 queue_configurations: if queues.is_empty() {
6898 None
6899 } else {
6900 Some(queues)
6901 },
6902 topic_configurations: if topics.is_empty() {
6903 None
6904 } else {
6905 Some(topics)
6906 },
6907 }
6908}
6909
6910fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
6911 events
6912 .iter()
6913 .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
6914 .collect()
6915}
6916
6917fn filter_from_dto(
6918 f: Option<&NotificationConfigurationFilter>,
6919) -> (Option<String>, Option<String>) {
6920 let Some(f) = f else {
6921 return (None, None);
6922 };
6923 let Some(key) = f.key.as_ref() else {
6924 return (None, None);
6925 };
6926 let Some(rules) = key.filter_rules.as_ref() else {
6927 return (None, None);
6928 };
6929 let mut prefix = None;
6930 let mut suffix = None;
6931 for r in rules {
6932 let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
6933 let value = r.value.clone();
6934 match name.as_deref() {
6935 Some("prefix") => prefix = value,
6936 Some("suffix") => suffix = value,
6937 _ => {}
6938 }
6939 }
6940 (prefix, suffix)
6941}
6942
6943fn filter_to_dto(
6944 prefix: Option<&str>,
6945 suffix: Option<&str>,
6946) -> Option<NotificationConfigurationFilter> {
6947 if prefix.is_none() && suffix.is_none() {
6948 return None;
6949 }
6950 let mut rules: Vec<FilterRule> = Vec::new();
6951 if let Some(p) = prefix {
6952 rules.push(FilterRule {
6953 name: Some(FilterRuleName::from("prefix".to_owned())),
6954 value: Some(p.to_owned()),
6955 });
6956 }
6957 if let Some(s) = suffix {
6958 rules.push(FilterRule {
6959 name: Some(FilterRuleName::from("suffix".to_owned())),
6960 value: Some(s.to_owned()),
6961 });
6962 }
6963 Some(NotificationConfigurationFilter {
6964 key: Some(S3KeyFilter {
6965 filter_rules: Some(rules),
6966 }),
6967 })
6968}
6969
6970// ---------------------------------------------------------------------------
6971// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
6972// wire surface) and our internal `crate::replication::ReplicationConfig`.
6973// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
6974// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
6975// the matcher needs. Sub-blocks v0.6 #40 does not implement
6976// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
6977// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
6978// who set them on PUT see them silently dropped, mirroring "feature not
6979// supported in this release" semantics.
6980// ---------------------------------------------------------------------------
6981
6982fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
6983 let rules = dto
6984 .rules
6985 .iter()
6986 .enumerate()
6987 .map(|(idx, r)| {
6988 let id =
6989 r.id.as_ref()
6990 .map(|s| s.as_str().to_owned())
6991 .unwrap_or_else(|| format!("rule-{idx}"));
6992 let priority = r.priority.unwrap_or(0).max(0) as u32;
6993 let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
6994 let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
6995 let destination_bucket = r.destination.bucket.clone();
6996 let destination_storage_class = r
6997 .destination
6998 .storage_class
6999 .as_ref()
7000 .map(|s| s.as_str().to_owned());
7001 crate::replication::ReplicationRule {
7002 id,
7003 priority,
7004 status_enabled,
7005 filter,
7006 destination_bucket,
7007 destination_storage_class,
7008 }
7009 })
7010 .collect();
7011 crate::replication::ReplicationConfig {
7012 role: dto.role.clone(),
7013 rules,
7014 }
7015}
7016
7017fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
7018 let rules = cfg
7019 .rules
7020 .iter()
7021 .map(|r| {
7022 let status = if r.status_enabled {
7023 ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
7024 } else {
7025 ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
7026 };
7027 let destination = Destination {
7028 access_control_translation: None,
7029 account: None,
7030 bucket: r.destination_bucket.clone(),
7031 encryption_configuration: None,
7032 metrics: None,
7033 replication_time: None,
7034 storage_class: r
7035 .destination_storage_class
7036 .as_ref()
7037 .map(|s| StorageClass::from(s.clone())),
7038 };
7039 let filter = Some(replication_filter_to_dto(&r.filter));
7040 ReplicationRule {
7041 delete_marker_replication: None,
7042 destination,
7043 existing_object_replication: None,
7044 filter,
7045 id: Some(r.id.clone()),
7046 prefix: None,
7047 priority: Some(r.priority as i32),
7048 source_selection_criteria: None,
7049 status,
7050 }
7051 })
7052 .collect();
7053 ReplicationConfiguration {
7054 role: cfg.role.clone(),
7055 rules,
7056 }
7057}
7058
7059fn replication_filter_from_dto(
7060 f: Option<&ReplicationRuleFilter>,
7061 rule_level_prefix: Option<&str>,
7062) -> crate::replication::ReplicationFilter {
7063 let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
7064 let mut tags: Vec<(String, String)> = Vec::new();
7065 if let Some(f) = f {
7066 if let Some(p) = f.prefix.as_ref()
7067 && prefix.is_none()
7068 {
7069 prefix = Some(p.clone());
7070 }
7071 if let Some(t) = f.tag.as_ref()
7072 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7073 {
7074 tags.push((k.clone(), v.clone()));
7075 }
7076 if let Some(and) = f.and.as_ref() {
7077 if let Some(p) = and.prefix.as_ref()
7078 && prefix.is_none()
7079 {
7080 prefix = Some(p.clone());
7081 }
7082 if let Some(ts) = and.tags.as_ref() {
7083 for t in ts {
7084 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7085 tags.push((k.clone(), v.clone()));
7086 }
7087 }
7088 }
7089 }
7090 }
7091 crate::replication::ReplicationFilter { prefix, tags }
7092}
7093
7094fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
7095 if f.tags.is_empty() {
7096 ReplicationRuleFilter {
7097 and: None,
7098 prefix: f.prefix.clone(),
7099 tag: None,
7100 }
7101 } else if f.tags.len() == 1 && f.prefix.is_none() {
7102 let (k, v) = &f.tags[0];
7103 ReplicationRuleFilter {
7104 and: None,
7105 prefix: None,
7106 tag: Some(Tag {
7107 key: Some(k.clone()),
7108 value: Some(v.clone()),
7109 }),
7110 }
7111 } else {
7112 let tags: Vec<Tag> = f
7113 .tags
7114 .iter()
7115 .map(|(k, v)| Tag {
7116 key: Some(k.clone()),
7117 value: Some(v.clone()),
7118 })
7119 .collect();
7120 ReplicationRuleFilter {
7121 and: Some(ReplicationRuleAndOperator {
7122 prefix: f.prefix.clone(),
7123 tags: Some(tags),
7124 }),
7125 prefix: None,
7126 tag: None,
7127 }
7128 }
7129}
7130
7131// ---------------------------------------------------------------------------
7132// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
7133// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
7134// The internal representation flattens AWS's "Filter | And" disjunction
7135// into a single `LifecycleFilter` struct of optional fields plus a tag
7136// vector. Fields S4's evaluator does not consume
7137// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
7138// `transition_default_minimum_object_size`, the storage class on the
7139// noncurrent expiration) are dropped on PUT and re-rendered as their
7140// AWS-default shape on GET so the client always sees a well-formed
7141// configuration.
7142// ---------------------------------------------------------------------------
7143
7144fn dto_lifecycle_to_internal(
7145 dto: &BucketLifecycleConfiguration,
7146) -> crate::lifecycle::LifecycleConfig {
7147 crate::lifecycle::LifecycleConfig {
7148 rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
7149 }
7150}
7151
7152fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
7153 let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
7154 let filter = rule
7155 .filter
7156 .as_ref()
7157 .map(dto_filter_to_internal)
7158 .unwrap_or_default();
7159 let expiration_days = rule
7160 .expiration
7161 .as_ref()
7162 .and_then(|e| e.days)
7163 .and_then(|d| u32::try_from(d).ok());
7164 let expiration_date = rule
7165 .expiration
7166 .as_ref()
7167 .and_then(|e| e.date.as_ref())
7168 .and_then(timestamp_to_chrono_utc);
7169 let transitions: Vec<crate::lifecycle::TransitionRule> = rule
7170 .transitions
7171 .as_ref()
7172 .map(|ts| {
7173 ts.iter()
7174 .filter_map(|t| {
7175 let days = u32::try_from(t.days?).ok()?;
7176 let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
7177 Some(crate::lifecycle::TransitionRule {
7178 days,
7179 storage_class,
7180 })
7181 })
7182 .collect()
7183 })
7184 .unwrap_or_default();
7185 let noncurrent_version_expiration_days = rule
7186 .noncurrent_version_expiration
7187 .as_ref()
7188 .and_then(|n| n.noncurrent_days)
7189 .and_then(|d| u32::try_from(d).ok());
7190 let abort_incomplete_multipart_upload_days = rule
7191 .abort_incomplete_multipart_upload
7192 .as_ref()
7193 .and_then(|a| a.days_after_initiation)
7194 .and_then(|d| u32::try_from(d).ok());
7195 crate::lifecycle::LifecycleRule {
7196 id: rule.id.clone().unwrap_or_default(),
7197 status,
7198 filter,
7199 expiration_days,
7200 expiration_date,
7201 transitions,
7202 noncurrent_version_expiration_days,
7203 abort_incomplete_multipart_upload_days,
7204 }
7205}
7206
7207fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
7208 let mut prefix = filter.prefix.clone();
7209 let mut tags: Vec<(String, String)> = Vec::new();
7210 let mut size_gt: Option<u64> = filter
7211 .object_size_greater_than
7212 .and_then(|n| u64::try_from(n).ok());
7213 let mut size_lt: Option<u64> = filter
7214 .object_size_less_than
7215 .and_then(|n| u64::try_from(n).ok());
7216 if let Some(t) = &filter.tag
7217 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7218 {
7219 tags.push((k.clone(), v.clone()));
7220 }
7221 if let Some(and) = &filter.and {
7222 if prefix.is_none() {
7223 prefix = and.prefix.clone();
7224 }
7225 if size_gt.is_none() {
7226 size_gt = and
7227 .object_size_greater_than
7228 .and_then(|n| u64::try_from(n).ok());
7229 }
7230 if size_lt.is_none() {
7231 size_lt = and
7232 .object_size_less_than
7233 .and_then(|n| u64::try_from(n).ok());
7234 }
7235 if let Some(ts) = &and.tags {
7236 for t in ts {
7237 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7238 tags.push((k.clone(), v.clone()));
7239 }
7240 }
7241 }
7242 }
7243 crate::lifecycle::LifecycleFilter {
7244 prefix,
7245 tags,
7246 object_size_greater_than: size_gt,
7247 object_size_less_than: size_lt,
7248 }
7249}
7250
7251fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
7252 let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
7253 Some(LifecycleExpiration {
7254 date: rule.expiration_date.map(chrono_utc_to_timestamp),
7255 days: rule.expiration_days.map(|d| d as i32),
7256 expired_object_delete_marker: None,
7257 })
7258 } else {
7259 None
7260 };
7261 let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
7262 None
7263 } else {
7264 Some(
7265 rule.transitions
7266 .iter()
7267 .map(|t| Transition {
7268 date: None,
7269 days: Some(t.days as i32),
7270 storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
7271 })
7272 .collect(),
7273 )
7274 };
7275 let noncurrent_version_expiration =
7276 rule.noncurrent_version_expiration_days
7277 .map(|d| NoncurrentVersionExpiration {
7278 newer_noncurrent_versions: None,
7279 noncurrent_days: Some(d as i32),
7280 });
7281 let abort_incomplete_multipart_upload =
7282 rule.abort_incomplete_multipart_upload_days
7283 .map(|d| AbortIncompleteMultipartUpload {
7284 days_after_initiation: Some(d as i32),
7285 });
7286 let filter = if rule.filter.tags.is_empty()
7287 && rule.filter.object_size_greater_than.is_none()
7288 && rule.filter.object_size_less_than.is_none()
7289 {
7290 rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7291 and: None,
7292 object_size_greater_than: None,
7293 object_size_less_than: None,
7294 prefix: Some(p.clone()),
7295 tag: None,
7296 })
7297 } else if rule.filter.tags.len() == 1
7298 && rule.filter.prefix.is_none()
7299 && rule.filter.object_size_greater_than.is_none()
7300 && rule.filter.object_size_less_than.is_none()
7301 {
7302 let (k, v) = rule.filter.tags[0].clone();
7303 Some(LifecycleRuleFilter {
7304 and: None,
7305 object_size_greater_than: None,
7306 object_size_less_than: None,
7307 prefix: None,
7308 tag: Some(Tag {
7309 key: Some(k),
7310 value: Some(v),
7311 }),
7312 })
7313 } else {
7314 let tags = if rule.filter.tags.is_empty() {
7315 None
7316 } else {
7317 Some(
7318 rule.filter
7319 .tags
7320 .iter()
7321 .map(|(k, v)| Tag {
7322 key: Some(k.clone()),
7323 value: Some(v.clone()),
7324 })
7325 .collect(),
7326 )
7327 };
7328 Some(LifecycleRuleFilter {
7329 and: Some(LifecycleRuleAndOperator {
7330 object_size_greater_than: rule
7331 .filter
7332 .object_size_greater_than
7333 .and_then(|n| i64::try_from(n).ok()),
7334 object_size_less_than: rule
7335 .filter
7336 .object_size_less_than
7337 .and_then(|n| i64::try_from(n).ok()),
7338 prefix: rule.filter.prefix.clone(),
7339 tags,
7340 }),
7341 object_size_greater_than: None,
7342 object_size_less_than: None,
7343 prefix: None,
7344 tag: None,
7345 })
7346 };
7347 LifecycleRule {
7348 abort_incomplete_multipart_upload,
7349 expiration,
7350 filter,
7351 id: if rule.id.is_empty() {
7352 None
7353 } else {
7354 Some(rule.id.clone())
7355 },
7356 noncurrent_version_expiration,
7357 noncurrent_version_transitions: None,
7358 prefix: None,
7359 status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7360 transitions,
7361 }
7362}
7363
7364// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7365// `chrono_utc_to_timestamp` are defined earlier in this file for the
7366// tagging/notifications work; the lifecycle DTO converters reuse them.)
7367
7368// ---------------------------------------------------------------------------
7369// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7370//
7371// Kept as a self-contained block at the bottom of the file so it doesn't
7372// touch the existing `S4Service` struct, `new()`, or any of the per-op
7373// handlers above. The hook is wired in by the binary at server-build time
7374// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7375//
7376// Lifecycle:
7377// 1. `SigV4aGate::new(store)` is constructed once at boot from the
7378// operator-supplied credential directory.
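// 2. For each incoming request, `SigV4aGate::pre_route(&req,
// &requested_region, &canonical_request_bytes)` is invoked BEFORE
// the request hits the S3 framework. If the request claims SigV4a
// and verifies, control returns to the framework. Otherwise
// `pre_route` returns a `SigV4aGateError`, whose `http_status()` /
// `s3_error_code()` describe the rejection to send: 403
// `SignatureDoesNotMatch` by default, 403 `InvalidAccessKeyId` /
// `RequestTimeTooSkewed` where applicable, and 400 `InvalidRequest`
// for malformed date / credential-scope input.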
7384// 3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7385// ---------------------------------------------------------------------------
7386
/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
///
/// Holds a [`crate::sigv4a::SharedSigV4aCredentialStore`] and exposes a
/// `pre_route` entry point that returns `Ok(())` for both
/// "request is plain SigV4 — pass through" and "request is SigV4a and
/// verified", and an `Err(SigV4aGateError)` otherwise. The error carries
/// the HTTP status (403, or 400 for malformed date / scope input) and the
/// S3 error code to answer with. The type derives `Clone`; cloning is
/// expected to be cheap because the store handle is presumably
/// `Arc`-backed (its definition lives in `crate::sigv4a`).
///
/// v0.8.4 #76 (audit H-6): the gate now enforces an `x-amz-date`
/// freshness window (default 15 min, AWS-spec) and a strict credential
/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`), shutting the
/// captured-request replay vector — previously a stolen valid SigV4a
/// signature could be replayed indefinitely (including DELETE).
#[derive(Debug, Clone)]
pub struct SigV4aGate {
    /// Credential store queried by access-key-id for every SigV4a request.
    store: crate::sigv4a::SharedSigV4aCredentialStore,
    /// v0.8.4 #76: how far the request's `x-amz-date` may drift from
    /// the server's clock before being rejected with 403
    /// `RequestTimeTooSkewed`. Matches the AWS S3 spec default of
    /// 15 min when constructed via [`SigV4aGate::new`]; the operator
    /// can override via [`SigV4aGate::with_skew_tolerance`] (CLI flag
    /// `--sigv4a-skew-tolerance-seconds`).
    skew_tolerance: chrono::Duration,
}
7411
7412impl SigV4aGate {
7413 /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7414 pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
7415
7416 #[must_use]
7417 pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
7418 Self {
7419 store,
7420 skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
7421 }
7422 }
7423
7424 /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
7425 /// 15 min). Operators can widen this for high-clock-drift
7426 /// environments or tighten it for compliance regimes that demand
7427 /// stricter freshness.
7428 #[must_use]
7429 pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
7430 self.skew_tolerance = skew;
7431 self
7432 }
7433
7434 /// Read the configured skew tolerance — exposed mostly for test +
7435 /// observability use.
7436 #[must_use]
7437 pub fn skew_tolerance(&self) -> chrono::Duration {
7438 self.skew_tolerance
7439 }
7440
7441 /// Inspect an incoming HTTP request. Behaviour:
7442 ///
7443 /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
7444 /// prefix) → returns `Ok(())`; the framework's existing SigV4
7445 /// path handles the request.
7446 /// - SigV4a + valid signature + region match + fresh x-amz-date
7447 /// → `Ok(())`.
7448 /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
7449 /// - SigV4a + bad signature / region mismatch → `Err` with
7450 /// `SignatureDoesNotMatch`.
7451 /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
7452 /// the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
7453 /// et al.).
7454 ///
7455 /// `canonical_request_bytes` is the SigV4a string-to-sign (or
7456 /// canonical-request bytes; the caller decides) that the framework
7457 /// has already produced for this request. Keeping it as a parameter
7458 /// instead of rebuilding it inside the hook avoids duplicating the
7459 /// canonicalisation logic.
7460 pub fn pre_route<B>(
7461 &self,
7462 req: &http::Request<B>,
7463 requested_region: &str,
7464 canonical_request_bytes: &[u8],
7465 ) -> Result<(), SigV4aGateError> {
7466 self.pre_route_at(
7467 req,
7468 requested_region,
7469 canonical_request_bytes,
7470 chrono::Utc::now(),
7471 )
7472 }
7473
7474 /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
7475 /// tests that need to pin the freshness clock. Production callers
7476 /// use `pre_route` (which calls `chrono::Utc::now()`).
7477 pub fn pre_route_at<B>(
7478 &self,
7479 req: &http::Request<B>,
7480 requested_region: &str,
7481 canonical_request_bytes: &[u8],
7482 now: chrono::DateTime<chrono::Utc>,
7483 ) -> Result<(), SigV4aGateError> {
7484 if !crate::sigv4a::detect(req) {
7485 return Ok(());
7486 }
7487 let auth_hdr = req
7488 .headers()
7489 .get(http::header::AUTHORIZATION)
7490 .and_then(|v| v.to_str().ok())
7491 .ok_or(SigV4aGateError::MissingAuthorization)?;
7492 let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
7493 .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
7494 let region_set = req
7495 .headers()
7496 .get(crate::sigv4a::REGION_SET_HEADER)
7497 .and_then(|v| v.to_str().ok())
7498 .unwrap_or("*");
7499 let key = self
7500 .store
7501 .get(&parsed.access_key_id)
7502 .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
7503 // v0.8.4 #76: snapshot the request headers into a
7504 // lowercase-keyed flat map so `verify_request` can do the
7505 // x-amz-date freshness checks without taking a generic
7506 // `HeaderMap` dep. Cheap because the headers list is tiny.
7507 //
7508 // v0.8.5 #84 (audit H-4): detect duplicate header names while
7509 // we flatten — `HashMap::insert` would silently overwrite the
7510 // first value with the second, mirroring the auth-confusion
7511 // vector the canonical-request builder also defends against.
7512 // Reject upfront so the rest of the gate (freshness check,
7513 // ECDSA verify) never sees a half-truncated header set. We
7514 // detect by checking `contains_key` *before* insertion rather
7515 // than by counting via `headers().get_all`, because the
7516 // upstream `HeaderMap` iteration yields each duplicate entry
7517 // as its own (name, value) pair — the second-seen entry is
7518 // exactly what `contains_key` traps.
7519 let mut header_map: std::collections::HashMap<String, String> =
7520 std::collections::HashMap::with_capacity(req.headers().len());
7521 for (name, value) in req.headers() {
7522 if let Ok(v) = value.to_str() {
7523 let lower = name.as_str().to_ascii_lowercase();
7524 if header_map.contains_key(&lower) {
7525 return Err(SigV4aGateError::Verify(
7526 crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
7527 ));
7528 }
7529 header_map.insert(lower, v.to_string());
7530 }
7531 }
7532 crate::sigv4a::verify_request(
7533 &parsed,
7534 &header_map,
7535 canonical_request_bytes,
7536 key,
7537 region_set,
7538 requested_region,
7539 now,
7540 self.skew_tolerance,
7541 )
7542 .map_err(SigV4aGateError::Verify)?;
7543 Ok(())
7544 }
7545}
7546
/// Failure modes from [`SigV4aGate::pre_route`].
///
/// Most variants map to HTTP 403 with one of the AWS-standard error codes
/// `SignatureDoesNotMatch`, `InvalidAccessKeyId` or `RequestTimeTooSkewed`.
/// The exception is [`SigV4aGateError::Verify`] wrapping a date /
/// credential-scope format failure, which maps to HTTP 400
/// `InvalidRequest`. The exact mapping lives in
/// [`SigV4aGateError::s3_error_code`] and [`SigV4aGateError::http_status`].
#[derive(Debug, thiserror::Error)]
pub enum SigV4aGateError {
    /// The request was detected as SigV4a but has no `Authorization`
    /// header whose value is a valid header string.
    #[error("missing Authorization header")]
    MissingAuthorization,
    /// The `Authorization` header could not be parsed by
    /// `crate::sigv4a::parse_authorization_header`.
    #[error("malformed SigV4a Authorization header")]
    MalformedAuthorization,
    /// The credential store has no entry for the access-key-id named in
    /// the `Authorization` header; the payload is that access-key-id.
    #[error("unknown SigV4a access-key-id: {0}")]
    UnknownAccessKey(String),
    /// `crate::sigv4a::verify_request` rejected the request, or the gate
    /// itself found a duplicated header name while flattening the headers.
    #[error("SigV4a verification failed: {0}")]
    Verify(#[source] crate::sigv4a::SigV4aError),
}
7562
7563impl SigV4aGateError {
7564 /// AWS S3 error code that should accompany the response.
7565 ///
7566 /// v0.8.4 #76 (audit H-6): the freshness check surfaces
7567 /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
7568 /// failures surface as `InvalidRequest` (400); other failures stay
7569 /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
7570 /// surface stays AWS-compatible.
7571 #[must_use]
7572 pub fn s3_error_code(&self) -> &'static str {
7573 match self {
7574 Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
7575 Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
7576 "RequestTimeTooSkewed"
7577 }
7578 Self::Verify(
7579 crate::sigv4a::SigV4aError::MissingXAmzDate
7580 | crate::sigv4a::SigV4aError::InvalidDateFormat
7581 | crate::sigv4a::SigV4aError::DateScopeMismatch
7582 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7583 | crate::sigv4a::SigV4aError::InvalidTerminator
7584 | crate::sigv4a::SigV4aError::WrongService { .. }
7585 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7586 ) => "InvalidRequest",
7587 _ => "SignatureDoesNotMatch",
7588 }
7589 }
7590
7591 /// HTTP status code to accompany the response. v0.8.4 #76: format
7592 /// errors that are clearly client mistakes (missing / malformed
7593 /// `x-amz-date`, malformed credential scope, wrong service) are
7594 /// surfaced as 400 InvalidRequest; the rest stay 403.
7595 #[must_use]
7596 pub fn http_status(&self) -> http::StatusCode {
7597 match self {
7598 Self::Verify(
7599 crate::sigv4a::SigV4aError::MissingXAmzDate
7600 | crate::sigv4a::SigV4aError::InvalidDateFormat
7601 | crate::sigv4a::SigV4aError::DateScopeMismatch
7602 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
7603 | crate::sigv4a::SigV4aError::InvalidTerminator
7604 | crate::sigv4a::SigV4aError::WrongService { .. }
7605 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
7606 ) => http::StatusCode::BAD_REQUEST,
7607 _ => http::StatusCode::FORBIDDEN,
7608 }
7609 }
7610}
7611
// Unit tests for the free-standing helpers of this file (manifest
// metadata round-trip, copy-source range parsing, `safe_object_uri`) plus
// shape / regression guards for a few `S4Service` builder and background
// task contracts.
#[cfg(test)]
mod tests {
    use super::*;

    /// A manifest written into empty metadata via `write_manifest` must be
    /// read back field-for-field by `extract_manifest`.
    #[test]
    fn manifest_roundtrip_via_metadata() {
        let original = ChunkManifest {
            codec: CodecKind::CpuZstd,
            original_size: 1234,
            compressed_size: 567,
            crc32c: 0xdead_beef,
        };
        let mut meta: Option<Metadata> = None;
        write_manifest(&mut meta, &original);
        let extracted = extract_manifest(&meta).expect("manifest must round-trip");
        assert_eq!(extracted.codec, original.codec);
        assert_eq!(extracted.original_size, original.original_size);
        assert_eq!(extracted.compressed_size, original.compressed_size);
        assert_eq!(extracted.crc32c, original.crc32c);
    }

    /// With no metadata at all, `extract_manifest` yields `None`.
    #[test]
    fn missing_metadata_yields_none() {
        let meta: Option<Metadata> = None;
        assert!(extract_manifest(&meta).is_none());
    }

    /// Metadata holding only the `META_CODEC` entry must not yield a
    /// manifest.
    #[test]
    fn partial_metadata_yields_none() {
        let mut meta = Metadata::new();
        meta.insert(META_CODEC.into(), "cpu-zstd".into());
        let opt = Some(meta);
        assert!(extract_manifest(&opt).is_none());
    }

    /// A closed `bytes=N-M` range parses into `Range::Int` with both ends.
    #[test]
    fn parse_copy_source_range_basic() {
        let r = parse_copy_source_range("bytes=10-20").unwrap();
        match r {
            s3s::dto::Range::Int { first, last } => {
                assert_eq!(first, 10);
                assert_eq!(last, Some(20));
            }
            _ => panic!("expected Int range"),
        }
    }

    /// A range whose end precedes its start is rejected with a message
    /// mentioning `last < first`.
    #[test]
    fn parse_copy_source_range_rejects_inverted() {
        let err = parse_copy_source_range("bytes=20-10").unwrap_err();
        assert!(err.contains("last < first"));
    }

    /// A range without the `bytes=` prefix is rejected with a message
    /// naming the required prefix.
    #[test]
    fn parse_copy_source_range_rejects_missing_prefix() {
        let err = parse_copy_source_range("10-20").unwrap_err();
        assert!(err.contains("must start with 'bytes='"));
    }

    /// Open-ended (`N-`) and suffix (`-N`) forms are both rejected.
    #[test]
    fn parse_copy_source_range_rejects_open_ended() {
        // S3 upload_part_copy spec requires N-M (closed); suffix and
        // open-ended forms are not allowed for this header.
        assert!(parse_copy_source_range("bytes=10-").is_err());
        assert!(parse_copy_source_range("bytes=-10").is_err());
    }

    // v0.7 #49: safe_object_uri must round-trip every legal S3 key
    // (which includes spaces, slashes, control chars, raw UTF-8) into
    // a parseable `http::Uri` instead of panicking like the previous
    // `format!(...).parse().unwrap()` call sites did.

    /// Plain ASCII bucket / key pairs map to `/bucket/key` unchanged.
    #[test]
    fn safe_object_uri_basic_ascii() {
        let uri = safe_object_uri("bucket", "key").expect("ascii must be safe");
        assert_eq!(uri.path(), "/bucket/key");
    }

    /// Spaces in the key are percent-encoded; the bucket prefix is kept.
    #[test]
    fn safe_object_uri_encodes_spaces() {
        let uri = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
        // RFC 3986 path-segment encoding turns ' ' into %20.
        assert!(
            uri.path().contains("%20"),
            "expected percent-encoded space, got {}",
            uri.path()
        );
        assert!(uri.path().starts_with("/bucket/"));
    }

    /// Slashes inside the key survive verbatim.
    #[test]
    fn safe_object_uri_preserves_slashes() {
        // S3 keys legally contain '/' as a logical path separator —
        // the helper must NOT escape it (otherwise the synthetic URI
        // changes the perceived hierarchy).
        let uri = safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
        assert_eq!(uri.path(), "/bucket/key/with/slashes");
    }

    /// A trailing newline in the key must not panic (result is ignored).
    #[test]
    fn safe_object_uri_handles_newline_without_panic() {
        // Newlines are control chars in URIs; whether the result is
        // Ok (encoded as %0A) or Err (parse rejects), the helper
        // MUST NOT panic. Either outcome is acceptable.
        let _ = safe_object_uri("bucket", "key\n");
    }

    /// An embedded NUL in the key must not panic (result is ignored).
    #[test]
    fn safe_object_uri_handles_null_byte_without_panic() {
        let _ = safe_object_uri("bucket", "key\0bad");
    }

    /// Assorted non-ASCII keys must not panic (results are ignored).
    #[test]
    fn safe_object_uri_handles_unicode_without_panic() {
        // RTL override, BOM, plain Japanese — none should panic.
        let _ = safe_object_uri("bucket", "rtl\u{202E}override");
        let _ = safe_object_uri("bucket", "\u{FEFF}bom-key");
        let _ = safe_object_uri("bucket", "日本語キー");
    }

    /// Every single-byte input, passed through lossy UTF-8 decoding, must
    /// not panic (results are ignored).
    #[test]
    fn safe_object_uri_no_panic_for_every_byte() {
        // Exhaustive byte coverage: 0x00..=0xFF as a 1-byte key.
        // None of these may panic. (0x80..=0xFF are not valid UTF-8
        // by themselves; we go through `String::from_utf8_lossy` so
        // the helper sees a real `&str` regardless of the raw byte.)
        for b in 0u8..=255 {
            let s = String::from_utf8_lossy(&[b]).into_owned();
            let _ = safe_object_uri("bucket", &s);
        }
    }

    /// v0.8.1 #58: smoke test for the DEK-handling shape used by the
    /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
    /// Mirrors the call pattern (generate_dek → length check → copy
    /// into stack `[u8; 32]` → reborrow as `&[u8; 32]` for `SseSource`)
    /// without spinning up a full `S4Service`.
    ///
    /// The real assertion this guards against is a regression where
    /// the `Zeroizing` wrapper is accidentally dropped before the
    /// stack copy lands (e.g. someone refactors to use
    /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
    /// or where `&**dek` is rewritten in a way that doesn't compile.
    #[tokio::test]
    async fn kms_dek_lifetime_within_function_scope() {
        use crate::kms::{KmsBackend, LocalKms};
        use std::collections::HashMap;
        use std::path::PathBuf;
        use zeroize::Zeroizing;

        // Single in-memory KEK registered under the id "scope".
        let mut keks = HashMap::new();
        keks.insert("scope".to_string(), [33u8; 32]);
        let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);

        // Mirror the put_object KMS branch shape exactly.
        let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
        assert_eq!(dek.len(), 32);
        let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
        dek_arr.copy_from_slice(&dek);

        // The reborrow used at the SseSource construction site —
        // mirrors the call-site pattern where `let dek_ref: &[u8; 32]`
        // auto-derefs from a `Zeroizing<[u8; 32]>` reference.
        let dek_ref: &[u8; 32] = &dek_arr;
        // Sanity: the reborrow points at the same bytes.
        assert_eq!(dek_ref, &*dek_arr);
        // Wrapped key id flows through unchanged.
        assert_eq!(wrapped.key_id, "scope");

        // At end of scope, both `dek` (Zeroizing<Vec<u8>>) and
        // `dek_arr` (Zeroizing<[u8; 32]>) are dropped, wiping the
        // backing memory. Cannot directly assert the wipe (would be
        // UB to read freed memory), so this test instead enforces
        // that the call shape compiles and executes; the wipe itself
        // is exercised by the `zeroize` crate's own test suite.
    }

    /// v0.8.5 #86 (audit M-2): the replication dispatcher must
    /// `acquire_owned()` a permit from `replication_semaphore` before
    /// kicking off the destination PUT, so a saturated semaphore
    /// back-pressures the in-flight queue depth instead of letting it
    /// grow without bound. We exercise the field directly (initial
    /// permit count, override via `with_replication_max_concurrent`,
    /// permit drop on `Drop`) — the full `spawn_replication_if_matched`
    /// integration is exercised by the existing replication tests in
    /// `tests/feature_e2e.rs` once a `ReplicationManager` is attached.
    #[tokio::test]
    async fn replication_semaphore_caps_concurrent_dispatchers() {
        // Build a minimal `S4Service` directly — no handler path is
        // exercised, only the constructor + setter + accessor shape.
        let registry = Arc::new(
            CodecRegistry::new(CodecKind::Passthrough)
                .with(Arc::new(s4_codec::passthrough::Passthrough)),
        );
        let dispatcher = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
            CodecKind::Passthrough,
        ));
        let s4 = S4Service::new(NoopBackend, registry, dispatcher);

        // Default cap matches the documented constant.
        assert_eq!(
            s4.replication_semaphore().available_permits(),
            S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
            "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
        );

        // Override via the builder — replaces the underlying `Semaphore`.
        let s4 = s4.with_replication_max_concurrent(2);
        assert_eq!(
            s4.replication_semaphore().available_permits(),
            2,
            "with_replication_max_concurrent(2) must expose exactly 2 permits"
        );

        // Acquiring permits must reduce `available_permits()` and
        // dropping them must restore the count — this is the contract
        // `spawn_replication_if_matched` relies on for back-pressure.
        let sem = Arc::clone(s4.replication_semaphore());
        let p1 = sem.clone().acquire_owned().await.expect("permit 1");
        let p2 = sem.clone().acquire_owned().await.expect("permit 2");
        assert_eq!(
            sem.available_permits(),
            0,
            "two acquired permits must zero `available_permits()`"
        );
        // A third `try_acquire_owned` must fail — the cap is enforced
        // synchronously, no extra spawn slips through.
        assert!(
            sem.clone().try_acquire_owned().is_err(),
            "third acquire must back-pressure: cap was 2"
        );
        drop(p1);
        drop(p2);
        assert_eq!(
            sem.available_permits(),
            2,
            "dropping permits must restore cap"
        );

        // Lower-bound clamp: a 0 cap would deadlock all dispatchers,
        // so the setter clamps it to 1 instead of accepting it
        // (callers are warned in the CLI doc).
        let s4 = s4.with_replication_max_concurrent(0);
        assert_eq!(
            s4.replication_semaphore().available_permits(),
            1,
            "cap=0 must be clamped to 1 to avoid total deadlock"
        );
    }

    /// v0.8.5 #86 (audit M-1): the access-log flusher must return a
    /// `JoinHandle<()>` that the caller can `abort()` on shutdown
    /// without leaving a dangling task. The pre-#86 call site dropped
    /// the handle at end-of-block (silently detaching it); the fix is
    /// hoisting it into a process-lived `Vec` so the graceful-shutdown
    /// branch in `main.rs` can wait for clean exit. This test exercises
    /// the `JoinHandle.abort()` shape directly so a future refactor that
    /// stops returning the handle (or returns a non-abortable wrapper)
    /// trips this regression guard.
    #[tokio::test]
    async fn flusher_handle_can_be_aborted_cleanly() {
        // Stand up a minimal `AccessLog` pointing at a per-run tmp dir
        // (process id + nanosecond timestamp keep the name unique) so the
        // flusher's `create_dir_all` succeeds. The dir is removed
        // best-effort at the end of this test (it is left behind if an
        // assertion fails first); we don't assert on the contents.
        let tmp = std::env::temp_dir().join(format!(
            "s4-86-flusher-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map(|d| d.as_nanos())
                .unwrap_or(0)
        ));
        let dest = crate::access_log::AccessLogDest { dir: tmp.clone() };
        let log = crate::access_log::AccessLog::new(dest);
        let handle = log.spawn_flusher(None);
        assert!(
            !handle.is_finished(),
            "freshly-spawned flusher must not yet be finished"
        );
        handle.abort();
        // `await`-ing an aborted handle returns `Err(JoinError)` whose
        // `is_cancelled()` is true.
        let join_result = handle.await;
        assert!(
            join_result.is_err(),
            "aborted flusher must surface JoinError, got Ok"
        );
        assert!(
            join_result.unwrap_err().is_cancelled(),
            "JoinError must report .is_cancelled() = true after abort()"
        );
        // Best-effort cleanup; a failure to remove the dir is ignored.
        let _ = std::fs::remove_dir_all(&tmp);
    }

    /// Stub backend used solely by the v0.8.5 #86 unit tests above —
    /// the `S4Service` constructor needs `B: S3` but the tests only
    /// exercise builder / accessor shape, never a handler call. Every
    /// `S3` method falls through to the trait's default
    /// `NotImplemented` (which `s3s` provides automatically).
    struct NoopBackend;

    #[async_trait::async_trait]
    impl S3 for NoopBackend {}

    /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the
    /// dispatcher spawn site must intercept a panicking inner future,
    /// log at ERROR, and bump the per-kind counter — instead of letting
    /// the panic propagate as a `JoinError` that no operator dashboard
    /// scrapes. We exercise the wrapper directly (rather than driving a
    /// full `spawn_replication_if_matched` end-to-end, which would
    /// require a full `S4Service` + backend) because the wrapper shape
    /// is the load-bearing piece — any inner-future swap would still
    /// route through the same `AssertUnwindSafe(...).catch_unwind()`
    /// closure we want to lock in here.
    #[tokio::test]
    async fn dispatcher_panic_caught_and_metric_bumped() {
        use futures::FutureExt as _;

        let handle = crate::metrics::test_metrics_handle();
        let kind = "replication";

        // Mirror the production wrapper shape verbatim — if the
        // production code ever stops using `AssertUnwindSafe.catch_unwind`
        // this test shouldn't keep passing on a hand-rolled copy that
        // diverged.
        let panicking = async {
            panic!("simulated dispatcher panic");
        };
        let result = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
        assert!(
            result.is_err(),
            "catch_unwind must surface the panic instead of swallowing it"
        );
        // Bump the production counter via the same helper the wrapper
        // calls so the rendered output gates on the production code
        // path, not a parallel bookkeeping copy.
        crate::metrics::record_dispatcher_panic(kind);

        // The rendered metrics text must contain both the counter name
        // and the `kind` label recorded above.
        let rendered = handle.render();
        assert!(
            rendered.contains("s4_dispatcher_panics_total"),
            "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
        );
        assert!(
            rendered.contains("kind=\"replication\""),
            "expected kind=\"replication\" label in metrics output, got: {rendered}"
        );
    }
}