s4_server/service.rs
1//! `s3s::S3` 実装 — `s3s_aws::Proxy` への delegation を default にしつつ、
2//! `put_object` / `get_object` 経路で `s4_codec::CodecRegistry` を呼ぶ。
3//!
4//! ## カバー範囲 (Phase 1 月 2)
5//!
6//! - 圧縮 hook あり: `put_object`, `get_object`
7//! - 純 delegation (圧縮なし): `head_bucket`, `list_buckets`, `create_bucket`, `delete_bucket`,
8//! `head_object`, `delete_object`, `delete_objects`, `copy_object`, `list_objects`,
9//! `list_objects_v2`, `create_multipart_upload`, `upload_part`,
10//! `complete_multipart_upload`, `abort_multipart_upload`, `list_multipart_uploads`,
11//! `list_parts`
12//! - 未対応 (デフォルトで NotImplemented): その他 80+ ops (Tagging / ACL / Lifecycle 等は Phase 2)
13//!
14//! ## アーキテクチャ
15//!
16//! - `S4Service<B>` は backend (B: S3) と `Arc<CodecRegistry>` と `Arc<dyn CodecDispatcher>`
17//! を保持する。`CodecRegistry` 経由で複数 codec を抱えられるので、ひとつの S4 インスタンスが
18//! 複数 codec で書かれた object を透過的に GET できる
19//! - PUT: dispatcher が body の先頭 sample から codec を選び、registry で compress、
20//! manifest を S3 metadata に書いて backend に forward
21//! - GET: backend から取得 → metadata から manifest を復元 → registry.decompress で
22//! manifest 指定の codec で解凍 → 元の bytes を return
23//!
24//! ## 既知の制限事項
25//!
26//! - **Multipart Upload は per-part 圧縮が未実装**: 現状は upload_part を素通し。
27//! Phase 1 月 2 後半で per-part compress + complete_multipart_upload で manifest 集約。
28//! - **PUT body は memory に collect**: max_body_bytes 上限あり (default 5 GiB = S3 単発 PUT 上限)。
29//! Streaming-aware 圧縮は Phase 2。
30
31use std::sync::Arc;
32
33use base64::Engine as _;
34use bytes::BytesMut;
35use s3s::dto::*;
36use s3s::{S3, S3Error, S3ErrorCode, S3Request, S3Response, S3Result};
37use s4_codec::index::{FrameIndex, build_index_from_body, decode_index, encode_index, sidecar_key};
38use s4_codec::multipart::{
39 FRAME_HEADER_BYTES, FrameHeader, FrameIter, S3_MULTIPART_MIN_PART_BYTES, pad_to_minimum,
40 write_frame,
41};
42use s4_codec::{ChunkManifest, CodecDispatcher, CodecKind, CodecRegistry, CompressTelemetry};
43use std::time::Instant;
44use tracing::{debug, info};
45
46use crate::blob::{
47 bytes_to_blob, chain_sample_with_rest, collect_blob, collect_with_sample, peek_sample,
48};
49use crate::streaming::{
50 Crc32cVerifyingReader, async_read_to_blob, blob_to_async_read, cpu_zstd_decompress_stream,
51 pick_chunk_size, streaming_compress_to_frames, supports_streaming_compress,
52 supports_streaming_decompress,
53};
54
/// Maximum number of bytes handed over when sampling the head of a PUT body.
const SAMPLE_BYTES: usize = 4096;
57
58/// v0.8 #55: stamp the GPU pipeline metrics (`s4_gpu_compress_seconds`,
59/// `s4_gpu_throughput_bytes_per_sec`, `s4_gpu_oom_total`) from a
60/// `CompressTelemetry` returned by `CodecRegistry::compress_with_telemetry`.
61/// CPU codecs (`gpu_seconds = None`) are no-ops here — they're already
62/// covered by the existing `s4_request_latency_seconds` / `s4_bytes_*`
63/// counters in the request-level `record_put` / `record_get` calls.
64#[inline]
65fn stamp_gpu_compress_telemetry(tel: &CompressTelemetry) {
66 if let Some(secs) = tel.gpu_seconds {
67 crate::metrics::record_gpu_compress(tel.codec, secs, tel.bytes_in, tel.bytes_out);
68 }
69 if tel.oom {
70 crate::metrics::record_gpu_oom(tel.codec);
71 }
72}
73
/// v0.7 #49: percent-encoding set used when splicing a bucket name or
/// object key into a synthetic request path.
///
/// The set is `percent_encoding::CONTROLS` plus the ASCII bytes listed
/// below: space, `"`, `#`, `<`, `>`, `?`, `` ` ``, `{`, `}`, `|`, `\`,
/// `^`, `[`, `]` and `%`. These are the bytes the helper escapes so that
/// `http::Uri` accepts the resulting path.
///
/// Note that this is **not** the full complement of the RFC 3986 §2.3
/// `unreserved` set: ASCII bytes that are not listed here (for example
/// `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `@`) are
/// passed through unescaped. Non-ASCII bytes are not part of an
/// `AsciiSet`; per the `percent-encoding` crate documentation they are
/// always escaped by `utf8_percent_encode`.
///
/// `/` is deliberately left un-encoded because S3 keys legally use `/` as
/// a logical separator and the rest of the synthetic URI relies on the
/// path layout `/{bucket}/{key}` round-tripping byte-for-byte.
const URI_KEY_ENCODE_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS
    .add(b' ')
    .add(b'"')
    .add(b'#')
    .add(b'<')
    .add(b'>')
    .add(b'?')
    .add(b'`')
    .add(b'{')
    .add(b'}')
    .add(b'|')
    .add(b'\\')
    .add(b'^')
    .add(b'[')
    .add(b']')
    .add(b'%');
97
98/// v0.7 #49: build the synthetic `/{bucket}/{key}` request URI used by
99/// the sidecar / replication helpers when they re-enter the backend
100/// trait without going through the HTTP layer. S3 object keys can
101/// contain spaces, control bytes, and arbitrary Unicode that would
102/// make `format!(...).parse::<http::Uri>()` panic; we percent-encode
103/// the key bytes (RFC 3986 path segment) and the bucket name (defensive
104/// — bucket names are normally DNS-safe, but the helper is the single
105/// choke-point) before splicing them in. If the encoded form *still*
106/// fails to parse (extremely unlikely once everything outside the
107/// unreserved set is escaped) we surface a typed `400 InvalidObjectName`
108/// instead of crashing the worker.
109pub(crate) fn safe_object_uri(bucket: &str, key: &str) -> S3Result<http::Uri> {
110 use percent_encoding::utf8_percent_encode;
111 let bucket_enc = utf8_percent_encode(bucket, URI_KEY_ENCODE_SET);
112 let key_enc = utf8_percent_encode(key, URI_KEY_ENCODE_SET);
113 let raw = format!("/{bucket_enc}/{key_enc}");
114 raw.parse::<http::Uri>().map_err(|e| {
115 // S3 spec uses `InvalidObjectName` (HTTP 400) for keys that
116 // can't be represented in a request URI. The generated
117 // `S3ErrorCode` enum doesn't expose a typed variant for it,
118 // so we round-trip through `from_bytes` which preserves the
119 // canonical wire string while falling back to InvalidArgument
120 // if even that lookup fails (cannot happen at runtime — kept
121 // as a belt-and-suspenders branch so this helper never
122 // panics).
123 let code =
124 S3ErrorCode::from_bytes(b"InvalidObjectName").unwrap_or(S3ErrorCode::InvalidArgument);
125 S3Error::with_message(
126 code,
127 format!("object key cannot be encoded as a request URI: {e}"),
128 )
129 })
130}
131
132/// v0.8.12 HIGH-12 fix: verify a client-supplied integrity checksum
133/// against the received body BEFORE we strip the header on the way
134/// to the backend. Returns `Err(BadDigest)` on mismatch (matches
135/// AWS S3 wire behaviour); `Ok(())` when the supplied digest matches
136/// OR when the supplied algorithm is one we don't yet implement
137/// (the latter is logged so operators see the gap — fail-open on
138/// unsupported algorithms is the documented trade in the v0.8.11
139/// CHANGELOG, with full coverage tracked as a follow-up issue).
140///
141/// Algorithms covered: `Content-MD5` (base64 MD5),
142/// `x-amz-checksum-crc32c` (base64 big-endian u32),
143/// `x-amz-checksum-sha256` (base64 SHA-256). The remaining S3
144/// checksum algorithms (CRC32 non-Castagnoli, SHA-1, CRC64-NVME)
145/// are accepted and silently passed; verifying them needs new
146/// dependencies and was held back to keep the v0.8.12 surface
147/// bounded.
148#[allow(clippy::too_many_arguments)]
149fn verify_client_body_checksums(
150 body: &[u8],
151 content_md5_b64: Option<&str>,
152 checksum_crc32_b64: Option<&str>,
153 checksum_crc32c_b64: Option<&str>,
154 checksum_sha1_b64: Option<&str>,
155 checksum_sha256_b64: Option<&str>,
156 checksum_crc64nvme_b64: Option<&str>,
157) -> S3Result<()> {
158 use base64::Engine as _;
159 use md5::Md5;
160 use sha2::Sha256;
161 // `Digest` from md-5 / sha2 brings the `new`, `update`, `finalize`
162 // trait methods into scope. Bind anonymously so this `use` is
163 // never flagged as unused while still serving its real purpose.
164 use md5::Digest as _;
165 let b64 = base64::engine::general_purpose::STANDARD;
166 let bad = |what: &str| {
167 let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
168 S3Error::with_message(
169 code,
170 format!("client-supplied {what} did not match the received body"),
171 )
172 };
173 if let Some(claimed) = content_md5_b64 {
174 let want = b64.decode(claimed).map_err(|_| {
175 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed Content-MD5")
176 })?;
177 if want.len() != 16 {
178 return Err(S3Error::with_message(
179 S3ErrorCode::InvalidDigest,
180 "Content-MD5 must decode to 16 bytes",
181 ));
182 }
183 let mut h = Md5::new();
184 h.update(body);
185 let got = h.finalize();
186 // `subtle::ConstantTimeEq` would be ideal but the existing
187 // `constant_time_eq` helper in sse.rs is private; use a
188 // straightforward byte compare. The attacker doesn't get to
189 // choose the body retroactively, so a timing oracle here
190 // doesn't help them. `&got[..]` derefs the GenericArray
191 // into a `&[u8]` (the deprecated `.as_slice()` is gone in
192 // generic-array 1.x; CI runs `-D warnings`).
193 if got[..] != *want.as_slice() {
194 return Err(bad("Content-MD5"));
195 }
196 }
197 if let Some(claimed) = checksum_crc32c_b64 {
198 let want = b64.decode(claimed).map_err(|_| {
199 S3Error::with_message(
200 S3ErrorCode::InvalidDigest,
201 "malformed x-amz-checksum-crc32c",
202 )
203 })?;
204 if want.len() != 4 {
205 return Err(S3Error::with_message(
206 S3ErrorCode::InvalidDigest,
207 "x-amz-checksum-crc32c must decode to 4 bytes (big-endian u32)",
208 ));
209 }
210 let got = crc32c::crc32c(body).to_be_bytes();
211 if got != want.as_slice() {
212 return Err(bad("x-amz-checksum-crc32c"));
213 }
214 }
215 if let Some(claimed) = checksum_sha256_b64 {
216 let want = b64.decode(claimed).map_err(|_| {
217 S3Error::with_message(
218 S3ErrorCode::InvalidDigest,
219 "malformed x-amz-checksum-sha256",
220 )
221 })?;
222 if want.len() != 32 {
223 return Err(S3Error::with_message(
224 S3ErrorCode::InvalidDigest,
225 "x-amz-checksum-sha256 must decode to 32 bytes",
226 ));
227 }
228 let mut h = Sha256::new();
229 h.update(body);
230 let got = h.finalize();
231 if got[..] != *want.as_slice() {
232 return Err(bad("x-amz-checksum-sha256"));
233 }
234 }
235 // v0.8.12 #128 (MED-C): CRC32 (IEEE 802.3 — the non-Castagnoli
236 // variant AWS uses for `x-amz-checksum-crc32`). 4-byte
237 // big-endian value, base64-encoded.
238 if let Some(claimed) = checksum_crc32_b64 {
239 let want = b64.decode(claimed).map_err(|_| {
240 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-crc32")
241 })?;
242 if want.len() != 4 {
243 return Err(S3Error::with_message(
244 S3ErrorCode::InvalidDigest,
245 "x-amz-checksum-crc32 must decode to 4 bytes (big-endian u32)",
246 ));
247 }
248 let mut h = crc32fast::Hasher::new();
249 h.update(body);
250 let got = h.finalize().to_be_bytes();
251 if got != want.as_slice() {
252 return Err(bad("x-amz-checksum-crc32"));
253 }
254 }
255 // v0.8.12 #128 (MED-C): SHA-1. 20-byte digest, base64-encoded.
256 if let Some(claimed) = checksum_sha1_b64 {
257 use sha1::Sha1;
258 let want = b64.decode(claimed).map_err(|_| {
259 S3Error::with_message(S3ErrorCode::InvalidDigest, "malformed x-amz-checksum-sha1")
260 })?;
261 if want.len() != 20 {
262 return Err(S3Error::with_message(
263 S3ErrorCode::InvalidDigest,
264 "x-amz-checksum-sha1 must decode to 20 bytes",
265 ));
266 }
267 let mut h = Sha1::new();
268 h.update(body);
269 let got = h.finalize();
270 if got[..] != *want.as_slice() {
271 return Err(bad("x-amz-checksum-sha1"));
272 }
273 }
274 // v0.8.12 #128 (MED-C): CRC64-NVME — AWS's newest checksum
275 // algorithm. NVMe spec: poly 0xad93d23594c93659, init / xorout
276 // 0xffffffffffffffff, refin / refout true. The reflected
277 // polynomial + 256-entry lookup table are computed lazily on
278 // first call (small enough to inline rather than pull in a
279 // dedicated crc64 crate).
280 if let Some(claimed) = checksum_crc64nvme_b64 {
281 let want = b64.decode(claimed).map_err(|_| {
282 S3Error::with_message(
283 S3ErrorCode::InvalidDigest,
284 "malformed x-amz-checksum-crc64nvme",
285 )
286 })?;
287 if want.len() != 8 {
288 return Err(S3Error::with_message(
289 S3ErrorCode::InvalidDigest,
290 "x-amz-checksum-crc64nvme must decode to 8 bytes (big-endian u64)",
291 ));
292 }
293 let got = crc64_nvme(body).to_be_bytes();
294 if got != want.as_slice() {
295 return Err(bad("x-amz-checksum-crc64nvme"));
296 }
297 }
298 Ok(())
299}
300
301/// v0.9 #106-audit-R2 P2-INT-2: verify SigV4-streaming **trailer**-supplied
302/// checksums against an already-finalised [`ComputedDigests`].
303///
304/// Shared between the streaming-framed branch (digests computed via the
305/// tee wrapper) and the buffered branch (digests computed in one shot
306/// over the in-memory body via [`crate::streaming_checksum::compute_digests`]).
307/// Centralising the logic prevents the pre-#106 fail-open shape —
308/// where one branch verified trailers and the other silently skipped
309/// them — from regressing. Both branches now go through the same
310/// announce-parsing / fail-closed / per-name `compare_b64` pipeline.
311///
312/// Fail-closed posture (matches the streaming branch's behaviour):
313///
314/// - No `x-amz-trailer` header → returns Ok (no verification claimed).
315/// - Header announces only non-checksum trailers (`x-amz-trailer-signature`,
316/// custom) → returns Ok (filter selects checksum names only).
317/// - Header announces `x-amz-checksum-*` but the trailing-headers handle
318/// was absent → `BadDigest`.
319/// - Handle present but trailers were never delivered (`read` returns
320/// None) → `BadDigest`.
321/// - Trailer announced but value missing in the delivered block → `BadDigest`.
322/// - Value present but malformed / mismatched / refers to an unhashed
323/// algorithm → `BadDigest` / `InvalidDigest` per [`ComputedDigests::compare_b64`].
324fn verify_client_trailer_checksums(
325 announced: Option<&str>,
326 trailers_handle: Option<&s3s::TrailingHeaders>,
327 computed: &crate::streaming_checksum::ComputedDigests,
328) -> S3Result<()> {
329 let Some(announced) = announced else {
330 return Ok(());
331 };
332 let promised_checksum_trailers: Vec<String> = announced
333 .split(',')
334 .map(|s| s.trim().to_string())
335 .filter(|n| {
336 // RFC 9110 §5.1: HTTP header names are
337 // case-insensitive — match accordingly.
338 n.to_ascii_lowercase().starts_with("x-amz-checksum-")
339 })
340 .collect();
341 if promised_checksum_trailers.is_empty() {
342 return Ok(());
343 }
344 let bad_digest = |msg: String| -> S3Error {
345 let code = S3ErrorCode::from_bytes(b"BadDigest").unwrap_or(S3ErrorCode::InvalidArgument);
346 S3Error::with_message(code, msg)
347 };
348 let Some(th) = trailers_handle else {
349 return Err(bad_digest(
350 "client announced checksum trailer(s) via x-amz-trailer but \
351 no trailing-headers handle was attached to the request"
352 .into(),
353 ));
354 };
355 let result = th.read(|hmap| {
356 for name in &promised_checksum_trailers {
357 match hmap.get(name.as_str()).and_then(|v| v.to_str().ok()) {
358 Some(val) => {
359 computed.compare_b64(name, val)?;
360 }
361 None => {
362 return Err(bad_digest(format!(
363 "client announced trailer {name} via \
364 x-amz-trailer but the trailer value was \
365 missing or unparseable"
366 )));
367 }
368 }
369 }
370 Ok::<(), S3Error>(())
371 });
372 match result {
373 Some(Ok(())) => Ok(()),
374 Some(Err(e)) => Err(e),
375 None => Err(bad_digest(
376 "client announced checksum trailer(s) via x-amz-trailer \
377 but no trailing-headers block was delivered with the body"
378 .into(),
379 )),
380 }
381}
382
/// v0.8.12 #128 (MED-C): CRC-64/NVME, the algorithm behind AWS S3's
/// `x-amz-checksum-crc64nvme`.
///
/// Parameters (NVMe spec): polynomial `0xad93d23594c93659`, initial value
/// `0xffffffffffffffff`, reflected input and output, final XOR
/// `0xffffffffffffffff`. The implementation is the usual byte-at-a-time
/// table lookup over the reflected polynomial; the 256-entry table is
/// built on first use and cached in a [`std::sync::OnceLock`], so later
/// calls only pay for the lookup loop.
///
/// Returns the checksum of `bytes`; the empty slice yields `0`.
fn crc64_nvme(bytes: &[u8]) -> u64 {
    use std::sync::OnceLock;

    // Bit-reversed form of 0xad93d23594c93659.
    const POLY_REFLECTED: u64 = 0x9a6c_9329_ac4b_c9b5;
    static TABLE: OnceLock<[u64; 256]> = OnceLock::new();

    let table = TABLE.get_or_init(|| {
        std::array::from_fn(|byte| {
            // Eight shift/conditional-XOR rounds per table entry.
            (0..8).fold(byte as u64, |acc, _| {
                let shifted = acc >> 1;
                if acc & 1 == 0 {
                    shifted
                } else {
                    shifted ^ POLY_REFLECTED
                }
            })
        })
    });

    let state = bytes.iter().fold(u64::MAX, |crc, &b| {
        // Low byte of the running state selects the table entry.
        let slot = usize::from((crc as u8) ^ b);
        (crc >> 8) ^ table[slot]
    });
    !state
}
419
/// v0.4 #20: captured at the start of a handler, before the request is
/// consumed by the backend call, so the matching `record_access` at
/// end-of-request can fill in the structured access log entry.
///
/// Every field except `request_uri` is optional.
struct AccessLogPreamble {
    // NOTE(review): presumably the client address as seen by the gateway;
    // how it is derived is not visible here — confirm at the capture site.
    remote_ip: Option<String>,
    // NOTE(review): presumably the authenticated principal, absent for
    // anonymous requests — confirm at the capture site.
    requester: Option<String>,
    // Request URI recorded for the log entry.
    request_uri: String,
    // NOTE(review): presumably the `User-Agent` request header, when sent.
    user_agent: Option<String>,
}
429
/// Gateway service state: a wrapped S3 backend `B`, the codec registry and
/// dispatcher used on the PUT / GET compression path, plus a set of
/// optional feature managers.
///
/// Every `Option<...>` field starts out as `None` in [`S4Service::new`];
/// the individual field docs below describe what each one enables and,
/// where stated, the `with_*` builder that turns it on.
pub struct S4Service<B: S3> {
    /// Wrapped in `Arc` so the v0.6 #40 cross-bucket replication
    /// dispatcher can clone it into a detached `tokio::spawn` task
    /// (Arc::clone is cheap; backend trait methods take `&self` so no
    /// other handler is affected by the indirection).
    backend: Arc<B>,
    /// Shared codec registry. Per the module-level docs it performs the
    /// compress on PUT and the manifest-directed decompress on GET.
    registry: Arc<CodecRegistry>,
    /// Codec selection strategy. Per the module-level docs it picks the
    /// codec for a PUT from a sample of the head of the body.
    dispatcher: Arc<dyn CodecDispatcher>,
    /// Upper bound on a body collected in memory. Initialised to
    /// `DEFAULT_MAX_BODY_BYTES` by [`S4Service::new`].
    max_body_bytes: usize,
    /// Optional shared policy handle; `None` by default.
    /// NOTE(review): presumably the IAM-style policy evaluator referred to
    /// by the `secure_transport` / `tagging` field docs — confirm.
    policy: Option<crate::policy::SharedPolicy>,
    /// v0.3 #13: surfaced as the `aws:SecureTransport` Condition key. Set
    /// to `true` when the listener is wrapped in TLS (or ACME), so policies
    /// gating "deny if not over TLS" can do their job. Defaults to `false`
    /// (HTTP); set via [`S4Service::with_secure_transport`] at boot.
    secure_transport: bool,
    /// v0.4 #19: optional per-(principal, bucket) token-bucket limiter.
    rate_limits: Option<crate::rate_limit::SharedRateLimits>,
    /// v0.4 #20: optional S3-style access log emitter.
    access_log: Option<crate::access_log::SharedAccessLog>,
    /// v0.4 #21 / v0.5 #29: optional server-side encryption keyring
    /// (AES-256-GCM). When set, every PUT body gets wrapped in S4E2
    /// (with the keyring's active key id) after the compress + framing
    /// steps; every GET that sniffs as S4E1/S4E2 is decrypted before
    /// frame parsing. A `with_sse_key(...)` call wraps the supplied
    /// key in a 1-slot keyring so single-key (v0.4) operators get the
    /// same behaviour they had before, just on the v2 frame.
    sse_keyring: Option<crate::sse::SharedSseKeyring>,
    /// v0.5 #34: optional first-class versioning state machine. When
    /// `Some(...)`, S4-server itself owns the per-bucket versioning
    /// state + per-(bucket, key) version chain; PUT / GET / DELETE /
    /// list_object_versions / get_bucket_versioning /
    /// put_bucket_versioning handlers consult the manager instead of
    /// passing through. When `None` (default), the legacy
    /// backend-passthrough behaviour applies so existing v0.4
    /// deployments are unaffected until they explicitly call
    /// `with_versioning(...)`.
    versioning: Option<Arc<crate::versioning::VersioningManager>>,
    /// v0.5 #28: optional SSE-KMS envelope-encryption backend. When
    /// `Some(...)`, PUTs carrying `x-amz-server-side-encryption: aws:kms`
    /// generate a fresh DEK via the backend, encrypt the body with it
    /// (S4E4 frame), and persist only the wrapped DEK. GETs sniffing as
    /// S4E4 unwrap the DEK through the same backend before decrypt.
    /// `kms_default_key_id` is used when the request omits an explicit
    /// `x-amz-server-side-encryption-aws-kms-key-id` (mirrors AWS S3
    /// bucket-default behaviour).
    kms: Option<Arc<dyn crate::kms::KmsBackend>>,
    /// Fallback KMS key id used with `kms` (see the `kms` field doc).
    kms_default_key_id: Option<String>,
    /// v0.5 #30: optional Object Lock (WORM) enforcement layer. When
    /// `Some(...)`, `delete_object` and overwrite-style `put_object`
    /// consult the manager and refuse the operation with HTTP 403
    /// `AccessDenied` while the object is locked (Compliance until
    /// expiry, Governance unless the bypass header is set, or any time
    /// a legal hold is on). PUT also auto-applies the bucket-default
    /// retention to brand-new objects when configured. When `None`
    /// (default), the legacy backend-passthrough behaviour applies, so
    /// existing v0.4 deployments are unaffected until they explicitly
    /// call `with_object_lock(...)`.
    object_lock: Option<Arc<crate::object_lock::ObjectLockManager>>,
    /// v0.6 #38: optional first-class CORS bucket configuration manager.
    /// When `Some(...)`, S4-server itself owns per-bucket CORS rules and
    /// `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
    /// consult the manager instead of passing through to the backend.
    /// `handle_preflight` (public method on `S4Service`) routes OPTIONS-
    /// style preflight matching through the same store; the actual HTTP
    /// OPTIONS routing wire-up at the listener level is a follow-up
    /// (s3s framework does not surface OPTIONS as a typed handler).
    cors: Option<Arc<crate::cors::CorsManager>>,
    /// v0.6 #36: optional first-class S3 Inventory manager. When
    /// `Some(...)`, S4-server itself owns per-(bucket, id) inventory
    /// configurations and `put_bucket_inventory_configuration` /
    /// `get_bucket_inventory_configuration` /
    /// `list_bucket_inventory_configurations` /
    /// `delete_bucket_inventory_configuration` consult the manager
    /// instead of passing through to the backend. The actual periodic
    /// CSV emission is driven by a tokio task in `main.rs` that calls
    /// `InventoryManager::run_once_for_test` on a fixed cadence; the
    /// service handlers below only deal with config-level CRUD.
    inventory: Option<Arc<crate::inventory::InventoryManager>>,
    /// v0.6 #35: optional first-class S3 bucket-notification manager.
    /// When `Some(...)`, S4-server itself owns per-bucket notification
    /// configurations and `put_bucket_notification_configuration` /
    /// `get_bucket_notification_configuration` consult the manager
    /// instead of passing through to the backend. Successful PUT /
    /// DELETE handlers fire matching destinations on a detached tokio
    /// task (best-effort; see `crate::notifications::dispatch_event`).
    notifications: Option<Arc<crate::notifications::NotificationManager>>,
    /// v0.6 #37: optional first-class S3 Lifecycle configuration
    /// manager. When `Some(...)`, S4-server itself owns per-bucket
    /// lifecycle rules and `put_bucket_lifecycle_configuration` /
    /// `get_bucket_lifecycle_configuration` /
    /// `delete_bucket_lifecycle` consult the manager instead of
    /// passing through to the backend. The actual background scanner
    /// (list_objects_v2 -> evaluate -> delete / metadata-rewrite per
    /// rule) is a v0.7+ follow-up; the test path
    /// `S4Service::run_lifecycle_once_for_test` exercises the
    /// evaluator end-to-end so this v0.6 #37 wiring is enough to ship
    /// the configuration-management half without putting a
    /// half-wired bucket-walk in front of users.
    lifecycle: Option<Arc<crate::lifecycle::LifecycleManager>>,
    /// v0.6 #39: optional first-class object + bucket Tagging manager.
    /// When `Some(...)`, S4-server itself owns per-(bucket, key) and
    /// per-bucket tag state — `PutObjectTagging` /
    /// `GetObjectTagging` / `DeleteObjectTagging` /
    /// `PutBucketTagging` / `GetBucketTagging` /
    /// `DeleteBucketTagging` route through the manager (replacing the
    /// previous backend-passthrough behaviour). `put_object` also
    /// pre-parses the `x-amz-tagging` header / `Tagging` input field
    /// so the IAM policy evaluator can gate on
    /// `s3:RequestObjectTag/<key>` and `s3:ExistingObjectTag/<key>`.
    /// On a successful PUT the parsed tags are persisted; on a
    /// successful DELETE the matching tag entry is dropped.
    tagging: Option<Arc<crate::tagging::TagManager>>,
    /// v0.6 #40: optional first-class cross-bucket replication manager.
    /// When `Some(...)`, S4-server itself owns per-bucket replication
    /// rules; `PutBucketReplication` / `GetBucketReplication` /
    /// `DeleteBucketReplication` route through the manager (replacing
    /// the previous backend-passthrough behaviour). On every successful
    /// `put_object` the manager's rule list is consulted; the
    /// highest-priority matching enabled rule wins, the per-key status
    /// is recorded as `Pending`, and the source body and metadata are
    /// handed to a detached tokio task that PUTs to the destination
    /// bucket through the same backend. The replica is stamped with
    /// `x-amz-replication-status: REPLICA` in its metadata; the
    /// source-side status is updated to `Completed` on success or
    /// `Failed` after the 3-attempt retry budget is exhausted (drop
    /// counter bumps in either-side case so dashboards see the loss).
    /// `head_object` / `get_object` echo the recorded status back as
    /// `x-amz-replication-status` so consumers can poll progress.
    /// Limited to single-instance (same `S4Service`) replication; true
    /// cross-region (multi-instance) is a v0.7+ follow-up.
    replication: Option<Arc<crate::replication::ReplicationManager>>,
    /// v0.6 #42: optional MFA-Delete enforcement layer. When `Some(...)`,
    /// every DELETE / DELETE-version / delete-marker / `PutBucketVersioning`
    /// request against a bucket whose MFA-Delete state is `Enabled`
    /// must carry `x-amz-mfa: <serial> <code>` (RFC 6238 6-digit TOTP);
    /// missing or invalid tokens return HTTP 403 `AccessDenied`. When
    /// `None` (default), the gate is a no-op so existing v0.4 / v0.5
    /// deployments are unaffected until they explicitly call
    /// `with_mfa_delete(...)`.
    mfa_delete: Option<Arc<crate::mfa::MfaDeleteManager>>,
    /// v0.5 #32: when `true`, every PUT must carry an SSE indicator
    /// (`x-amz-server-side-encryption`, the SSE-C customer-key headers,
    /// or be matched against a configured server-managed keyring/KMS).
    /// Set by `--compliance-mode strict` after the boot-time
    /// prerequisite check passes.
    compliance_strict: bool,
    /// v0.7 #47: optional SigV4a (asymmetric ECDSA-P256-SHA256) verify
    /// gate. When `Some(...)`, the listener-side middleware (see
    /// [`crate::routing::try_sigv4a_verify`]) inspects every incoming
    /// request and short-circuits SigV4a-signed ones — verifying the
    /// signature against the credential store and returning 403
    /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` on failure. Plain
    /// SigV4 (HMAC-SHA256) requests pass through to s3s untouched. When
    /// `None`, the middleware is a no-op so the existing SigV4 path is
    /// unaffected (operators opt in via `--sigv4a-credentials <DIR>`).
    sigv4a_gate: Option<Arc<SigV4aGate>>,
    /// v0.8 #54 BUG-5..10: per-`upload_id` side-table that ferries the
    /// SSE / Tagging / Object-Lock context captured at
    /// `CreateMultipartUpload` time through to `UploadPart` /
    /// `CompleteMultipartUpload`. Always-on (no `with_*` flag) — the
    /// store is gateway-internal and idle when no multipart is in
    /// flight. See [`crate::multipart_state`] for rationale.
    multipart_state: Arc<crate::multipart_state::MultipartStateStore>,
    /// v0.8 #52: plaintext bytes per S4E5 chunk on the SSE-S4 PUT
    /// path. `0` (default) → use the legacy buffered S4E2 path
    /// (whole-body AES-GCM tag, GET buffers + verifies before
    /// emitting). Non-zero → use the chunked S4E5 frame so GET can
    /// stream-decrypt chunk-by-chunk. Wired by `--sse-chunk-size`
    /// in `main.rs`. SSE-C and SSE-KMS are intentionally unaffected
    /// (chunked variants tracked in a follow-up issue).
    sse_chunk_size: usize,
    /// v0.8.5 #86 (audit M-2): bounded permit pool gating the detached
    /// replication dispatcher in [`Self::spawn_replication_if_matched`].
    /// Without this cap, a high-volume PUT workload (1k req/s × N enabled
    /// rules × slow destination = O(10k) in-flight tokio tasks) could
    /// exhaust process memory before the destination drains. Each
    /// dispatcher spawn `acquire_owned`s one permit and holds it for the
    /// lifetime of the destination PUT + status stamp; once the cap is
    /// reached the dispatcher async-blocks on `acquire_owned()` so the
    /// listener path itself never stalls — only the in-flight replica
    /// queue depth is bounded. Default 1024 (operator-tunable via
    /// `--replication-max-concurrent`).
    replication_semaphore: Arc<tokio::sync::Semaphore>,
    /// v0.8.11 CRIT-4 fix: trust the `X-Forwarded-For` header for the
    /// `aws:SourceIp` Condition key only when the operator has
    /// explicitly opted in via `--trust-x-forwarded-for`. Default
    /// (`false`) makes the policy evaluator see `source_ip = None`
    /// for incoming requests, so a public-internet client can no
    /// longer spoof an internal CIDR by setting `X-Forwarded-For`
    /// themselves. Operators behind a trusted reverse proxy that
    /// scrubs / sets `X-Forwarded-For` enable the flag; gateways
    /// listening directly on the public internet leave it off and
    /// gain a clear fail-closed default. A future release plumbs
    /// the TCP peer address through the s3s service trait so we can
    /// validate the forwarded header against a `--trusted-proxies`
    /// CIDR list; until then the boolean opt-in closes the immediate
    /// auth-bypass surface.
    trust_x_forwarded_for: bool,
    /// v0.8.17 G-4 (#161): migration escape hatch. When `true`,
    /// the v0.8.16 F-13 reserved-name guard does NOT block GET /
    /// HEAD / DELETE on keys ending in `.s4index` — the operator
    /// is asserting that the deployment may carry pre-v0.8.15
    /// user objects with that suffix and wants a window to
    /// migrate them off. Writes (PUT / Copy / Create-Multipart)
    /// stay blocked regardless of this flag, so attacker
    /// injection from M-1 / F-13 stays closed. Default
    /// `false` matches the v0.8.16 behaviour.
    allow_legacy_reserved_key_reads: bool,
}
639
/// v0.8.17 G-2: which AWS error shape the reserved-name guard
/// should emit on hit. `Read`-mode endpoints (GET / HEAD /
/// Attributes / Tagging-read) return `NoSuchKey` — consistent
/// with the listing filter hiding the sidecar. `Mutating`-mode
/// endpoints (PUT / Copy / DELETE / Tagging-write / ACL-write)
/// return `InvalidObjectName` so the client sees the suffix is
/// reserved by-design rather than coincidentally missing.
#[derive(Clone, Copy, Debug)]
enum ReservedKeyMode {
    /// Read-style endpoint: a guard hit is reported as `NoSuchKey`.
    Read,
    /// Mutating endpoint: a guard hit is reported as `InvalidObjectName`.
    Mutating,
}
652
653impl<B: S3> S4Service<B> {
    /// API upper limit for a single-request AWS S3 PUT (5 GiB).
    ///
    /// v0.9 #106 (32-bit target support): the constant is gated on
    /// `target_pointer_width` so that 32-bit targets do not hit a
    /// const-overflow. On 32-bit it collapses to `isize::MAX as usize`
    /// (≈ 2 GiB on 32-bit): a single `Vec` / `Bytes` allocation is capped
    /// at `isize::MAX` bytes (not `usize::MAX`), so using `usize::MAX` as
    /// the cap could let a body pass the oversized-body guard and then
    /// panic inside `Vec::with_capacity` (found in Codex review P2).
    /// The s4-server runtime is 64-bit only (README §"Supported targets"),
    /// but this guard keeps workspace-wide `cargo check --target wasm32-*`
    /// and similar from being blocked, and keeps the SSE buffered-decrypt
    /// path from OOM-panicking on a 32-bit build.
    #[cfg(target_pointer_width = "64")]
    pub const DEFAULT_MAX_BODY_BYTES: usize = 5 * 1024 * 1024 * 1024;
    #[cfg(target_pointer_width = "32")]
    pub const DEFAULT_MAX_BODY_BYTES: usize = isize::MAX as usize;

    /// v0.8.5 #86 (audit M-2): default cap on simultaneously-in-flight
    /// replication dispatcher tasks. See the `replication_semaphore`
    /// field doc for the rationale + override path.
    pub const DEFAULT_REPLICATION_MAX_CONCURRENT: usize = 1024;
674
675 pub fn new(
676 backend: B,
677 registry: Arc<CodecRegistry>,
678 dispatcher: Arc<dyn CodecDispatcher>,
679 ) -> Self {
680 Self {
681 backend: Arc::new(backend),
682 registry,
683 dispatcher,
684 max_body_bytes: Self::DEFAULT_MAX_BODY_BYTES,
685 policy: None,
686 secure_transport: false,
687 rate_limits: None,
688 access_log: None,
689 sse_keyring: None,
690 versioning: None,
691 kms: None,
692 kms_default_key_id: None,
693 object_lock: None,
694 cors: None,
695 inventory: None,
696 notifications: None,
697 lifecycle: None,
698 tagging: None,
699 replication: None,
700 mfa_delete: None,
701 compliance_strict: false,
702 sigv4a_gate: None,
703 multipart_state: Arc::new(crate::multipart_state::MultipartStateStore::new()),
704 // v0.8 #52: chunked SSE-S4 disabled by default — opt
705 // in via `S4Service::with_sse_chunk_size(...)` /
706 // `--sse-chunk-size <BYTES>`. Default keeps the legacy
707 // S4E2 buffered path so existing deployments are
708 // bit-for-bit unchanged.
709 sse_chunk_size: 0,
710 // v0.8.5 #86 (audit M-2): default cap of 1024 in-flight
711 // replication tasks. Picked to be (a) ample headroom over a
712 // typical steady-state replication rate (the v0.8.3 #66
713 // status-sweep doc cites 1k keys/hour as a "steady" rate, so
714 // even a 100x burst lands well under 1024), (b) small enough
715 // that the worst-case memory pinned by stalled dispatchers
716 // — body bytes + metadata — stays bounded (1024 × 5 MiB
717 // typical S3 PUT ≈ 5 GiB, recoverable). Operators with
718 // wider cross-region fan-out can override via
719 // `--replication-max-concurrent`.
720 replication_semaphore: Arc::new(tokio::sync::Semaphore::new(
721 Self::DEFAULT_REPLICATION_MAX_CONCURRENT,
722 )),
723 // v0.8.11 CRIT-4: default fail-closed — ignore client-
724 // supplied `X-Forwarded-For` until the operator opts in
725 // through `with_trust_x_forwarded_for(true)`.
726 trust_x_forwarded_for: false,
727 // v0.8.17 G-4: closed by default; opt in via
728 // `with_allow_legacy_reserved_key_reads(true)` for the
729 // migration window only.
730 allow_legacy_reserved_key_reads: false,
731 }
732 }
733
734 /// v0.8.17 G-4: opt in to a migration window where GET / HEAD /
735 /// DELETE on `<key>.s4index` are allowed even though new
736 /// writes against that suffix stay rejected. Used by operators
737 /// upgrading from pre-v0.8.15 deployments that may carry
738 /// legacy user-owned objects with the now-reserved suffix.
739 /// Defaults to `false`; turn off again once the legacy data
740 /// has been migrated.
741 #[must_use]
742 pub fn with_allow_legacy_reserved_key_reads(mut self, on: bool) -> Self {
743 self.allow_legacy_reserved_key_reads = on;
744 self
745 }
746
747 /// v0.8.11 CRIT-4 fix: opt in to consuming the leftmost token of
748 /// the `X-Forwarded-For` header as `aws:SourceIp`. Only enable
749 /// when the gateway sits behind a trusted reverse proxy that
750 /// strips (or rewrites) any client-supplied value. When left
751 /// off (default), the policy evaluator sees `source_ip = None`
752 /// regardless of what the client sends — closing the
753 /// public-internet `X-Forwarded-For: 10.0.0.1` IAM-allowlist
754 /// bypass.
755 #[must_use]
756 pub fn with_trust_x_forwarded_for(mut self, on: bool) -> Self {
757 self.trust_x_forwarded_for = on;
758 self
759 }
760
761 /// v0.7 #47: attach the SigV4a verify gate. Once set, the
762 /// listener-side middleware (`crate::routing::try_sigv4a_verify`)
763 /// short-circuits any incoming `AWS4-ECDSA-P256-SHA256` request,
764 /// verifying it against the supplied credential store and
765 /// returning 403 on failure. Plain SigV4 (HMAC-SHA256) requests
766 /// are unaffected. When the gate is unset (default), the
767 /// middleware skips entirely so existing SigV4 deployments keep
768 /// working.
769 #[must_use]
770 pub fn with_sigv4a_gate(mut self, gate: Arc<SigV4aGate>) -> Self {
771 self.sigv4a_gate = Some(gate);
772 self
773 }
774
775 /// v0.7 #47: borrow the attached SigV4a gate. Used by `main.rs`
776 /// to snapshot the gate `Arc` before the s3s `ServiceBuilder`
777 /// consumes the `S4Service` (the listener-side middleware needs
778 /// the same `Arc` because s3s' SigV4 verifier rejects SigV4a
779 /// algorithm tokens with "unknown algorithm" — match has to
780 /// happen at the hyper layer instead).
781 #[must_use]
782 pub fn sigv4a_gate(&self) -> Option<&Arc<SigV4aGate>> {
783 self.sigv4a_gate.as_ref()
784 }
785
786 /// v0.8.2 #62: borrow the multipart state store so `main.rs` can
787 /// snapshot the `Arc` before the s3s `ServiceBuilder` consumes
788 /// the `S4Service`. The background `sweep_stale` task in `main.rs`
789 /// holds this `Arc` and ticks once an hour to drop abandoned
790 /// upload contexts (and their `Zeroizing<[u8; 32]>` SSE-C keys).
791 #[must_use]
792 pub fn multipart_state(&self) -> &Arc<crate::multipart_state::MultipartStateStore> {
793 &self.multipart_state
794 }
795
796 /// v0.6 #39: attach the in-memory object + bucket Tagging manager.
797 /// Once set, `Put/Get/Delete` `Object/Bucket Tagging` route
798 /// through the manager (instead of forwarding to the backend),
799 /// and `put_object`'s `x-amz-tagging` parse path becomes the
800 /// source of `s3:RequestObjectTag/<key>` for the IAM policy
801 /// evaluator. The manager itself is shared via `Arc`.
802 #[must_use]
803 pub fn with_tagging(mut self, mgr: Arc<crate::tagging::TagManager>) -> Self {
804 self.tagging = Some(mgr);
805 self
806 }
807
808 /// v0.6 #39: borrow the attached tagging manager (test /
809 /// introspection — the snapshotter in `main.rs`, when wired,
810 /// will keep its own `Arc` clone).
811 #[must_use]
812 pub fn tag_manager(&self) -> Option<&Arc<crate::tagging::TagManager>> {
813 self.tagging.as_ref()
814 }
815
816 /// v0.6 #36: attach the in-memory S3 Inventory manager. Once set,
817 /// `put_bucket_inventory_configuration` /
818 /// `get_bucket_inventory_configuration` /
819 /// `list_bucket_inventory_configurations` /
820 /// `delete_bucket_inventory_configuration` route through the
821 /// manager. The actual periodic CSV / manifest emission is
822 /// orchestrated by a tokio task started in `main.rs`; the manager
823 /// itself is shared between the handler and the scheduler via
824 /// `Arc`.
825 #[must_use]
826 pub fn with_inventory(mut self, mgr: Arc<crate::inventory::InventoryManager>) -> Self {
827 self.inventory = Some(mgr);
828 self
829 }
830
831 /// v0.6 #36: borrow the attached inventory manager (test /
832 /// introspection — the background scheduler in `main.rs` keeps its
833 /// own `Arc` clone, so this accessor is for the test path that
834 /// invokes `run_once_for_test` directly).
835 #[must_use]
836 pub fn inventory_manager(&self) -> Option<&Arc<crate::inventory::InventoryManager>> {
837 self.inventory.as_ref()
838 }
839
840 /// v0.6 #37: attach the in-memory S3 Lifecycle configuration
841 /// manager. Once set, `put_bucket_lifecycle_configuration` /
842 /// `get_bucket_lifecycle_configuration` / `delete_bucket_lifecycle`
843 /// route through the manager (replacing the previous backend-
844 /// passthrough behaviour). The actual periodic scanner that walks
845 /// the source bucket and invokes Expiration / Transition /
846 /// NoncurrentExpiration actions is a v0.7+ follow-up — see
847 /// [`Self::run_lifecycle_once_for_test`] for the in-memory test
848 /// path that exercises the evaluator end-to-end.
849 #[must_use]
850 pub fn with_lifecycle(mut self, mgr: Arc<crate::lifecycle::LifecycleManager>) -> Self {
851 self.lifecycle = Some(mgr);
852 self
853 }
854
855 /// v0.6 #37: borrow the attached lifecycle manager (test /
856 /// introspection — the background scheduler in `main.rs` keeps its
857 /// own `Arc` clone, so this accessor is for the test path that
858 /// invokes the evaluator directly).
859 #[must_use]
860 pub fn lifecycle_manager(&self) -> Option<&Arc<crate::lifecycle::LifecycleManager>> {
861 self.lifecycle.as_ref()
862 }
863
864 /// v0.6 #37: synchronous test entry that runs the lifecycle evaluator
865 /// against a caller-provided list of `(key, age, size, tags)` tuples
866 /// and returns the `(key, action)` pairs that should fire. The actual
867 /// backend invocation (S3.delete_object / metadata rewrite) is left
868 /// to the caller — the unit + E2E tests use this to verify the
869 /// evaluator without spawning the (deferred) background scanner.
870 /// Returns an empty `Vec` when no lifecycle manager is attached or
871 /// no rule matches.
872 #[must_use]
873 pub fn run_lifecycle_once_for_test(
874 &self,
875 bucket: &str,
876 objects: &[crate::lifecycle::EvaluateBatchEntry],
877 ) -> Vec<(String, crate::lifecycle::LifecycleAction)> {
878 let Some(mgr) = self.lifecycle.as_ref() else {
879 return Vec::new();
880 };
881 crate::lifecycle::evaluate_batch(mgr, bucket, objects)
882 }
883
884 /// v0.6 #35: attach the in-memory bucket-notification manager. Once
885 /// set, `put_bucket_notification_configuration` /
886 /// `get_bucket_notification_configuration` route through the manager
887 /// (replacing the previous backend-passthrough behaviour); successful
888 /// `put_object` / `delete_object` calls fire matching destinations
889 /// on a detached tokio task via
890 /// `crate::notifications::dispatch_event` (best-effort, fire-and-
891 /// forget — failures bump the manager's `dropped_total` counter and
892 /// log at warn but do NOT fail the originating S3 request).
893 #[must_use]
894 pub fn with_notifications(
895 mut self,
896 mgr: Arc<crate::notifications::NotificationManager>,
897 ) -> Self {
898 self.notifications = Some(mgr);
899 self
900 }
901
902 /// v0.6 #35: borrow the attached notifications manager (test /
903 /// introspection — used by the metrics layer to read
904 /// `dropped_total`).
905 #[must_use]
906 pub fn notifications_manager(&self) -> Option<&Arc<crate::notifications::NotificationManager>> {
907 self.notifications.as_ref()
908 }
909
910 /// v0.6 #35: internal helper used by the DELETE handlers to fire a
911 /// matching notification on a detached tokio task. No-op when no
912 /// manager is attached or no rule on the bucket matches the given
913 /// (event, key) tuple.
914 fn fire_delete_notification(
915 &self,
916 bucket: &str,
917 key: &str,
918 event: crate::notifications::EventType,
919 version_id: Option<String>,
920 ) {
921 let Some(mgr) = self.notifications.as_ref() else {
922 return;
923 };
924 let dests = mgr.match_destinations(bucket, &event, key);
925 if dests.is_empty() {
926 return;
927 }
928 tokio::spawn(crate::notifications::dispatch_event(
929 Arc::clone(mgr),
930 bucket.to_owned(),
931 key.to_owned(),
932 event,
933 None,
934 None,
935 version_id,
936 format!("S4-{}", uuid::Uuid::new_v4()),
937 ));
938 }
939
940 /// v0.6 #40: attach the in-memory cross-bucket replication manager.
941 /// Once set, `put_bucket_replication` / `get_bucket_replication` /
942 /// `delete_bucket_replication` route through the manager (replacing
943 /// the previous backend-passthrough behaviour); a successful
944 /// `put_object` whose key matches an enabled rule fires a detached
945 /// tokio task that PUTs the same body + metadata to the rule's
946 /// destination bucket, stamping the replica with
947 /// `x-amz-replication-status: REPLICA`. Failures after the retry
948 /// budget bump the manager's `dropped_total` counter and are
949 /// surfaced in the `s4_replication_dropped_total` Prometheus
950 /// counter; successes bump `s4_replication_replicated_total`.
951 #[must_use]
952 pub fn with_replication(mut self, mgr: Arc<crate::replication::ReplicationManager>) -> Self {
953 self.replication = Some(mgr);
954 self
955 }
956
957 /// v0.6 #40: borrow the attached replication manager (test /
958 /// introspection — used by the metrics layer to read
959 /// `dropped_total`).
960 #[must_use]
961 pub fn replication_manager(&self) -> Option<&Arc<crate::replication::ReplicationManager>> {
962 self.replication.as_ref()
963 }
964
    /// v0.6 #40: internal helper used by the PUT handlers to fire a
    /// detached cross-bucket replication task. No-op when no manager
    /// is attached, the source backend PUT failed, or no rule on the
    /// source bucket matches the (key, tags) tuple.
    ///
    /// ## Parameters
    ///
    /// - `source_bucket` / `source_key`: the object that was just
    ///   written; used for rule matching, the eager `Pending` status
    ///   stamp and the Object Lock state lookup.
    /// - `request_tags`: tags carried by this PUT, flattened to
    ///   `(key, value)` pairs for the rule matcher.
    /// - `body`: the post-compression / post-encryption `Bytes` that
    ///   was sent to the source backend (refcount-cloned into the
    ///   spawned task).
    /// - `metadata`: the metadata map that already includes the
    ///   manifest / `s4-encrypted` markers — the replica decodes
    ///   through the same path. Cloned into the spawned task.
    /// - `backend_ok`: whether the source backend PUT succeeded; when
    ///   `false` the function returns immediately.
    /// - `pending_version`: source-side `PutOutcome`, see below.
    ///
    /// The destination PUT runs through `Arc<B>::put_object`.
    ///
    /// ## v0.8.2 #61: generation token + shadow-key destination
    ///
    /// `pending_version` is the source-side `PutOutcome` minted by the
    /// caller's versioning branch (or `None` for unversioned /
    /// suspended buckets). When `pending_version.versioned_response`
    /// is `true`, the dispatcher writes the destination under the same
    /// shadow path the source uses (`<key>.__s4ver__/<vid>`) so the
    /// destination's version chain receives the new version the same
    /// way `?versionId=` GET resolves it. Closes audit C-1.
    ///
    /// The dispatcher also mints a fresh `generation` token before
    /// spawning, threaded through to
    /// [`crate::replication::replicate_object`]. Closes audit C-3 — a
    /// stale retry of an older PUT can no longer overwrite the
    /// destination's newer bytes because the CAS guard sees the higher
    /// stored generation and drops its destination write.
    ///
    /// ## Asymmetric versioning policy (out of scope)
    ///
    /// We assume source + destination buckets share the same
    /// versioning policy (both Enabled or both Suspended /
    /// Unversioned). Cross-bucket policy queries would require a
    /// backend round-trip per replication, which is not worth it for
    /// the single-instance scope. Operators who configure asymmetric
    /// versioning will see destination-side `?versionId=` lookups
    /// miss — documented as out-of-scope until a future per-rule
    /// `destination_versioning_policy` knob lands.
    // 8 args is the post-#61 shape: replication needs the
    // source bucket+key, the canonical tag set for rule-matching,
    // the post-codec body+metadata for the destination PUT, the
    // backend-success gate, and the pending version-id for the
    // shadow-key destination override. A shape struct would just
    // split the (single) call site so opt for the inline form.
    #[allow(clippy::too_many_arguments)]
    fn spawn_replication_if_matched(
        &self,
        source_bucket: &str,
        source_key: &str,
        request_tags: &Option<crate::tagging::TagSet>,
        body: &bytes::Bytes,
        metadata: &Option<std::collections::HashMap<String, String>>,
        backend_ok: bool,
        pending_version: Option<&crate::versioning::PutOutcome>,
    ) where
        B: Send + Sync + 'static,
    {
        // Nothing to replicate if the source write itself failed.
        if !backend_ok {
            return;
        }
        // Replication is opt-in: without a manager this is a no-op.
        let Some(mgr) = self.replication.as_ref() else {
            return;
        };
        // Pull the request's tags into the (k, v) shape the matcher
        // expects. The tagging manager would have the canonical
        // post-PUT view but at this point in the pipeline it's
        // already been written above; for the rule-match decision
        // the request's tags are sufficient (= the tags this PUT
        // applies, S3 PutObject is full-replace on tags).
        let object_tags: Vec<(String, String)> = request_tags
            .as_ref()
            .map(|ts| ts.iter().cloned().collect())
            .unwrap_or_default();
        let Some(rule) = mgr.match_rule(source_bucket, source_key, &object_tags) else {
            return;
        };
        // v0.8.2 #61: mint the per-PUT generation BEFORE the eager
        // Pending stamp so the stamp itself carries the right
        // generation (the CAS in `record_status_if_newer` would
        // otherwise see a `generation=0` Pending and accept any
        // stale retry).
        let generation = mgr.next_generation();
        // Eagerly mark the source key as Pending so a HEAD between
        // the source PUT returning and the spawned task completing
        // surfaces the in-flight state. CAS-guarded so a slower
        // older PUT can't downgrade a newer Completed back to Pending.
        // The return value is deliberately ignored here.
        let _ = mgr.record_status_if_newer(
            source_bucket,
            source_key,
            generation,
            crate::replication::ReplicationStatus::Pending,
        );
        // v0.8.2 #61: derive the destination storage key. For a
        // versioning-Enabled source the destination receives the
        // same shadow-key path so a `?versionId=<vid>` GET on the
        // destination resolves through the same lookup the source
        // uses. Suspended / Unversioned sources keep the logical
        // key (= `None` override = dispatcher uses `source_key`).
        let destination_key_override = pending_version
            .filter(|pv| pv.versioned_response)
            .map(|pv| versioned_shadow_key(source_key, &pv.version_id));
        // v0.8.3 #68 (audit M-1): capture the source object's Object
        // Lock state so the dispatcher can decorate the destination
        // PUT with the matching AWS-wire lock headers. Without this,
        // a Compliance / Governance / legal-hold protected source
        // would replicate to a destination where DELETE succeeds
        // (the WORM posture would only hold on the source).
        let source_lock_state = self
            .object_lock
            .as_ref()
            .and_then(|mgr| mgr.get(source_bucket, source_key));
        // v0.8.3 #68: hand the destination-side ObjectLockManager to
        // the dispatcher closure so we can persist the propagated
        // lock state on successful destination PUT (the destination
        // PUT below bypasses S4Service::put_object — we drive the
        // backend directly — so the explicit_lock_mode commit block
        // in put_object never fires for replicas. We replay it here
        // against the destination key.)
        let dest_lock_mgr = self.object_lock.as_ref().map(Arc::clone);
        // Owned copies of everything the detached task needs: the
        // task is `'static`, so it cannot borrow from `self` or from
        // the caller's arguments.
        let mgr_cl = Arc::clone(mgr);
        let backend = Arc::clone(&self.backend);
        let body_cl = body.clone();
        let metadata_cl = metadata.clone();
        let source_bucket_cl = source_bucket.to_owned();
        let source_key_cl = source_key.to_owned();
        let source_lock_state_for_closure = source_lock_state.clone();
        let source_bucket_for_warn = source_bucket.to_owned();
        // v0.8.5 #86 (audit M-2): bound the in-flight replication queue
        // depth. Acquire happens INSIDE the spawned task (not on the
        // listener path) so a saturated semaphore back-pressures the
        // dispatcher pool without stalling the source PUT response —
        // the source has already returned 200 to the client by the time
        // the spawn body runs. A failed `acquire_owned` only happens
        // when the semaphore is closed (we never close it, so the
        // logged-and-skipped fallback is unreachable in practice).
        //
        // NOTE(review): the task is spawned unconditionally and already
        // owns `body_cl` / `metadata_cl` while it waits for a permit,
        // so the semaphore limits concurrently running replications,
        // not the number of tasks parked on `acquire_owned` — confirm
        // that this is the intended memory bound.
        let semaphore = Arc::clone(&self.replication_semaphore);
        tokio::spawn(async move {
            // Held for the rest of the task; released when `_permit`
            // drops at the end of this async block.
            let _permit = match semaphore.acquire_owned().await {
                Ok(p) => p,
                Err(e) => {
                    tracing::warn!(
                        bucket = %source_bucket_cl,
                        key = %source_key_cl,
                        "S4 replication dispatcher could not acquire semaphore permit (closed? {e}); skipping replica"
                    );
                    return;
                }
            };
            // Destination-PUT callback handed to `replicate_object`.
            // Each invocation re-clones its captures so the closure
            // can be called more than once.
            let do_put = move |dest_bucket: String,
                               dest_key: String,
                               dest_body: bytes::Bytes,
                               dest_meta: Option<std::collections::HashMap<String, String>>| {
                let backend = Arc::clone(&backend);
                let dest_lock_mgr = dest_lock_mgr.clone();
                let lock_state = source_lock_state_for_closure.clone();
                let warn_src = source_bucket_for_warn.clone();
                async move {
                    // Synthetic request: only the PutObjectInput
                    // fields below are populated; headers,
                    // credentials, region and service are left empty.
                    let req = S3Request {
                        input: PutObjectInput {
                            bucket: dest_bucket.clone(),
                            key: dest_key.clone(),
                            body: Some(bytes_to_blob(dest_body)),
                            metadata: dest_meta,
                            ..Default::default()
                        },
                        method: http::Method::PUT,
                        uri: "/".parse().unwrap(),
                        headers: http::HeaderMap::new(),
                        extensions: http::Extensions::new(),
                        credentials: None,
                        region: None,
                        service: None,
                        trailing_headers: None,
                    };
                    // The backend response is discarded; errors are
                    // flattened to a `String` for the caller.
                    let put_result = backend
                        .put_object(req)
                        .await
                        .map(|_| ())
                        .map_err(|e| format!("destination put_object: {e}"));
                    // v0.8.3 #68: on successful destination PUT,
                    // persist the propagated lock state into the
                    // destination's ObjectLockManager so a subsequent
                    // DELETE on the destination is refused. Three cases:
                    //   - PUT failed → skip (no replica to protect)
                    //   - lock_state None → nothing to propagate
                    //   - dest manager None (operator misconfig)
                    //     → delegate to `warn_lock_propagation_skipped`
                    //       (presumably warn-once + skip metric; the
                    //       helper is defined outside this block)
                    if put_result.is_ok()
                        && let Some(state) = lock_state
                    {
                        match dest_lock_mgr {
                            Some(ref mgr) => {
                                mgr.set(&dest_bucket, &dest_key, state);
                            }
                            None => {
                                crate::replication::warn_lock_propagation_skipped(
                                    &warn_src,
                                    &dest_bucket,
                                );
                            }
                        }
                    }
                    put_result
                }
            };
            // v0.8.5 #81 (audit H-7): wrap the dispatcher body in
            // `futures::FutureExt::catch_unwind` so a panic inside
            // `replicate_object` (or any of the user-supplied closures
            // it drives — `do_put`, the destination backend, the lock
            // manager) does NOT bubble out of the detached task as a
            // `JoinError` that no operator dashboard scrapes. Caught
            // panics bump `s4_dispatcher_panics_total{kind="replication"}`
            // + log at ERROR with the panic payload, so silent feature
            // degradation (= every replication PUT panicking and
            // dropping the replica without any visible signal) becomes
            // a first-class metric the operator can alert on.
            //
            // `AssertUnwindSafe` is required because the inner future
            // captures `Arc<...>` clones + a `do_put` closure that are
            // not `UnwindSafe` by default; the safety contract here is
            // "we don't continue using any of those captures after the
            // panic" which trivially holds (we drop them and return).
            use futures::FutureExt as _;
            let dispatcher_kind = "replication";
            let fut = crate::replication::replicate_object(
                rule,
                source_bucket_cl,
                source_key_cl,
                body_cl,
                metadata_cl,
                do_put,
                mgr_cl,
                generation,
                destination_key_override,
                source_lock_state,
            );
            if let Err(panic) = std::panic::AssertUnwindSafe(fut).catch_unwind().await {
                // Panic payloads are `&'static str` or `String` for
                // the standard `panic!` forms; anything else gets a
                // fixed placeholder.
                let panic_msg = panic
                    .downcast_ref::<&'static str>()
                    .copied()
                    .map(str::to_owned)
                    .or_else(|| panic.downcast_ref::<String>().cloned())
                    .unwrap_or_else(|| "(non-string panic payload)".to_owned());
                tracing::error!(
                    kind = dispatcher_kind,
                    panic_payload = %panic_msg,
                    "S4 dispatcher task panicked (caught by catch_unwind, runtime not poisoned)"
                );
                crate::metrics::record_dispatcher_panic(dispatcher_kind);
            }
        });
    }
1216
1217 /// v0.6 #42: attach the in-memory MFA-Delete enforcement manager.
1218 /// Once set, every DELETE / DELETE-version / delete-marker /
1219 /// `PutBucketVersioning` request against a bucket whose MFA-Delete
1220 /// state is `Enabled` requires a valid `x-amz-mfa: <serial> <code>`
1221 /// header (RFC 6238 6-digit TOTP); the gate is a no-op for buckets
1222 /// where MFA-Delete is `Disabled` (S3 default).
1223 #[must_use]
1224 pub fn with_mfa_delete(mut self, mgr: Arc<crate::mfa::MfaDeleteManager>) -> Self {
1225 self.mfa_delete = Some(mgr);
1226 self
1227 }
1228
1229 /// v0.6 #42: borrow the attached MFA-Delete manager (test /
1230 /// introspection — used by the snapshot path in `main.rs` to call
1231 /// `to_json` for restart-recoverable state).
1232 #[must_use]
1233 pub fn mfa_delete_manager(&self) -> Option<&Arc<crate::mfa::MfaDeleteManager>> {
1234 self.mfa_delete.as_ref()
1235 }
1236
1237 /// v0.6 #38: attach the in-memory CORS configuration manager. Once
1238 /// set, `put_bucket_cors` / `get_bucket_cors` / `delete_bucket_cors`
1239 /// route through the manager instead of forwarding to the backend,
1240 /// and [`Self::handle_preflight`] becomes useful for the (future)
1241 /// listener-side OPTIONS interceptor.
1242 #[must_use]
1243 pub fn with_cors(mut self, mgr: Arc<crate::cors::CorsManager>) -> Self {
1244 self.cors = Some(mgr);
1245 self
1246 }
1247
1248 /// v0.6 #38: Borrow the attached CORS manager (test / introspection).
1249 #[must_use]
1250 pub fn cors_manager(&self) -> Option<&Arc<crate::cors::CorsManager>> {
1251 self.cors.as_ref()
1252 }
1253
1254 /// v0.6 #38: evaluate a CORS preflight request against the bucket's
1255 /// configured rules and, if a rule matches, return the headers that
1256 /// the (future) listener-side OPTIONS interceptor must put on the
1257 /// 200 response: `Access-Control-Allow-Origin`, `Access-Control-
1258 /// Allow-Methods`, `Access-Control-Allow-Headers`, optionally
1259 /// `Access-Control-Max-Age` and `Access-Control-Expose-Headers`.
1260 ///
1261 /// Returns `None` when no manager is attached, no config is
1262 /// registered for the bucket, or no rule matches the (origin,
1263 /// method, headers) triple. The caller is responsible for turning
1264 /// `None` into the appropriate 403 response.
1265 ///
1266 /// **Note:** the OPTIONS routing itself (i.e. wiring this method
1267 /// into the hyper-util listener path) is a follow-up — s3s does not
1268 /// surface OPTIONS as a typed S3 handler, so this method is
1269 /// currently call-able only from inside other handlers and tests.
1270 #[must_use]
1271 pub fn handle_preflight(
1272 &self,
1273 bucket: &str,
1274 origin: &str,
1275 method: &str,
1276 request_headers: &[String],
1277 ) -> Option<std::collections::HashMap<String, String>> {
1278 let mgr = self.cors.as_ref()?;
1279 let rule = mgr.match_preflight(bucket, origin, method, request_headers)?;
1280 let mut h = std::collections::HashMap::new();
1281 // Echo the matched origin back. If the rule used "*" we still
1282 // echo "*" (S3 spec — the spec does not require us to echo the
1283 // *requesting* origin when the wildcard matched).
1284 let allow_origin = if rule.allowed_origins.iter().any(|o| o == "*") {
1285 "*".to_string()
1286 } else {
1287 origin.to_string()
1288 };
1289 h.insert("Access-Control-Allow-Origin".to_string(), allow_origin);
1290 h.insert(
1291 "Access-Control-Allow-Methods".to_string(),
1292 rule.allowed_methods.join(", "),
1293 );
1294 if !rule.allowed_headers.is_empty() {
1295 // For the Allow-Headers response, echo back the rule's
1296 // pattern list verbatim (S3 echoes the configured list,
1297 // including "*" if present). Browsers honour exact-match
1298 // rules.
1299 h.insert(
1300 "Access-Control-Allow-Headers".to_string(),
1301 rule.allowed_headers.join(", "),
1302 );
1303 }
1304 if let Some(secs) = rule.max_age_seconds {
1305 h.insert("Access-Control-Max-Age".to_string(), secs.to_string());
1306 }
1307 if !rule.expose_headers.is_empty() {
1308 h.insert(
1309 "Access-Control-Expose-Headers".to_string(),
1310 rule.expose_headers.join(", "),
1311 );
1312 }
1313 Some(h)
1314 }
1315
1316 /// v0.5 #32: enable strict compliance mode. Every PUT must carry an
1317 /// SSE indicator (server-side encryption header or SSE-C customer
1318 /// key); requests without one are rejected with 400 InvalidRequest.
1319 /// Boot-time prerequisite checking lives in the binary
1320 /// (`validate_compliance_mode`) so this flag is purely the runtime
1321 /// switch.
1322 #[must_use]
1323 pub fn with_compliance_strict(mut self, on: bool) -> Self {
1324 self.compliance_strict = on;
1325 self
1326 }
1327
1328 /// v0.5 #30: attach the in-memory Object Lock (WORM) enforcement
1329 /// manager. Once set, `delete_object` and overwrite-path
1330 /// `put_object` refuse operations on locked keys with HTTP 403
1331 /// `AccessDenied`; new PUTs to a bucket with a default retention
1332 /// policy auto-create per-object lock state.
1333 #[must_use]
1334 pub fn with_object_lock(mut self, mgr: Arc<crate::object_lock::ObjectLockManager>) -> Self {
1335 self.object_lock = Some(mgr);
1336 self
1337 }
1338
1339 /// v0.7 #45: borrow the attached Object Lock manager (read-only —
1340 /// the lifecycle scanner uses this to skip currently-locked objects
1341 /// before issuing `delete_object`, since an Object Lock always wins
1342 /// over Lifecycle Expiration in AWS S3 semantics). Mirrors the
1343 /// shape of [`Self::lifecycle_manager`] /
1344 /// [`Self::tag_manager`] — purely additive accessor, no handler
1345 /// behaviour change.
1346 #[must_use]
1347 pub fn object_lock_manager(&self) -> Option<&Arc<crate::object_lock::ObjectLockManager>> {
1348 self.object_lock.as_ref()
1349 }
1350
1351 /// v0.5 #28: attach an SSE-KMS backend. `default_key_id` is used
1352 /// when a PUT requests SSE-KMS without naming a specific KMS key
1353 /// (operators set this to mirror AWS S3's bucket-default key).
1354 #[must_use]
1355 pub fn with_kms_backend(
1356 mut self,
1357 kms: Arc<dyn crate::kms::KmsBackend>,
1358 default_key_id: Option<String>,
1359 ) -> Self {
1360 self.kms = Some(kms);
1361 self.kms_default_key_id = default_key_id;
1362 self
1363 }
1364
1365 /// v0.5 #34: attach the first-class versioning state machine. Once
1366 /// set, this `S4Service` owns the per-bucket versioning state +
1367 /// per-(bucket, key) version chain; `put_object` / `get_object` /
1368 /// `delete_object` / `list_object_versions` /
1369 /// `get_bucket_versioning` / `put_bucket_versioning` consult the
1370 /// manager instead of passing through to the backend. The backend
1371 /// is still used as the byte store: Suspended / Unversioned buckets
1372 /// keep using `<key>` directly (legacy), Enabled buckets redirect
1373 /// each version's bytes to a shadow key
1374 /// (`<key>.__s4ver__/<version-id>`) so older versions survive newer
1375 /// PUTs to the same logical key.
1376 #[must_use]
1377 pub fn with_versioning(mut self, mgr: Arc<crate::versioning::VersioningManager>) -> Self {
1378 self.versioning = Some(mgr);
1379 self
1380 }
1381
1382 /// v0.8.5 #86 (audit M-3): borrow the attached versioning manager so
1383 /// the SIGUSR1 snapshot dump-back hook in `main.rs` can re-emit the
1384 /// in-memory state to the operator's `--versioning-state-file`
1385 /// without restarting the gateway. Mirrors the shape of
1386 /// [`Self::object_lock_manager`] / [`Self::lifecycle_manager`] —
1387 /// purely additive accessor, no handler behaviour change.
1388 #[must_use]
1389 pub fn versioning_manager(&self) -> Option<&Arc<crate::versioning::VersioningManager>> {
1390 self.versioning.as_ref()
1391 }
1392
1393 /// v0.8.5 #86 (audit M-2): override the default replication-dispatch
1394 /// concurrency cap (1024). Wired by the `--replication-max-concurrent`
1395 /// CLI flag in `main.rs`. Operators running heavy cross-region
1396 /// fan-out may need to raise this; operators on memory-constrained
1397 /// hosts may need to lower it. The new value replaces the existing
1398 /// `Semaphore` (so calling this after dispatchers are already in
1399 /// flight is fine — the in-flight tasks hold permits from the old
1400 /// semaphore which is dropped when its last permit is released).
1401 /// A `max` of 0 would deadlock all replicas; the value is silently
1402 /// clamped to 1 instead.
1403 #[must_use]
1404 pub fn with_replication_max_concurrent(mut self, max: usize) -> Self {
1405 let max = max.max(1);
1406 self.replication_semaphore = Arc::new(tokio::sync::Semaphore::new(max));
1407 self
1408 }
1409
1410 /// v0.8.5 #86 (audit M-2): borrow the in-flight replication
1411 /// concurrency permit pool. Tests inspect `available_permits()`
1412 /// after invoking `spawn_replication_if_matched` to verify the
1413 /// dispatcher actually `acquire_owned`s before kicking off the
1414 /// destination PUT.
1415 #[must_use]
1416 pub fn replication_semaphore(&self) -> &Arc<tokio::sync::Semaphore> {
1417 &self.replication_semaphore
1418 }
1419
1420 /// v0.4 #21 (kept for back-compat): attach a single SSE-S4 key.
1421 /// Internally wraps it in a 1-slot keyring with id=1 active, so
1422 /// new objects ride the v0.5 S4E2 frame while previously-written
1423 /// S4E1 bytes (this same key) still decrypt via the keyring's S4E1
1424 /// fallback path. Operators wanting true rotation should call
1425 /// [`Self::with_sse_keyring`] instead.
1426 #[must_use]
1427 pub fn with_sse_key(mut self, key: crate::sse::SharedSseKey) -> Self {
1428 let keyring = crate::sse::SseKeyring::new(1, key);
1429 self.sse_keyring = Some(std::sync::Arc::new(keyring));
1430 self
1431 }
1432
1433 /// v0.5 #29: attach a multi-key SSE-S4 keyring. PUT encrypts under
1434 /// the active key (S4E2 frame stamped with that key's id); GET
1435 /// dispatches on the body's magic — S4E1 falls back to trying every
1436 /// key in the ring (active first) so v0.4 objects survive a
1437 /// migration; S4E2 looks up the explicit key_id from the header.
1438 #[must_use]
1439 pub fn with_sse_keyring(mut self, keyring: crate::sse::SharedSseKeyring) -> Self {
1440 self.sse_keyring = Some(keyring);
1441 self
1442 }
1443
1444 /// v0.8 #52: opt the SSE-S4 PUT path into the chunked S4E5 frame
1445 /// (so the matching GET can stream-decrypt chunk-by-chunk
1446 /// instead of buffering the entire body before tag verify).
1447 /// `bytes` is the plaintext slice size — typically 1 MiB; 0
1448 /// disables the path and reverts to the legacy S4E2 buffered
1449 /// frame.
1450 ///
1451 /// SSE-C (S4E3) and SSE-KMS (S4E4) are intentionally untouched:
1452 /// the chunked envelopes for those flows are a follow-up issue
1453 /// (the customer-key wire surface needs separate version
1454 /// negotiation).
1455 ///
1456 /// Has no effect when `with_sse_keyring` / `with_sse_key` is
1457 /// not also set — the chunked path runs only on the SSE-S4
1458 /// branch of `put_object`.
1459 #[must_use]
1460 pub fn with_sse_chunk_size(mut self, bytes: usize) -> Self {
1461 self.sse_chunk_size = bytes;
1462 self
1463 }
1464
1465 /// v0.4 #20: attach an S3-style access-log emitter. Each completed
1466 /// PUT / GET / DELETE / List handler emits one entry into the
1467 /// emitter's buffer; a background flusher (started separately, see
1468 /// [`crate::access_log::AccessLog::spawn_flusher`]) writes hourly
1469 /// rotated `.log` files into the configured directory.
1470 #[must_use]
1471 pub fn with_access_log(mut self, log: crate::access_log::SharedAccessLog) -> Self {
1472 self.access_log = Some(log);
1473 self
1474 }
1475
1476 /// Capture the per-request access-log preamble before the request is
1477 /// consumed by the backend call. Returns `None` if no access logger
1478 /// is configured (cheap early-out so the handler doesn't pay the
1479 /// header-clone cost when access logging is off).
1480 fn access_log_preamble<I>(&self, req: &S3Request<I>) -> Option<AccessLogPreamble> {
1481 self.access_log.as_ref()?;
1482 Some(AccessLogPreamble {
1483 // v0.8.11 CRIT-4 fix: same trust gate as `request_context`.
1484 // Recording a client-controllable header in the access log
1485 // would poison forensic queries; leave it `None` until the
1486 // operator declares X-Forwarded-For is set by a trusted
1487 // proxy.
1488 remote_ip: if self.trust_x_forwarded_for {
1489 req.headers
1490 .get("x-forwarded-for")
1491 .and_then(|v| v.to_str().ok())
1492 .and_then(|raw| raw.split(',').next())
1493 .map(|s| s.trim().to_owned())
1494 } else {
1495 None
1496 },
1497 requester: Self::principal_of(req).map(str::to_owned),
1498 request_uri: format!("{} {}", req.method, req.uri.path()),
1499 user_agent: req
1500 .headers
1501 .get("user-agent")
1502 .and_then(|v| v.to_str().ok())
1503 .map(str::to_owned),
1504 })
1505 }
1506
1507 /// Internal — called by handlers at end-of-request with a captured
1508 /// preamble. Best-effort: swallows the await fast (clones Arc +
1509 /// pushes), no error propagation back to the request path.
1510 #[allow(clippy::too_many_arguments)]
1511 async fn record_access(
1512 &self,
1513 preamble: Option<AccessLogPreamble>,
1514 operation: &'static str,
1515 bucket: &str,
1516 key: Option<&str>,
1517 http_status: u16,
1518 bytes_sent: u64,
1519 object_size: u64,
1520 total_time_ms: u64,
1521 error_code: Option<&str>,
1522 ) {
1523 let (Some(log), Some(p)) = (self.access_log.as_ref(), preamble) else {
1524 return;
1525 };
1526 log.record(crate::access_log::AccessLogEntry {
1527 time: std::time::SystemTime::now(),
1528 bucket: bucket.to_owned(),
1529 remote_ip: p.remote_ip,
1530 requester: p.requester,
1531 operation,
1532 key: key.map(str::to_owned),
1533 request_uri: p.request_uri,
1534 http_status,
1535 error_code: error_code.map(str::to_owned),
1536 bytes_sent,
1537 object_size,
1538 total_time_ms,
1539 user_agent: p.user_agent,
1540 })
1541 .await;
1542 }
1543
1544 /// v0.4 #19: attach a per-(principal, bucket) token-bucket rate limiter.
1545 /// When set, every PUT / GET / DELETE / List / Copy / multipart op is
1546 /// throttle-checked before the policy gate; throttled requests return
1547 /// `S3ErrorCode::SlowDown` (HTTP 503) and bump
1548 /// `s4_rate_limit_throttled_total{principal,bucket}`.
1549 #[must_use]
1550 pub fn with_rate_limits(mut self, rl: crate::rate_limit::SharedRateLimits) -> Self {
1551 self.rate_limits = Some(rl);
1552 self
1553 }
1554
1555 /// Helper used by request handlers to apply the rate limit. Returns
1556 /// `Ok(())` when allowed (or no rate limiter is configured), or a
1557 /// `SlowDown` S3Error otherwise.
1558 fn enforce_rate_limit<I>(&self, req: &S3Request<I>, bucket: &str) -> S3Result<()> {
1559 let Some(rl) = self.rate_limits.as_ref() else {
1560 return Ok(());
1561 };
1562 let principal_id = Self::principal_of(req);
1563 if !rl.check(principal_id, bucket) {
1564 crate::metrics::record_rate_limit_throttle(principal_id.unwrap_or("-"), bucket);
1565 return Err(S3Error::with_message(
1566 S3ErrorCode::SlowDown,
1567 format!("rate-limited: bucket={bucket}"),
1568 ));
1569 }
1570 Ok(())
1571 }
1572
1573 /// Tell the policy evaluator that the listener is reached over TLS
1574 /// (or ACME). When `true`, the `aws:SecureTransport` Condition key
1575 /// resolves to `true`. Defaults to `false`.
1576 #[must_use]
1577 pub fn with_secure_transport(mut self, on: bool) -> Self {
1578 self.secure_transport = on;
1579 self
1580 }
1581
1582 #[must_use]
1583 pub fn with_max_body_bytes(mut self, n: usize) -> Self {
1584 self.max_body_bytes = n;
1585 self
1586 }
1587
1588 /// Attach an optional bucket policy (v0.2 #7). When `Some(...)`, every
1589 /// PUT / GET / DELETE / List handler runs `policy.evaluate(...)` before
1590 /// delegating to the backend; failures return `S3ErrorCode::AccessDenied`.
1591 /// When `None` (the default), no policy enforcement happens.
1592 #[must_use]
1593 pub fn with_policy(mut self, policy: crate::policy::SharedPolicy) -> Self {
1594 self.policy = Some(policy);
1595 self
1596 }
1597
1598 /// Pull the SigV4 access key id off the request's credentials, if any.
1599 /// Used as the `principal_id` for policy evaluation.
1600 fn principal_of<I>(req: &S3Request<I>) -> Option<&str> {
1601 req.credentials.as_ref().map(|c| c.access_key.as_str())
1602 }
1603
1604 /// v0.8.17 G-2: shared reserved-name guard used by every per-object
1605 /// API handler. `mode` chooses the AWS error shape: `Mutating`
1606 /// (PUT / Copy / DELETE / Tagging-write) returns
1607 /// `InvalidObjectName`; `Read` (GET / HEAD / Attributes / Tagging-read)
1608 /// returns `NoSuchKey` so a curious client gets the same response
1609 /// the listing filter has been giving them since v0.8.12 (the
1610 /// sidecar is invisible to list).
1611 ///
1612 /// v0.8.17 G-4: when `--allow-legacy-reserved-key-reads` is set
1613 /// AND the call is a `Read`, the guard returns `Ok(())` so
1614 /// operators upgrading from pre-v0.8.15 deployments can still
1615 /// access (and migrate off) any user-owned `<key>.s4index`
1616 /// objects that landed before M-1 / F-13 closed the namespace.
1617 /// Mutating operations stay blocked regardless of the flag —
1618 /// the flag is a read-only migration aid, not an injection
1619 /// re-opener.
1620 fn check_not_reserved_key(&self, key: &str, mode: ReservedKeyMode) -> S3Result<()> {
1621 if !s4_codec::index::is_reserved_sidecar_key(key) {
1622 return Ok(());
1623 }
1624 if matches!(mode, ReservedKeyMode::Read) && self.allow_legacy_reserved_key_reads {
1625 return Ok(());
1626 }
1627 match mode {
1628 ReservedKeyMode::Read => Err(S3Error::with_message(
1629 S3ErrorCode::NoSuchKey,
1630 format!("object key {key:?} is reserved for S4 internal sidecars"),
1631 )),
1632 ReservedKeyMode::Mutating => {
1633 let code = S3ErrorCode::from_bytes(b"InvalidObjectName")
1634 .unwrap_or(S3ErrorCode::InvalidArgument);
1635 Err(S3Error::with_message(
1636 code,
1637 format!(
1638 "object key {key:?} is reserved (suffix `{}` is used for S4 internal \
1639 sidecars)",
1640 s4_codec::index::SIDECAR_SUFFIX,
1641 ),
1642 ))
1643 }
1644 }
1645 }
1646
1647 /// v0.3 #13: build the per-request policy context from the incoming
1648 /// `S3Request`. Pulls `aws:UserAgent` from the User-Agent header,
1649 /// `aws:SourceIp` from the standard `X-Forwarded-For` header (most
1650 /// production deployments are behind an LB / reverse proxy that sets
1651 /// this), `aws:CurrentTime` from the system clock, and
1652 /// `aws:SecureTransport` from the per-listener TLS flag.
1653 fn request_context<I>(&self, req: &S3Request<I>) -> crate::policy::RequestContext {
1654 let user_agent = req
1655 .headers
1656 .get("user-agent")
1657 .and_then(|v| v.to_str().ok())
1658 .map(str::to_owned);
1659 // v0.8.11 CRIT-4 fix: `X-Forwarded-For` is a client-controllable
1660 // header. Trusting it unconditionally lets any public-internet
1661 // request claim it came from a trusted CIDR (e.g.
1662 // `curl -H 'X-Forwarded-For: 10.0.0.1'` to satisfy a
1663 // `Condition: NotIpAddress aws:SourceIp [10.0.0.0/8]` Deny).
1664 // We now only consume the header when the operator has
1665 // declared "this gateway sits behind a trusted reverse proxy
1666 // that scrubs client-supplied values" via
1667 // `with_trust_x_forwarded_for(true)` /
1668 // `--trust-x-forwarded-for`. Default leaves `source_ip` as
1669 // `None`, which fails closed for IP-allowlist Allow rules
1670 // and fails open for IP-blocklist Deny rules — operators
1671 // who need either case behind a public listener must opt in
1672 // or move the gate to the reverse proxy. The leftmost
1673 // comma-separated token is the originator per the
1674 // `X-Forwarded-For: client, proxy1, proxy2` convention.
1675 let source_ip = if self.trust_x_forwarded_for {
1676 req.headers
1677 .get("x-forwarded-for")
1678 .and_then(|v| v.to_str().ok())
1679 .and_then(|raw| raw.split(',').next())
1680 .and_then(|s| s.trim().parse().ok())
1681 } else {
1682 None
1683 };
1684 crate::policy::RequestContext {
1685 source_ip,
1686 user_agent,
1687 request_time: Some(std::time::SystemTime::now()),
1688 secure_transport: self.secure_transport,
1689 existing_object_tags: None,
1690 request_object_tags: None,
1691 extra: Default::default(),
1692 }
1693 }
1694
1695 /// Helper used by request handlers to enforce the optional policy.
1696 /// Returns `Ok(())` when allowed (or no policy is configured), or an
1697 /// `AccessDenied` S3Error otherwise. Bumps the policy denial Prometheus
1698 /// counter on deny.
1699 fn enforce_policy<I>(
1700 &self,
1701 req: &S3Request<I>,
1702 action: &'static str,
1703 bucket: &str,
1704 key: Option<&str>,
1705 ) -> S3Result<()> {
1706 self.enforce_policy_with_extra(req, action, bucket, key, None, None)
1707 }
1708
1709 /// v0.6 #39: variant of [`Self::enforce_policy`] that lets the
1710 /// caller plumb tag context (existing-on-object + on-request) into
1711 /// the policy evaluator. Both arguments default to `None`, in
1712 /// which case the resulting `RequestContext` is identical to
1713 /// [`Self::enforce_policy`]'s — so for handlers that don't deal
1714 /// with tags this is a transparent no-op.
1715 fn enforce_policy_with_extra<I>(
1716 &self,
1717 req: &S3Request<I>,
1718 action: &'static str,
1719 bucket: &str,
1720 key: Option<&str>,
1721 request_tags: Option<&crate::tagging::TagSet>,
1722 existing_tags: Option<&crate::tagging::TagSet>,
1723 ) -> S3Result<()> {
1724 let Some(policy) = self.policy.as_ref() else {
1725 return Ok(());
1726 };
1727 let principal_id = Self::principal_of(req);
1728 let mut ctx = self.request_context(req);
1729 if let Some(t) = request_tags {
1730 ctx.request_object_tags = Some(t.clone());
1731 }
1732 if let Some(t) = existing_tags {
1733 ctx.existing_object_tags = Some(t.clone());
1734 }
1735 let decision = policy.evaluate_with(action, bucket, key, principal_id, &ctx);
1736 if decision.allow {
1737 Ok(())
1738 } else {
1739 crate::metrics::record_policy_denial(action, bucket);
1740 tracing::info!(
1741 action,
1742 bucket,
1743 key = ?key,
1744 principal = ?principal_id,
1745 source_ip = ?ctx.source_ip,
1746 user_agent = ?ctx.user_agent,
1747 secure_transport = ctx.secure_transport,
1748 matched_sid = ?decision.matched_sid,
1749 effect = ?decision.matched_effect,
1750 "S4 policy denied request"
1751 );
1752 Err(S3Error::with_message(
1753 S3ErrorCode::AccessDenied,
1754 format!("denied by S4 policy: {action} on bucket={bucket}"),
1755 ))
1756 }
1757 }
1758
1759 /// テスト用: backend を取り戻す (test helper、production では使わない).
1760 /// v0.6 #40 で `backend` が `Arc<B>` 化したので `Arc::try_unwrap` で
1761 /// 1-clone の場合のみ返す。共有されている (= replication dispatcher が
1762 /// 同じ Arc を持っていて未完了) 場合は `Err` を返さず panic させる
1763 /// (test 用途専用 helper の caller 契約を維持)。
1764 pub fn into_backend(self) -> B {
1765 Arc::try_unwrap(self.backend).unwrap_or_else(|_| {
1766 panic!("into_backend: backend Arc still shared (replication dispatcher in flight?)")
1767 })
1768 }
1769
1770 /// 必要 frame だけを backend に Range GET し、frame parse + decompress + slice
1771 /// した結果を返す sidecar fast path。Range request の **帯域節約版**。
1772 async fn partial_range_get(
1773 &self,
1774 req: &S3Request<GetObjectInput>,
1775 plan: s4_codec::index::RangePlan,
1776 client_start: u64,
1777 client_end_exclusive: u64,
1778 total_original: u64,
1779 get_start: Instant,
1780 ) -> S3Result<S3Response<GetObjectOutput>> {
1781 // 必要 byte 範囲だけを backend に partial GET
1782 let backend_range = s3s::dto::Range::Int {
1783 first: plan.byte_start,
1784 last: Some(plan.byte_end_exclusive - 1),
1785 };
1786 let backend_input = GetObjectInput {
1787 bucket: req.input.bucket.clone(),
1788 key: req.input.key.clone(),
1789 range: Some(backend_range),
1790 ..Default::default()
1791 };
1792 let backend_req = S3Request {
1793 input: backend_input,
1794 method: req.method.clone(),
1795 uri: req.uri.clone(),
1796 headers: req.headers.clone(),
1797 extensions: http::Extensions::new(),
1798 credentials: req.credentials.clone(),
1799 region: req.region.clone(),
1800 service: req.service.clone(),
1801 trailing_headers: None,
1802 };
1803 let mut backend_resp = self.backend.get_object(backend_req).await?;
1804 let blob = backend_resp.output.body.take().ok_or_else(|| {
1805 S3Error::with_message(
1806 S3ErrorCode::InternalError,
1807 "backend partial GET returned empty body",
1808 )
1809 })?;
1810 let bytes = collect_blob(blob, self.max_body_bytes)
1811 .await
1812 .map_err(internal("collect partial body"))?;
1813
1814 // frame parse + decompress
1815 let mut combined = BytesMut::new();
1816 for frame in FrameIter::new(bytes) {
1817 let (header, payload) = frame.map_err(|e| {
1818 S3Error::with_message(
1819 S3ErrorCode::InternalError,
1820 format!("partial-range frame parse: {e}"),
1821 )
1822 })?;
1823 let chunk_manifest = ChunkManifest {
1824 codec: header.codec,
1825 original_size: header.original_size,
1826 compressed_size: header.compressed_size,
1827 crc32c: header.crc32c,
1828 };
1829 let decompressed = self
1830 .registry
1831 .decompress(payload, &chunk_manifest)
1832 .await
1833 .map_err(internal("partial-range decompress"))?;
1834 combined.extend_from_slice(&decompressed);
1835 }
1836 let combined = combined.freeze();
1837 let sliced = combined
1838 .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
1839
1840 // response 組立て
1841 let returned_size = sliced.len() as u64;
1842 backend_resp.output.content_length = Some(returned_size as i64);
1843 backend_resp.output.content_range = Some(format!(
1844 "bytes {client_start}-{}/{total_original}",
1845 client_end_exclusive - 1
1846 ));
1847 backend_resp.output.checksum_crc32 = None;
1848 backend_resp.output.checksum_crc32c = None;
1849 backend_resp.output.checksum_crc64nvme = None;
1850 backend_resp.output.checksum_sha1 = None;
1851 backend_resp.output.checksum_sha256 = None;
1852 backend_resp.output.e_tag = None;
1853 backend_resp.output.body = Some(bytes_to_blob(sliced));
1854 backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
1855
1856 let elapsed = get_start.elapsed();
1857 crate::metrics::record_get(
1858 "partial",
1859 plan.byte_end_exclusive - plan.byte_start,
1860 returned_size,
1861 elapsed.as_secs_f64(),
1862 true,
1863 );
1864 info!(
1865 op = "get_object",
1866 bucket = %req.input.bucket,
1867 key = %req.input.key,
1868 bytes_in = plan.byte_end_exclusive - plan.byte_start,
1869 bytes_out = returned_size,
1870 total_object_size = total_original,
1871 range = true,
1872 path = "sidecar-partial",
1873 latency_ms = elapsed.as_millis() as u64,
1874 "S4 partial Range GET via sidecar index"
1875 );
1876 Ok(backend_resp)
1877 }
1878
1879 /// v0.9 #106: SSE-S4 chunked (S4E6) encryption-aware partial
1880 /// Range GET. The sidecar carries an [`s4_codec::index::SseChunkBinding`]
1881 /// (salt + key_id + chunk geometry) that lets us:
1882 ///
1883 /// 1. Map the [`s4_codec::index::RangePlan`]'s pre-encrypt byte range
1884 /// to an encrypted chunk-range via
1885 /// [`FrameIndex::encrypted_lookup`].
1886 /// 2. Partial-GET only those S4E6 chunks from backend (instead of
1887 /// the entire encrypted body).
1888 /// 3. Decrypt the fetched chunks via
1889 /// [`crate::sse::decrypt_s4e6_chunk_range`] (per-chunk
1890 /// independently sealed — no need for the full body's tag).
1891 /// 4. Frame-parse + decompress the decrypted plaintext and slice
1892 /// out the client-requested bytes via the existing
1893 /// [`Self::partial_range_get`] machinery (re-used to keep one
1894 /// source of truth for the response shaping).
1895 ///
1896 /// Returns `Err(...)` on any failure (auth, range, parse) so the
1897 /// caller can decide to fall back to the buffered full-GET path.
1898 /// In practice we surface a clear `InternalError` and let it
1899 /// bubble — Range GET on an encrypted body that fails partial
1900 /// fetch is a genuine error condition (sidecar / body mismatch,
1901 /// keyring rotated, etc.), not a quietly-degrade case.
1902 #[allow(clippy::too_many_arguments)]
1903 async fn partial_range_get_encrypted(
1904 &self,
1905 req: &S3Request<GetObjectInput>,
1906 plan: s4_codec::index::RangePlan,
1907 enc_plan: s4_codec::index::EncryptedRangePlan,
1908 sse: s4_codec::index::SseChunkBinding,
1909 client_start: u64,
1910 client_end_exclusive: u64,
1911 total_original: u64,
1912 get_start: Instant,
1913 ) -> S3Result<S3Response<GetObjectOutput>> {
1914 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
1915 S3Error::with_message(
1916 S3ErrorCode::InvalidRequest,
1917 "object is SSE-S4 chunked but no --sse-s4-key is configured on this gateway",
1918 )
1919 })?;
1920 // Partial-fetch the enc byte range that covers the needed
1921 // chunks. Note that `byte_end_exclusive - 1` is the inclusive
1922 // last byte (matches the existing partial_range_get
1923 // convention).
1924 let backend_range = s3s::dto::Range::Int {
1925 first: enc_plan.enc_byte_start,
1926 last: Some(enc_plan.enc_byte_end_exclusive - 1),
1927 };
1928 let backend_input = GetObjectInput {
1929 bucket: req.input.bucket.clone(),
1930 key: req.input.key.clone(),
1931 range: Some(backend_range),
1932 ..Default::default()
1933 };
1934 let backend_req = S3Request {
1935 input: backend_input,
1936 method: req.method.clone(),
1937 uri: req.uri.clone(),
1938 headers: req.headers.clone(),
1939 extensions: http::Extensions::new(),
1940 credentials: req.credentials.clone(),
1941 region: req.region.clone(),
1942 service: req.service.clone(),
1943 trailing_headers: None,
1944 };
1945 let mut backend_resp = self.backend.get_object(backend_req).await?;
1946 let blob = backend_resp.output.body.take().ok_or_else(|| {
1947 S3Error::with_message(
1948 S3ErrorCode::InternalError,
1949 "backend partial GET returned empty body (SSE-S4 chunked Range)",
1950 )
1951 })?;
1952 let enc_bytes = collect_blob(blob, self.max_body_bytes)
1953 .await
1954 .map_err(internal("collect SSE-S4 chunked partial body"))?;
1955
1956 // Decrypt the partial chunks → pre-encrypt (= compressed-framed) plaintext.
1957 let plaintext = crate::sse::decrypt_s4e6_chunk_range(
1958 &enc_bytes,
1959 keyring.as_ref(),
1960 sse.enc_chunk_size,
1961 sse.enc_chunk_count,
1962 sse.enc_key_id,
1963 &sse.enc_salt,
1964 sse.enc_plaintext_len,
1965 enc_plan.chunk_idx_start,
1966 enc_plan.chunk_idx_last_inclusive,
1967 )
1968 .map_err(|e| {
1969 S3Error::with_message(
1970 S3ErrorCode::InternalError,
1971 format!("SSE-S4 chunked partial decrypt failed: {e}"),
1972 )
1973 })?;
1974 // Slice the decrypted concatenation down to the requested
1975 // pre-encrypt byte range (= the `RangePlan.byte_start..
1976 // byte_end_exclusive` range, expressed inside the chunks we
1977 // fetched).
1978 let s = enc_plan.pre_encrypt_slice_start_in_concat as usize;
1979 let e = enc_plan.pre_encrypt_slice_end_in_concat as usize;
1980 if e > plaintext.len() {
1981 return Err(S3Error::with_message(
1982 S3ErrorCode::InternalError,
1983 "SSE-S4 chunked partial decrypt produced fewer bytes than the sidecar declared",
1984 ));
1985 }
1986 let pre_encrypt_slice = plaintext.slice(s..e);
1987
1988 // Frame-parse + decompress the pre-encrypt slice, then slice
1989 // again on the original byte range. The plan's
1990 // slice_start_in_combined / slice_end_in_combined account for
1991 // the original_offset of the first frame we fetched — they
1992 // are pre-encrypt-domain offsets, identical to the
1993 // non-encrypted partial-range path.
1994 let mut combined = BytesMut::new();
1995 for frame in FrameIter::new(pre_encrypt_slice) {
1996 let (header, payload) = frame.map_err(|fe| {
1997 S3Error::with_message(
1998 S3ErrorCode::InternalError,
1999 format!("SSE-S4 chunked partial frame parse: {fe}"),
2000 )
2001 })?;
2002 let chunk_manifest = ChunkManifest {
2003 codec: header.codec,
2004 original_size: header.original_size,
2005 compressed_size: header.compressed_size,
2006 crc32c: header.crc32c,
2007 };
2008 let decompressed = self
2009 .registry
2010 .decompress(payload, &chunk_manifest)
2011 .await
2012 .map_err(internal("SSE-S4 chunked partial decompress"))?;
2013 combined.extend_from_slice(&decompressed);
2014 }
2015 let combined = combined.freeze();
2016 let sliced = combined
2017 .slice(plan.slice_start_in_combined as usize..plan.slice_end_in_combined as usize);
2018
2019 // Response shaping: identical to the unencrypted partial
2020 // path (clear backend checksums / e_tag since they describe
2021 // the encrypted body, not the plaintext slice).
2022 let returned_size = sliced.len() as u64;
2023 backend_resp.output.content_length = Some(returned_size as i64);
2024 backend_resp.output.content_range = Some(format!(
2025 "bytes {client_start}-{}/{total_original}",
2026 client_end_exclusive - 1
2027 ));
2028 backend_resp.output.checksum_crc32 = None;
2029 backend_resp.output.checksum_crc32c = None;
2030 backend_resp.output.checksum_crc64nvme = None;
2031 backend_resp.output.checksum_sha1 = None;
2032 backend_resp.output.checksum_sha256 = None;
2033 backend_resp.output.e_tag = None;
2034 backend_resp.output.body = Some(bytes_to_blob(sliced));
2035 backend_resp.status = Some(http::StatusCode::PARTIAL_CONTENT);
2036
2037 let elapsed = get_start.elapsed();
2038 // Use the encrypted bytes_in for the bandwidth-saved metric —
2039 // that's what actually traversed the wire, vs. the full
2040 // encrypted body that the buffered fallback would have
2041 // fetched.
2042 crate::metrics::record_get(
2043 "sse-s4-chunked-partial",
2044 enc_plan.enc_byte_end_exclusive - enc_plan.enc_byte_start,
2045 returned_size,
2046 elapsed.as_secs_f64(),
2047 true,
2048 );
2049 info!(
2050 op = "get_object",
2051 bucket = %req.input.bucket,
2052 key = %req.input.key,
2053 bytes_in = enc_plan.enc_byte_end_exclusive - enc_plan.enc_byte_start,
2054 bytes_out = returned_size,
2055 total_object_size = total_original,
2056 range = true,
2057 path = "sidecar-partial-sse-s4-chunked",
2058 chunks_fetched = (enc_plan.chunk_idx_last_inclusive - enc_plan.chunk_idx_start + 1) as u64,
2059 latency_ms = elapsed.as_millis() as u64,
2060 "S4 partial Range GET via v3 sidecar (SSE-S4 chunked fast-path)"
2061 );
2062 Ok(backend_resp)
2063 }
2064
2065 /// `<key>.s4index` sidecar object を backend に書く。失敗しても本体 PUT は
2066 /// 成功扱いにしたいので、err は warn ログのみ (Range GET の partial path が
2067 /// 使えなくなるが、full read fallback で意味的には正しい結果を返す)。
2068 async fn write_sidecar(&self, bucket: &str, key: &str, index: &FrameIndex) {
2069 let bytes = encode_index(index);
2070 let len = bytes.len() as i64;
2071 let sidecar = sidecar_key(key);
2072 // v0.7 #49: synthetic re-entry URI must be percent-encoded; if
2073 // the (already legally-arbitrary) S3 key produces something we
2074 // cannot encode at all, drop the sidecar PUT (the GET path
2075 // falls back to a full read on a missing sidecar) instead of
2076 // panicking on `parse().unwrap()`.
2077 let uri = match safe_object_uri(bucket, &sidecar) {
2078 Ok(u) => u,
2079 Err(e) => {
2080 tracing::warn!(
2081 bucket,
2082 key,
2083 "S4 write_sidecar skipped (key not URI-encodable): {e}"
2084 );
2085 return;
2086 }
2087 };
2088 let put_input = PutObjectInput {
2089 bucket: bucket.into(),
2090 key: sidecar,
2091 body: Some(bytes_to_blob(bytes)),
2092 content_length: Some(len),
2093 content_type: Some("application/x-s4-index".into()),
2094 ..Default::default()
2095 };
2096 let put_req = S3Request {
2097 input: put_input,
2098 method: http::Method::PUT,
2099 uri,
2100 headers: http::HeaderMap::new(),
2101 extensions: http::Extensions::new(),
2102 credentials: None,
2103 region: None,
2104 service: None,
2105 trailing_headers: None,
2106 };
2107 if let Err(e) = self.backend.put_object(put_req).await {
2108 tracing::warn!(
2109 bucket,
2110 key,
2111 "S4 write_sidecar failed (Range GET will fall back to full read): {e}"
2112 );
2113 }
2114 }
2115
2116 /// v0.8.4 #73 H-2: confirm that the sidecar we just decoded still
2117 /// describes the current backend object before we trust its frame
2118 /// offsets for a partial Range GET. The sidecar carries the source
2119 /// `etag` and `compressed_size` that were observed at PUT time; we
2120 /// HEAD the backend object and compare.
2121 ///
2122 /// Decision matrix:
2123 /// - sidecar `source_etag = None` (legacy v1 / build_index_from_body
2124 /// that wasn't stamped) → return `true` (best-effort, preserves
2125 /// pre-v0.8.4 behaviour for existing on-disk sidecars).
2126 /// - HEAD fails → return `false` (we can't tell either way; full GET
2127 /// path will surface the real backend error to the client).
2128 /// - HEAD ETag matches → `true`.
2129 /// - HEAD ETag differs OR HEAD size differs from
2130 /// `source_compressed_size` → `false` (sidecar stale or attacker-
2131 /// written; fall back to full GET).
2132 async fn sidecar_version_binding_ok(
2133 &self,
2134 bucket: &str,
2135 key: &str,
2136 index: &FrameIndex,
2137 ) -> bool {
2138 let Some(ref expected_etag) = index.source_etag else {
2139 // Legacy sidecar without the v0.8.4 #73 H-2 binding —
2140 // back-compat: trust it (the partial fetch is the same
2141 // best-effort path that v0.8.3 and earlier shipped).
2142 return true;
2143 };
2144 let head_input = HeadObjectInput {
2145 bucket: bucket.into(),
2146 key: key.into(),
2147 ..Default::default()
2148 };
2149 let uri = match safe_object_uri(bucket, key) {
2150 Ok(u) => u,
2151 Err(_) => return false,
2152 };
2153 let head_req = S3Request {
2154 input: head_input,
2155 method: http::Method::HEAD,
2156 uri,
2157 headers: http::HeaderMap::new(),
2158 extensions: http::Extensions::new(),
2159 credentials: None,
2160 region: None,
2161 service: None,
2162 trailing_headers: None,
2163 };
2164 let head = match self.backend.head_object(head_req).await {
2165 Ok(r) => r.output,
2166 Err(e) => {
2167 tracing::debug!(
2168 bucket,
2169 key,
2170 "S4 sidecar version-binding HEAD failed, falling back to full GET: {e}"
2171 );
2172 return false;
2173 }
2174 };
2175 // ETag is a strong-vs-weak enum; we compare on the unwrapped string
2176 // form (matches what the PUT path stamped — see below).
2177 let live_etag = head.e_tag.as_ref().map(|t| t.value());
2178 if live_etag != Some(expected_etag.as_str()) {
2179 tracing::debug!(
2180 bucket,
2181 key,
2182 "sidecar stale (ETag mismatch), falling back to full GET (sidecar={:?}, live={:?})",
2183 expected_etag,
2184 live_etag,
2185 );
2186 return false;
2187 }
2188 if let Some(expected_size) = index.source_compressed_size
2189 && let Some(live_size) = head.content_length
2190 && live_size as u64 != expected_size
2191 {
2192 tracing::debug!(
2193 bucket,
2194 key,
2195 "sidecar stale (size mismatch), falling back to full GET (sidecar={}, live={})",
2196 expected_size,
2197 live_size,
2198 );
2199 return false;
2200 }
2201 true
2202 }
2203
2204 /// `<key>.s4index` sidecar を backend から読み出す。なければ None。
2205 async fn read_sidecar(&self, bucket: &str, key: &str) -> Option<FrameIndex> {
2206 let sidecar = sidecar_key(key);
2207 // v0.7 #49: same encode-or-bail treatment as write_sidecar.
2208 let uri = safe_object_uri(bucket, &sidecar).ok()?;
2209 let get_input = GetObjectInput {
2210 bucket: bucket.into(),
2211 key: sidecar,
2212 ..Default::default()
2213 };
2214 let get_req = S3Request {
2215 input: get_input,
2216 method: http::Method::GET,
2217 uri,
2218 headers: http::HeaderMap::new(),
2219 extensions: http::Extensions::new(),
2220 credentials: None,
2221 region: None,
2222 service: None,
2223 trailing_headers: None,
2224 };
2225 let resp = self.backend.get_object(get_req).await.ok()?;
2226 let blob = resp.output.body?;
2227 let bytes = collect_blob(blob, 64 * 1024 * 1024).await.ok()?;
2228 decode_index(bytes).ok()
2229 }
2230
2231 /// Multipart object (frame 列) を解凍 → 元 bytes を再構築。
2232 ///
2233 /// **per-frame codec dispatch**: 各 frame header に codec_id が入っているので、
2234 /// frame ごとに registry が違う codec を呼ぶことができる。同一 object 内で
2235 /// 異なる codec が混在していても透過的に解凍可能 (parquet 風 mixed columns 等)。
2236 async fn decompress_multipart(&self, bytes: bytes::Bytes) -> S3Result<bytes::Bytes> {
2237 let mut out = BytesMut::new();
2238 // v0.8.15 H-h: cap the *aggregate* decoded output. Each
2239 // individual frame is already bounded by
2240 // `validate_decompress_manifest` (default 5 GiB per frame),
2241 // but a forged multi-frame body can declare many frames
2242 // each near the limit — without an object-level ceiling, a
2243 // single GET could pin tens of GiB of plaintext in
2244 // `BytesMut::extend_from_slice`. Use the gateway's
2245 // `max_body_bytes` (same cap that bounds PUT bodies) so a
2246 // GET can never produce more plaintext than a PUT can ever
2247 // legitimately have stored.
2248 let aggregate_cap = self.max_body_bytes;
2249 let mut produced: usize = 0;
2250 for frame in FrameIter::new(bytes) {
2251 let (header, payload) = frame.map_err(|e| {
2252 S3Error::with_message(
2253 S3ErrorCode::InternalError,
2254 format!("multipart frame parse: {e}"),
2255 )
2256 })?;
2257 let chunk_manifest = ChunkManifest {
2258 codec: header.codec,
2259 original_size: header.original_size,
2260 compressed_size: header.compressed_size,
2261 crc32c: header.crc32c,
2262 };
2263 // v0.8.15 H-h: pre-flight check on the declared
2264 // `original_size` so a forged manifest claiming a frame
2265 // that would push us past the cap is rejected before we
2266 // start decoding. Defence-in-depth alongside the
2267 // post-decode `produced` check below.
2268 if (produced as u64).saturating_add(header.original_size) > aggregate_cap as u64 {
2269 return Err(S3Error::with_message(
2270 S3ErrorCode::InternalError,
2271 format!(
2272 "multipart aggregate output exceeds cap: would reach \
2273 {produced_total} bytes after this frame, cap is {aggregate_cap}",
2274 produced_total = (produced as u64).saturating_add(header.original_size),
2275 ),
2276 ));
2277 }
2278 let decompressed = self
2279 .registry
2280 .decompress(payload, &chunk_manifest)
2281 .await
2282 .map_err(internal("multipart frame decompress"))?;
2283 produced = produced.saturating_add(decompressed.len());
2284 if produced > aggregate_cap {
2285 return Err(S3Error::with_message(
2286 S3ErrorCode::InternalError,
2287 format!(
2288 "multipart aggregate output exceeded cap: {produced} bytes \
2289 emitted, cap is {aggregate_cap}"
2290 ),
2291 ));
2292 }
2293 out.extend_from_slice(&decompressed);
2294 }
2295 Ok(out.freeze())
2296 }
2297}
2298
2299/// Parse a CopySourceRange header value (`bytes=N-M`, `bytes=N-`, `bytes=-N`)
2300/// into the s3s::dto::Range used by the GetObject path. The S3 spec only
2301/// allows `bytes=N-M` for upload_part_copy (no suffix or open-ended), so
2302/// reject the other variants for parity with AWS.
2303fn parse_copy_source_range(s: &str) -> Result<s3s::dto::Range, String> {
2304 let rest = s
2305 .strip_prefix("bytes=")
2306 .ok_or_else(|| format!("CopySourceRange must start with 'bytes=', got {s:?}"))?;
2307 let (a, b) = rest
2308 .split_once('-')
2309 .ok_or_else(|| format!("CopySourceRange must be 'bytes=N-M', got {s:?}"))?;
2310 let first: u64 = a
2311 .parse()
2312 .map_err(|_| format!("CopySourceRange first byte not a number: {a:?}"))?;
2313 let last: u64 = b
2314 .parse()
2315 .map_err(|_| format!("CopySourceRange last byte not a number: {b:?}"))?;
2316 if last < first {
2317 return Err(format!("CopySourceRange last < first: {s:?}"));
2318 }
2319 Ok(s3s::dto::Range::Int {
2320 first,
2321 last: Some(last),
2322 })
2323}
2324
/// v0.5 #34: builds the backend storage key for a (logical key, version-id)
/// pair on an Enabled-versioning bucket.
///
/// The result is `<key>.__s4ver__/<version_id>`. Rationale for the
/// `.__s4ver__/` infix, as recorded by the original author:
/// - it is not a substring of the `.s4index` / `.s4ver` natural keys, so
///   listing filters do not collide with them
/// - the directory-style separator keeps the S3 console "browse by prefix"
///   UX intact (versions roll up under one virtual folder per object)
/// - it stays human-readable in debug logs and `aws s3 ls`
///
/// `list_objects` / `list_objects_v2` / `list_object_versions` MUST filter
/// keys containing `.__s4ver__/` from results so customers never see the
/// internal shadow objects.
pub fn versioned_shadow_key(key: &str, version_id: &str) -> String {
    const INFIX: &str = ".__s4ver__/";
    let mut shadow = String::with_capacity(key.len() + INFIX.len() + version_id.len());
    shadow.push_str(key);
    shadow.push_str(INFIX);
    shadow.push_str(version_id);
    shadow
}
2341
/// Reports whether `key` contains the marker substring produced by
/// [`versioned_shadow_key`]. A plain substring scan; per the original note it
/// is shared by the list_objects filter and the GET passthrough check.
fn is_versioning_shadow_key(key: &str) -> bool {
    const SHADOW_MARKER: &str = ".__s4ver__/";
    key.contains(SHADOW_MARKER)
}
2347
/// v0.6 #42: wall-clock seconds since the UNIX epoch. Per the original note
/// this is fed to `mfa::check_mfa` so the TOTP verifier shares the client
/// authenticator app's notion of "now".
///
/// If the system clock reports a time before 1970 the function returns `0`
/// instead of panicking, which makes the verifier reject rather than crash.
fn current_unix_secs() -> u64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    since_epoch.map_or(0, |elapsed| elapsed.as_secs())
}
2359
2360/// v0.6 #42: translate an `MfaError` into the matching S3 wire error.
2361///
2362/// - `Missing` / `SerialMismatch` / `InvalidCode` → `403 AccessDenied`
2363/// (S3 spec for MFA Delete: every gating failure surfaces as
2364/// `AccessDenied`, not a separate `MFA*` code).
2365/// - `Malformed` → `400 InvalidRequest` (the request itself is
2366/// syntactically broken, not a permission issue).
2367fn mfa_error_to_s3(e: crate::mfa::MfaError) -> S3Error {
2368 match e {
2369 crate::mfa::MfaError::Missing => S3Error::with_message(
2370 S3ErrorCode::AccessDenied,
2371 "MFA token required for this operation",
2372 ),
2373 crate::mfa::MfaError::Malformed => {
2374 S3Error::with_message(S3ErrorCode::InvalidRequest, "malformed x-amz-mfa header")
2375 }
2376 crate::mfa::MfaError::SerialMismatch => S3Error::with_message(
2377 S3ErrorCode::AccessDenied,
2378 "MFA serial does not match configured device",
2379 ),
2380 crate::mfa::MfaError::InvalidCode => {
2381 S3Error::with_message(S3ErrorCode::AccessDenied, "invalid MFA code")
2382 }
2383 }
2384}
2385
2386fn is_multipart_object(metadata: &Option<Metadata>) -> bool {
2387 metadata
2388 .as_ref()
2389 .and_then(|m| m.get(META_MULTIPART))
2390 .map(|v| v == "true")
2391 .unwrap_or(false)
2392}
2393
/// Metadata key under which `write_manifest` stores the manifest's codec name.
const META_CODEC: &str = "s4-codec";
/// Metadata key under which `write_manifest` stores the manifest's `original_size`.
const META_ORIGINAL_SIZE: &str = "s4-original-size";
/// Metadata key under which `write_manifest` stores the manifest's `compressed_size`.
const META_COMPRESSED_SIZE: &str = "s4-compressed-size";
/// Metadata key under which `write_manifest` stores the manifest's `crc32c`.
const META_CRC32C: &str = "s4-crc32c";
/// Marks an object that was written by a multipart upload using the per-part
/// frame format. On GET this flag is checked to start the frame parser.
const META_MULTIPART: &str = "s4-multipart";
/// v0.2 #4: marks an object stored in the S4F2 framed format even though it
/// was written by a single PUT. Legacy v0.1 single-PUT objects are raw
/// compressed bytes (without this flag). On GET this flag routes the body to
/// the framed path (= the same FrameIter parse as multipart).
const META_FRAMED: &str = "s4-framed";
2405
2406fn is_framed_v2_object(metadata: &Option<Metadata>) -> bool {
2407 metadata
2408 .as_ref()
2409 .and_then(|m| m.get(META_FRAMED))
2410 .map(|v| v == "true")
2411 .unwrap_or(false)
2412}
2413
2414/// v0.4 #21: detect SSE-S4 by the metadata flag we set on PUT.
2415fn is_sse_encrypted(metadata: &Option<Metadata>) -> bool {
2416 metadata
2417 .as_ref()
2418 .and_then(|m| m.get("s4-encrypted"))
2419 .map(|v| v == "aes-256-gcm")
2420 .unwrap_or(false)
2421}
2422
2423/// v0.5 #27: pull the three SSE-C headers off an input struct. The S3
2424/// contract is "all three or none" — partial sets are a 400.
2425///
2426/// Returns `Ok(None)` when no SSE-C headers were sent (server-managed or
2427/// no encryption), `Ok(Some(material))` on validated client key, and
2428/// `Err` for malformed or partial inputs.
2429fn extract_sse_c_material(
2430 algorithm: &Option<String>,
2431 key: &Option<String>,
2432 md5: &Option<String>,
2433) -> S3Result<Option<crate::sse::CustomerKeyMaterial>> {
2434 match (algorithm, key, md5) {
2435 (None, None, None) => Ok(None),
2436 (Some(a), Some(k), Some(m)) => crate::sse::parse_customer_key_headers(a, k, m)
2437 .map(Some)
2438 .map_err(sse_c_error_to_s3),
2439 _ => Err(S3Error::with_message(
2440 S3ErrorCode::InvalidRequest,
2441 "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
2442 )),
2443 }
2444}
2445
2446/// v0.5 #28: detect SSE-KMS request — `x-amz-server-side-encryption: aws:kms`.
2447/// Returns the key-id to wrap under, falling back to the gateway default.
2448fn extract_kms_key_id(
2449 sse: &Option<ServerSideEncryption>,
2450 sse_kms_key_id: &Option<String>,
2451 gateway_default: Option<&str>,
2452) -> Option<String> {
2453 let asks_for_kms = sse
2454 .as_ref()
2455 .map(|s| s.as_str() == ServerSideEncryption::AWS_KMS)
2456 .unwrap_or(false);
2457 if !asks_for_kms {
2458 return None;
2459 }
2460 sse_kms_key_id
2461 .clone()
2462 .or_else(|| gateway_default.map(str::to_owned))
2463}
2464
2465/// v0.5 #28: map kms module errors to AWS-shaped S3 error codes.
2466/// `KeyNotFound` is operator misconfig (400); `BackendUnavailable` is a
2467/// transient KMS outage (503). Other variants are 500 InternalError.
2468fn kms_error_to_s3(e: crate::kms::KmsError) -> S3Error {
2469 use crate::kms::KmsError as K;
2470 match e {
2471 K::KeyNotFound { key_id } => S3Error::with_message(
2472 S3ErrorCode::InvalidArgument,
2473 format!("KMS key not found: {key_id}"),
2474 ),
2475 K::BackendUnavailable { message } => S3Error::with_message(
2476 S3ErrorCode::ServiceUnavailable,
2477 format!("KMS backend unavailable: {message}"),
2478 ),
2479 other => S3Error::with_message(S3ErrorCode::InternalError, format!("KMS error: {other}")),
2480 }
2481}
2482
2483/// v0.5 #27: map sse module errors to AWS-shaped S3 error codes.
2484/// `WrongCustomerKey` → 403 AccessDenied (matches AWS behaviour);
2485/// `InvalidCustomerKey` / algorithm / required / unexpected → 400.
2486fn sse_c_error_to_s3(e: crate::sse::SseError) -> S3Error {
2487 use crate::sse::SseError as E;
2488 match e {
2489 E::WrongCustomerKey => S3Error::with_message(
2490 S3ErrorCode::AccessDenied,
2491 "SSE-C key does not match the key used at PUT time",
2492 ),
2493 E::InvalidCustomerKey { reason } => {
2494 S3Error::with_message(S3ErrorCode::InvalidArgument, format!("SSE-C: {reason}"))
2495 }
2496 E::CustomerKeyAlgorithmUnsupported { algo } => S3Error::with_message(
2497 S3ErrorCode::InvalidArgument,
2498 format!("SSE-C unsupported algorithm: {algo:?} (only AES256 is allowed)"),
2499 ),
2500 E::CustomerKeyRequired => S3Error::with_message(
2501 S3ErrorCode::InvalidRequest,
2502 "object is SSE-C encrypted; supply x-amz-server-side-encryption-customer-* headers",
2503 ),
2504 E::CustomerKeyUnexpected => S3Error::with_message(
2505 S3ErrorCode::InvalidRequest,
2506 "object is not SSE-C encrypted; do not send x-amz-server-side-encryption-customer-* headers",
2507 ),
2508 other => S3Error::with_message(S3ErrorCode::InternalError, format!("SSE error: {other}")),
2509 }
2510}
2511
2512fn extract_manifest(metadata: &Option<Metadata>) -> Option<ChunkManifest> {
2513 let m = metadata.as_ref()?;
2514 let codec = m
2515 .get(META_CODEC)
2516 .and_then(|s| s.parse::<CodecKind>().ok())?;
2517 let original_size = m.get(META_ORIGINAL_SIZE)?.parse().ok()?;
2518 let compressed_size = m.get(META_COMPRESSED_SIZE)?.parse().ok()?;
2519 let crc32c = m.get(META_CRC32C)?.parse().ok()?;
2520 Some(ChunkManifest {
2521 codec,
2522 original_size,
2523 compressed_size,
2524 crc32c,
2525 })
2526}
2527
2528fn write_manifest(metadata: &mut Option<Metadata>, manifest: &ChunkManifest) {
2529 let meta = metadata.get_or_insert_with(Default::default);
2530 meta.insert(META_CODEC.into(), manifest.codec.as_str().into());
2531 meta.insert(
2532 META_ORIGINAL_SIZE.into(),
2533 manifest.original_size.to_string(),
2534 );
2535 meta.insert(
2536 META_COMPRESSED_SIZE.into(),
2537 manifest.compressed_size.to_string(),
2538 );
2539 meta.insert(META_CRC32C.into(), manifest.crc32c.to_string());
2540}
2541
2542fn internal<E: std::fmt::Display>(prefix: &'static str) -> impl FnOnce(E) -> S3Error {
2543 move |e| S3Error::with_message(S3ErrorCode::InternalError, format!("{prefix}: {e}"))
2544}
2545
2546/// v0.6 #41: map a `select::SelectError` to the S3 error surface. AWS
2547/// uses a domain-specific `InvalidSqlExpression` code for parse / unsupported
2548/// errors, but s3s 0.13 doesn't expose that as a typed variant — we
2549/// fall back to the well-known `InvalidRequest` 400 with a descriptive
2550/// message that includes the original error context.
2551fn select_error_to_s3(e: crate::select::SelectError, fmt: &str) -> S3Error {
2552 use crate::select::SelectError;
2553 match e {
2554 SelectError::Parse(msg) => S3Error::with_message(
2555 S3ErrorCode::InvalidRequest,
2556 format!("SQL parse error: {msg}"),
2557 ),
2558 SelectError::UnsupportedFeature(msg) => S3Error::with_message(
2559 S3ErrorCode::InvalidRequest,
2560 format!("unsupported SQL feature: {msg}"),
2561 ),
2562 SelectError::RowEval(msg) => S3Error::with_message(
2563 S3ErrorCode::InvalidRequest,
2564 format!("SQL row evaluation error: {msg}"),
2565 ),
2566 SelectError::InputFormat(msg) => S3Error::with_message(
2567 S3ErrorCode::InvalidRequest,
2568 format!("{fmt} input format error: {msg}"),
2569 ),
2570 }
2571}
2572
2573/// v0.5 #30: parse the `x-amz-bypass-governance-retention` header into a
2574/// boolean flag. AWS S3 accepts `true` (case-insensitive); any other value
2575/// (including missing) is treated as `false`.
2576fn parse_bypass_governance_header(headers: &http::HeaderMap) -> bool {
2577 headers
2578 .get("x-amz-bypass-governance-retention")
2579 .and_then(|v| v.to_str().ok())
2580 .map(|s| s.eq_ignore_ascii_case("true"))
2581 .unwrap_or(false)
2582}
2583
2584/// Convert s3s `Timestamp` into a `chrono::DateTime<Utc>` by formatting it
2585/// as an RFC3339 string and re-parsing through `chrono`. The string format
2586/// avoids pulling the `time` crate (transitive dep of s3s, not declared by
2587/// s4-server) into our direct deps. Returns `None` if the format/parse fails
2588/// or the value is outside `chrono`'s supported range.
2589fn timestamp_to_chrono_utc(ts: &Timestamp) -> Option<chrono::DateTime<chrono::Utc>> {
2590 let mut buf = Vec::new();
2591 ts.format(s3s::dto::TimestampFormat::DateTime, &mut buf)
2592 .ok()?;
2593 let s = std::str::from_utf8(&buf).ok()?;
2594 chrono::DateTime::parse_from_rfc3339(s)
2595 .ok()
2596 .map(|dt| dt.with_timezone(&chrono::Utc))
2597}
2598
2599/// Inverse of [`timestamp_to_chrono_utc`] — emit RFC3339 (the s3s
2600/// `DateTime` wire format) and re-parse via `Timestamp::parse`.
2601fn chrono_utc_to_timestamp(dt: chrono::DateTime<chrono::Utc>) -> Timestamp {
2602 // chrono's RFC3339 output format matches s3s' parser ("...Z" with
2603 // optional sub-second precision). Fall back to UNIX_EPOCH if anything
2604 // unexpected happens — we never produce malformed strings, so this
2605 // branch is unreachable in practice.
2606 let s = dt.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
2607 Timestamp::parse(s3s::dto::TimestampFormat::DateTime, &s).unwrap_or_default()
2608}
2609
2610/// v0.6 #39: convert our internal [`crate::tagging::TagSet`] into the
2611/// s3s `Vec<Tag>` wire shape used on `GetObject/BucketTaggingOutput`.
2612/// Both halves of every pair land in the `Some(_)` slot — AWS marks
2613/// the field optional but always populates it on response.
2614fn tagset_to_aws(set: &crate::tagging::TagSet) -> Vec<Tag> {
2615 set.iter()
2616 .map(|(k, v)| Tag {
2617 key: Some(k.clone()),
2618 value: Some(v.clone()),
2619 })
2620 .collect()
2621}
2622
2623/// v0.6 #39: inverse of [`tagset_to_aws`] for input handlers. Missing
2624/// keys / values become empty strings (mirrors AWS, which rejects
2625/// `<Key/>` with InvalidTag at the parser layer; downstream
2626/// `TagSet::validate` then enforces our size limits).
2627fn aws_to_tagset(tags: &[Tag]) -> Result<crate::tagging::TagSet, crate::tagging::TagError> {
2628 let pairs = tags
2629 .iter()
2630 .map(|t| {
2631 (
2632 t.key.clone().unwrap_or_default(),
2633 t.value.clone().unwrap_or_default(),
2634 )
2635 })
2636 .collect();
2637 crate::tagging::TagSet::from_pairs(pairs)
2638}
2639
2640/// `Range` request を decompressed object サイズ `total` に適用して `(start, end_exclusive)`
2641/// を返す。`Range::Int { first, last }` は `bytes=first-last` (last は inclusive)、
2642/// `Range::Suffix { length }` は末尾 `length` byte。S3 仕様に準拠。
2643pub fn resolve_range(range: &s3s::dto::Range, total: u64) -> Result<(u64, u64), String> {
2644 if total == 0 {
2645 return Err("cannot range-get zero-length object".into());
2646 }
2647 match range {
2648 s3s::dto::Range::Int { first, last } => {
2649 let start = *first;
2650 let end_inclusive = match last {
2651 Some(l) => (*l).min(total - 1),
2652 None => total - 1,
2653 };
2654 if start > end_inclusive || start >= total {
2655 return Err(format!(
2656 "range bytes={start}-{:?} out of object size {total}",
2657 last
2658 ));
2659 }
2660 Ok((start, end_inclusive + 1))
2661 }
2662 s3s::dto::Range::Suffix { length } => {
2663 let len = (*length).min(total);
2664 Ok((total - len, total))
2665 }
2666 }
2667}
2668
2669#[async_trait::async_trait]
2670impl<B: S3> S3 for S4Service<B> {
2671 // === 圧縮を挟む path (PUT) ===
2672 #[tracing::instrument(
2673 name = "s4.put_object",
2674 skip(self, req),
2675 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_in, bytes_out, latency_ms)
2676 )]
2677 async fn put_object(
2678 &self,
2679 mut req: S3Request<PutObjectInput>,
2680 ) -> S3Result<S3Response<PutObjectOutput>> {
2681 let put_start = Instant::now();
2682 let put_bucket = req.input.bucket.clone();
2683 let put_key = req.input.key.clone();
2684 // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
2685 self.check_not_reserved_key(&put_key, ReservedKeyMode::Mutating)?;
2686 let access_preamble = self.access_log_preamble(&req);
2687 self.enforce_rate_limit(&req, &put_bucket)?;
2688 // v0.6 #39: parse `x-amz-tagging` (URL-encoded query string) so
2689 // the IAM policy gate sees the request's tags via
2690 // `s3:RequestObjectTag/<key>`. `existing_object_tags` is also
2691 // resolved from the Tagging manager (when wired) so
2692 // `s3:ExistingObjectTag/<key>` works on overwrite.
2693 let request_tags: Option<crate::tagging::TagSet> = req
2694 .input
2695 .tagging
2696 .as_deref()
2697 .map(crate::tagging::parse_tagging_header)
2698 .transpose()
2699 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
2700 let existing_tags: Option<crate::tagging::TagSet> = self
2701 .tagging
2702 .as_ref()
2703 .and_then(|m| m.get_object_tags(&put_bucket, &put_key));
2704 self.enforce_policy_with_extra(
2705 &req,
2706 "s3:PutObject",
2707 &put_bucket,
2708 Some(&put_key),
2709 request_tags.as_ref(),
2710 existing_tags.as_ref(),
2711 )?;
2712 // v0.5 #30: an Object Lock-protected key cannot be overwritten by
2713 // a non-versioned PUT (Suspended / Unversioned bucket). Enabled
2714 // bucket PUTs are exempt because they materialise a fresh
2715 // version under a shadow key (`<key>.__s4ver__/<vid>`) — the
2716 // locked version's bytes are untouched. The check mirrors the
2717 // delete path (Compliance never bypassable, Governance via the
2718 // bypass header, legal hold never).
2719 if let Some(mgr) = self.object_lock.as_ref()
2720 && let Some(state) = mgr.get(&put_bucket, &put_key)
2721 {
2722 let bucket_versioned_enabled = self
2723 .versioning
2724 .as_ref()
2725 .map(|v| v.state(&put_bucket) == crate::versioning::VersioningState::Enabled)
2726 .unwrap_or(false);
2727 if !bucket_versioned_enabled {
2728 let bypass = parse_bypass_governance_header(&req.headers);
2729 let now = chrono::Utc::now();
2730 if !state.can_delete(now, bypass) {
2731 crate::metrics::record_policy_denial("s3:PutObject", &put_bucket);
2732 return Err(S3Error::with_message(
2733 S3ErrorCode::AccessDenied,
2734 "Access Denied because object protected by object lock",
2735 ));
2736 }
2737 }
2738 }
2739 // v0.5 #30: per-PUT explicit retention / legal hold (S3
2740 // `x-amz-object-lock-mode`, `x-amz-object-lock-retain-until-date`,
2741 // `x-amz-object-lock-legal-hold`). Captured before the body
2742 // moves into the backend; persisted into the manager only on
2743 // backend success below.
2744 let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
2745 .input
2746 .object_lock_mode
2747 .as_ref()
2748 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
2749 let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
2750 .input
2751 .object_lock_retain_until_date
2752 .as_ref()
2753 .and_then(timestamp_to_chrono_utc);
2754 let explicit_legal_hold_on: Option<bool> = req
2755 .input
2756 .object_lock_legal_hold_status
2757 .as_ref()
2758 .map(|s| s.as_str().eq_ignore_ascii_case("ON"));
2759 if let Some(blob) = req.input.body.take() {
2760 // v0.9 #106: parse client-supplied checksum headers
2761 // **before** awaiting any body bytes. A malformed
2762 // `Content-MD5` / `x-amz-checksum-*` value must surface
2763 // as `InvalidDigest` immediately so a slow / non-
2764 // delivering body cannot tie up the handler waiting on
2765 // bytes only to reject the request on a header-level
2766 // problem. The parsed `ClientChecksums` value is reused
2767 // by the streaming-framed branch below; the
2768 // bytes-buffered branch keeps its own
2769 // `verify_client_body_checksums` call which is idempotent
2770 // with this parse.
2771 let client_checksums = crate::streaming_checksum::ClientChecksums::from_request_fields(
2772 req.input.content_md5.as_deref(),
2773 req.input.checksum_crc32.as_deref(),
2774 req.input.checksum_crc32c.as_deref(),
2775 req.input.checksum_sha1.as_deref(),
2776 req.input.checksum_sha256.as_deref(),
2777 req.input.checksum_crc64nvme.as_deref(),
2778 )?;
2779 // Sample 4 KiB から codec を決定。streaming-aware codec なら streaming
2780 // compress fast path、そうでなければ従来の collect-then-compress。
2781 let (sample, rest_stream) = peek_sample(blob, SAMPLE_BYTES)
2782 .await
2783 .map_err(internal("peek put sample"))?;
2784 let sample_len = sample.len().min(SAMPLE_BYTES);
2785 // v0.8 #56: pass the request's Content-Length (when present) so
2786 // the sampling dispatcher can promote large objects to a GPU
2787 // codec. Chunked transfers (no Content-Length) keep CPU.
2788 let total_size_hint = req.input.content_length.and_then(|n| u64::try_from(n).ok());
2789 let kind = self
2790 .dispatcher
2791 .pick_with_size_hint(&sample[..sample_len], total_size_hint)
2792 .await;
2793
2794 // Passthrough buys nothing from S4F2 wrapping (no compression =
2795 // no per-chunk frame to skip past) and the +28-byte header
2796 // overhead breaks size-sensitive callers that expect a true
2797 // pass-through. So passthrough always uses the legacy raw-blob
2798 // path; only compressing codecs go through the framed path.
2799 //
2800 // v0.9 #106 — true streaming PUT checksum verify. The
2801 // streaming-framed path used to fail-open on client-supplied
2802 // whole-body checksums (`x-amz-checksum-{crc32, crc32c, sha1,
2803 // sha256, crc64nvme}` and `Content-MD5`): the v0.8.13 #127
2804 // attempt to "force buffered when any checksum header is
2805 // present" had to be reverted in v0.8.14 #129 because modern
2806 // AWS SDKs auto-attach `x-amz-checksum-crc32`, which made
2807 // every SDK PUT lose the streaming-framed path and therefore
2808 // its sidecar (range_get_falls_back_to_full_when_sidecar_etag_stale
2809 // + upload_part_copy_propagates_source_version_id failed on
2810 // CI). v0.9 #106 keeps the streaming-framed path and tees
2811 // each chunk into a multi-hasher (`streaming_checksum`
2812 // module) as it flows through the compressor. On EOF the
2813 // hashers are finalised and compared; a mismatch surfaces
2814 // as a synthetic `io::Error` carrying
2815 // `StreamingChecksumError` which we downcast back below and
2816 // map to a typed 400 BadDigest. Sidecar emission is
2817 // unaffected — the verifier sits **upstream** of
2818 // `streaming_compress_to_frames`, so on mismatch the call
2819 // returns Err and we never reach the backend write or
2820 // sidecar build, preserving the post-revert invariant.
2821 //
2822 // Scope: single-PUT cpu-zstd / passthrough only. Multipart
2823 // `upload_part` keeps its buffered per-part verify (the
2824 // part body is already in memory there for framing /
2825 // padding, so streaming verify wouldn't save anything).
2826 // GPU codecs (nvcomp-*) fall through to the buffered
2827 // branch below — they are bytes-buffered today and use the
2828 // existing `verify_client_body_checksums`.
2829 // (`client_checksums` was parsed before `peek_sample`
2830 // above so malformed values fail pre-stream.)
2831 //
2832 // v0.9 #106 trailer support: the chunked / SigV4-streaming
2833 // SDK case attaches the actual checksum value in the
2834 // request **trailers** (post-body). The `x-amz-trailer`
2835 // request header announces which algorithm(s) will follow;
2836 // we use it to decide which hashers to spin up at body
2837 // start so the digest is ready to compare once trailers
2838 // arrive. After the codec consumes the body we read
2839 // `req.trailing_headers` and run a deferred comparison
2840 // against the finalised digests via
2841 // `ComputedDigests::compare_b64` (see post-stream block
2842 // below). Without this, a bad trailer checksum on the
2843 // streaming-framed path would silently pass — same
2844 // fail-open shape this issue is closing, different
2845 // delivery mechanism.
2846 let trailer_hashers: crate::streaming_checksum::WhichHashers = req
2847 .headers
2848 .get("x-amz-trailer")
2849 .and_then(|v| v.to_str().ok())
2850 .map(crate::streaming_checksum::WhichHashers::from_trailer_header)
2851 .unwrap_or_default();
2852 let which_hashers = client_checksums.which_hashers().or(trailer_hashers);
2853 let use_framed = supports_streaming_compress(kind) && kind != CodecKind::Passthrough;
2854 let (compressed, manifest, is_framed) = if use_framed {
2855 // streaming fast path: input は memory に collect しない
2856 let chained = chain_sample_with_rest(sample, rest_stream);
2857 // v0.9 #106: tee the chained input through a multi-hasher
2858 // when ANY client checksum claim is present (header or
2859 // trailer). The wrapper is a no-op (and skipped
2860 // entirely) when neither side has work, so non-
2861 // checksummed PUTs keep their pre-#106 throughput.
2862 let (chained, digest_handle) = if which_hashers.any() {
2863 let (b, h) = crate::streaming_checksum::tee_into_hashers_with_handle(
2864 chained,
2865 client_checksums.clone(),
2866 which_hashers,
2867 );
2868 (b, Some(h))
2869 } else {
2870 (chained, None)
2871 };
2872 debug!(
2873 bucket = ?req.input.bucket,
2874 key = ?req.input.key,
2875 codec = kind.as_str(),
2876 path = "streaming-framed",
2877 client_checksum_verify = client_checksums.any(),
2878 "S4 put_object: compressing (streaming, S4F2 multi-frame)"
2879 );
2880 // v0.4 #16: pick the chunk size based on the request's
2881 // Content-Length when known, falling back to the 4 MiB
2882 // default for chunked transfers.
2883 let chunk_size = pick_chunk_size(req.input.content_length.map(|n| n as u64));
2884 // v0.8.4 #73 M2: pass the request's Content-Length so
2885 // streaming_compress_to_frames can fail-fast on a mid-PUT
2886 // truncation (client disconnect after sending half the
2887 // body). `None` is the chunked-Transfer-Encoding case
2888 // where the upstream genuinely doesn't know the size and
2889 // the backend's framing layer is the only truncation
2890 // signal we have.
2891 let expected_input_size =
2892 req.input.content_length.and_then(|n| u64::try_from(n).ok());
2893 let (body, manifest) = streaming_compress_to_frames(
2894 chained,
2895 Arc::clone(&self.registry),
2896 kind,
2897 chunk_size,
2898 expected_input_size,
2899 )
2900 .await
2901 .map_err(|e| match e {
2902 s4_codec::CodecError::TruncatedStream { expected, got } => {
2903 // 400 IncompleteBody: client advertised N bytes
2904 // but disconnected after `got`. Mirrors AWS S3's
2905 // canonical error code for the same shape so SDK
2906 // retries kick in instead of treating the PUT as
2907 // a successful upload of a half-body.
2908 S3Error::with_message(
2909 S3ErrorCode::IncompleteBody,
2910 format!("PUT body truncated: expected {expected} bytes, got {got}"),
2911 )
2912 }
2913 // v0.8.15 M-4: 400
2914 // `RequestBodyLengthMismatch` for over-length
2915 // bodies. AWS S3 returns this when the declared
2916 // `Content-Length` is smaller than the wire body;
2917 // S4 used to silently accept the surplus bytes.
2918 // `IncompleteBody` is the closest typed variant
2919 // in the s3s enum — we widen the message so the
2920 // SDK / curl side sees the shape unambiguously.
2921 s4_codec::CodecError::OverlengthStream { expected, got } => {
2922 let code = S3ErrorCode::from_bytes(b"RequestBodyLengthMismatch")
2923 .unwrap_or(S3ErrorCode::IncompleteBody);
2924 S3Error::with_message(
2925 code,
2926 format!(
2927 "PUT body length mismatch: Content-Length declared {expected} \
2928 bytes, body carried at least {got}"
2929 ),
2930 )
2931 }
2932 // v0.9 #106: streaming checksum mismatch — the tee
2933 // wrapper emitted a synthetic io::Error carrying
2934 // StreamingChecksumError. Downcast and remap to
2935 // BadDigest so the client sees the same response
2936 // the buffered path would have produced.
2937 s4_codec::CodecError::Io(ref io_err) => {
2938 if let Some(alg) =
2939 crate::streaming_checksum::extract_streaming_checksum_error(io_err)
2940 {
2941 let code = S3ErrorCode::from_bytes(b"BadDigest")
2942 .unwrap_or(S3ErrorCode::InvalidArgument);
2943 S3Error::with_message(
2944 code,
2945 format!("client-supplied {alg} did not match the received body"),
2946 )
2947 } else {
2948 internal("streaming framed compress")(e)
2949 }
2950 }
2951 other => internal("streaming framed compress")(other),
2952 })?;
2953 // v0.9 #106 trailer-deferred verify. Header claims
2954 // have already been compared eagerly inside the tee
2955 // at EOF (mismatch surfaces as `BadDigest` through
2956 // the `CodecError::Io` branch above). Now that the
2957 // body has been fully consumed, request trailers are
2958 // available — delegate to the shared trailer-verify
2959 // helper (also used by the buffered branch below,
2960 // see v0.9 #106-audit-R2 P2-INT-2).
2961 //
2962 // **Fail-closed when announced trailers are
2963 // missing**: if the client announced
2964 // `x-amz-trailer: x-amz-checksum-*` but did NOT
2965 // deliver the trailer value (or the trailers block
2966 // never arrived), the helper refuses the PUT with
2967 // `BadDigest`. Skipping the comparison in that case
2968 // would silently re-open the streaming fail-open
2969 // this issue closes — a client could declare an
2970 // integrity check and then omit the value to bypass
2971 // verification.
2972 if let Some(handle) = digest_handle.as_ref() {
2973 let announced = req
2974 .headers
2975 .get("x-amz-trailer")
2976 .and_then(|v| v.to_str().ok());
2977 // If the tee never finalised (computed is None)
2978 // the body was incomplete; the CodecError path
2979 // would have already surfaced — defensive belt
2980 // for any future refactor. We still need a
2981 // ComputedDigests instance to feed the helper
2982 // when trailers were announced, so synthesise
2983 // an empty one and let `compare_b64` reject
2984 // every claim as BadDigest (every algorithm
2985 // slot is None).
2986 let computed = handle
2987 .lock()
2988 .expect("digest handle lock poisoned")
2989 .clone()
2990 .unwrap_or_default();
2991 verify_client_trailer_checksums(
2992 announced,
2993 req.trailing_headers.as_ref(),
2994 &computed,
2995 )?;
2996 }
2997 (body, manifest, true)
2998 } else {
2999 // GPU codec 等で streaming-aware でないものは bytes-buffered path
3000 // (raw 圧縮 bytes、framed なし — back-compat 互換 path)
3001 let bytes = collect_with_sample(sample, rest_stream, self.max_body_bytes)
3002 .await
3003 .map_err(internal("collect put body (buffered path)"))?;
3004 // v0.8.12 HIGH-12 / #128 MED-C: verify all six AWS
3005 // checksum algorithms against the received body on
3006 // the buffered path. The streaming-framed branch
3007 // above redirects here when ANY checksum header is
3008 // present (#127 MED-B), so this is the single
3009 // checkpoint for client-supplied integrity.
3010 verify_client_body_checksums(
3011 &bytes,
3012 req.input.content_md5.as_deref(),
3013 req.input.checksum_crc32.as_deref(),
3014 req.input.checksum_crc32c.as_deref(),
3015 req.input.checksum_sha1.as_deref(),
3016 req.input.checksum_sha256.as_deref(),
3017 req.input.checksum_crc64nvme.as_deref(),
3018 )?;
3019 // v0.9 #106-audit-R2 P2-INT-2: SigV4-streaming trailer
3020 // checksums must verify on the buffered path too. Pre-fix
3021 // the streaming-framed branch above handled
3022 // `x-amz-trailer` while this branch silently dropped
3023 // it — a client could PUT through a GPU codec / non-
3024 // streaming dispatch and bypass trailer verification.
3025 // We have the full body in memory here, so a one-shot
3026 // `compute_digests` followed by the shared
3027 // `verify_client_trailer_checksums` helper closes the
3028 // gap. The hasher selector is derived from the same
3029 // `x-amz-trailer` header parser the streaming branch
3030 // uses (`WhichHashers::from_trailer_header`).
3031 if let Some(announced) = req
3032 .headers
3033 .get("x-amz-trailer")
3034 .and_then(|v| v.to_str().ok())
3035 {
3036 let which =
3037 crate::streaming_checksum::WhichHashers::from_trailer_header(announced);
3038 if which.any() {
3039 let computed = crate::streaming_checksum::compute_digests(&bytes, which);
3040 verify_client_trailer_checksums(
3041 Some(announced),
3042 req.trailing_headers.as_ref(),
3043 &computed,
3044 )?;
3045 } else {
3046 // Header announced only non-checksum trailers
3047 // (e.g. `x-amz-trailer-signature`). The helper
3048 // would return Ok in that case — invoke it
3049 // anyway for symmetry with the streaming branch
3050 // so a future change to the filter logic stays
3051 // wired through both paths.
3052 verify_client_trailer_checksums(
3053 Some(announced),
3054 req.trailing_headers.as_ref(),
3055 &crate::streaming_checksum::ComputedDigests::default(),
3056 )?;
3057 }
3058 }
3059 debug!(
3060 bucket = ?req.input.bucket,
3061 key = ?req.input.key,
3062 bytes = bytes.len(),
3063 codec = kind.as_str(),
3064 path = "buffered",
3065 "S4 put_object: compressing (buffered, raw blob)"
3066 );
3067 // v0.8 #55: telemetry-returning compress so we can stamp
3068 // GPU-pipeline Prometheus metrics (`s4_gpu_compress_seconds`,
3069 // throughput gauge, OOM counter) for nvcomp / dietgpu codecs.
3070 // CPU codecs come back with `gpu_seconds = None` and the
3071 // stamp helper short-circuits — no extra cost on CPU path.
3072 let (compress_res, tel) = self.registry.compress_with_telemetry(bytes, kind).await;
3073 stamp_gpu_compress_telemetry(&tel);
3074 let (body, m) = compress_res.map_err(internal("registry compress"))?;
3075 (body, m, false)
3076 };
3077
3078 write_manifest(&mut req.input.metadata, &manifest);
3079 if is_framed {
3080 // v0.2 #4: framed body であることを GET 側に伝える meta flag。
3081 req.input
3082 .metadata
3083 .get_or_insert_with(Default::default)
3084 .insert(META_FRAMED.into(), "true".into());
3085 }
3086 // 重要: content_length を圧縮後サイズで更新する。
3087 // これを忘れると下流 (aws-sdk-s3 → S3) が宣言サイズ分の bytes を
3088 // 待ち続けて RequestTimeout で失敗する (S3 仕様)。
3089 req.input.content_length = Some(compressed.len() as i64);
3090 // body を書き換えたので、客側が送ってきた original body 用の
3091 // checksum / MD5 ヘッダは無効化する (そのまま転送すると下流 S3 が
3092 // XAmzContentChecksumMismatch を返す)。S4 自身の整合性は
3093 // ChunkManifest.crc32c で担保している。
3094 req.input.checksum_algorithm = None;
3095 req.input.checksum_crc32 = None;
3096 req.input.checksum_crc32c = None;
3097 req.input.checksum_crc64nvme = None;
3098 req.input.checksum_sha1 = None;
3099 req.input.checksum_sha256 = None;
3100 req.input.content_md5 = None;
3101 let original_size = manifest.original_size;
3102 let compressed_size = manifest.compressed_size;
3103 let codec_label = manifest.codec.as_str();
3104 // (sidecar_index is built below, after the SSE-mode
3105 // extraction, so v0.8.12 HIGH-10 can short-circuit the
3106 // build when the on-disk bytes are about to be encrypted.)
3107 // v0.4 #21 / v0.5 #29 / v0.5 #27: encrypt-after-compress.
3108 // Precedence:
3109 // - SSE-C headers present → per-request customer key (S4E3)
3110 // - server-managed keyring configured → active key (S4E2)
3111 // - neither → no encryption (raw compressed body)
3112 // The `s4-encrypted: aes-256-gcm` metadata flag is set in
3113 // both encrypted modes; the on-disk frame magic distinguishes
3114 // S4E1 / S4E2 / S4E3 so GET picks the right decrypt path.
3115 // v0.7 #48 BUG-2/3 fix: take() the SSE fields off req.input
3116 // so the encryption headers are NOT forwarded to the
3117 // backend. S4 owns the encrypt-then-store contract; if we
3118 // leave the headers in place, real S3-compat backends
3119 // (MinIO / AWS) try to apply their own SSE on top and
3120 // either reject (MinIO requires HTTPS for SSE-C) or fail
3121 // (MinIO has no KMS configured). MemoryBackend ignored
3122 // these so mock tests passed.
3123 let sse_c_alg = req.input.sse_customer_algorithm.take();
3124 let sse_c_key = req.input.sse_customer_key.take();
3125 let sse_c_md5 = req.input.sse_customer_key_md5.take();
3126 let sse_header = req.input.server_side_encryption.take();
3127 let sse_kms_key = req.input.ssekms_key_id.take();
3128 let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3129 // v0.5 #28: SSE-KMS request? Resolves to None unless the
3130 // request asks for `aws:kms` AND a key id is available
3131 // (explicit header or gateway default). When set, we'll
3132 // generate a per-object DEK below.
3133 let kms_key_id = extract_kms_key_id(
3134 &sse_header,
3135 &sse_kms_key,
3136 self.kms_default_key_id.as_deref(),
3137 );
3138 // v0.8.12 HIGH-10 fix: the sidecar offsets describe the
3139 // pre-encrypt `compressed` body, but the bytes the
3140 // backend stores when any SSE mode is active are
3141 // *post-encrypt* (different length, different layout).
3142 // A Range GET on an SSE-encrypted object would slice the
3143 // ciphertext at the stale offsets, hand the wrong bytes
3144 // to the frame parser, and 500. Suppress the sidecar
3145 // entirely when SSE is going to be applied below;
3146 // encrypted-object Range GET falls back to the buffered
3147 // path (decrypt full body → frame parse → slice), trading
3148 // partial-fetch performance for correctness.
3149 //
3150 // v0.9 #106 (encryption-aware sidecar): re-enable sidecar
3151 // emission for the **SSE-S4 chunked (S4E6) path only** —
3152 // S4E6 chunks are per-chunk independently sealed so the
3153 // GET path can compute encrypted byte ranges, partial-fetch
3154 // just the needed chunks, decrypt + frame-parse + slice.
3155 // The pre-encrypt `compressed` offsets in the sidecar are
3156 // still load-bearing (the GET path decrypts into the
3157 // pre-encrypt domain before frame-parsing), with the new
3158 // v3 SSE binding (`sse_v3`) stamped below once the
3159 // encrypt path runs and reveals the per-PUT salt /
3160 // chunk_count / key_id. SSE-KMS / SSE-C / S4E2 buffered
3161 // (`--sse-chunk-size 0`) keep the v0.8.12 #120 buffered
3162 // fallback (= sidecar suppressed) — multi-mode plumbing
3163 // is the v0.10+ roadmap.
3164 let will_encrypt =
3165 sse_c_material.is_some() || kms_key_id.is_some() || self.sse_keyring.is_some();
3166 let sse_s4_chunked_path = sse_c_material.is_none()
3167 && kms_key_id.is_none()
3168 && self.sse_keyring.is_some()
3169 && self.sse_chunk_size > 0;
3170 let sidecar_index = if is_framed && (!will_encrypt || sse_s4_chunked_path) {
3171 s4_codec::index::build_index_from_body(&compressed).ok()
3172 } else {
3173 None
3174 };
3175 // v0.5 #32: in compliance-strict mode, every PUT must
3176 // declare SSE — either client-supplied (SSE-C), KMS, or by
3177 // virtue of a server-side keyring being configured (which
3178 // applies SSE-S4 to every PUT automatically). Requests that
3179 // would otherwise land as plain compressed bytes are
3180 // rejected with 400 InvalidRequest.
3181 if self.compliance_strict
3182 && sse_c_material.is_none()
3183 && kms_key_id.is_none()
3184 && self.sse_keyring.is_none()
3185 && sse_header.as_ref().map(|s| s.as_str()) != Some(ServerSideEncryption::AES256)
3186 {
3187 return Err(S3Error::with_message(
3188 S3ErrorCode::InvalidRequest,
3189 "compliance-mode strict: PUT must include x-amz-server-side-encryption \
3190 (AES256 or aws:kms) or x-amz-server-side-encryption-customer-* headers",
3191 ));
3192 }
3193 // SSE-C and SSE-KMS are mutually exclusive on a single PUT, so
3194 // reject with 400 InvalidArgument (matching AWS S3) instead of picking one.
3195 if sse_c_material.is_some() && kms_key_id.is_some() {
3196 return Err(S3Error::with_message(
3197 S3ErrorCode::InvalidArgument,
3198 "SSE-C and SSE-KMS cannot be used together on the same PUT",
3199 ));
3200 }
3201 // KMS path needs to call generate_dek().await before the
3202 // body_to_send branch; capture the result here.
3203 //
3204 // v0.8.1 #58: the plaintext DEK lives in three places
3205 // during one PUT:
3206 //
3207 // 1. The `Zeroizing<Vec<u8>>` returned by `generate_dek`
3208 // — wiped when the binding `dek` falls out of scope at
3209 // the end of this `if`-arm.
3210 // 2. The stack `[u8; 32]` we copy into for `SseSource::Kms`
3211 // — wrapped in `Zeroizing<[u8; 32]>` so it's wiped when
3212 // the outer `kms_wrap` `Option` is dropped at the end
3213 // of `put_object`.
3214 // 3. AES-GCM internal key state inside the `aes-gcm`
3215 // crate during `encrypt_with_source` — out of scope
3216 // for this fix; tracked separately in v0.8.2.
3217 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
3218 if let Some(ref key_id) = kms_key_id {
3219 let kms = self.kms.as_ref().ok_or_else(|| {
3220 S3Error::with_message(
3221 S3ErrorCode::InvalidRequest,
3222 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
3223 )
3224 })?;
3225 // `dek` is `Zeroizing<Vec<u8>>`; deref + slice access
3226 // works unchanged via `Deref<Target=Vec<u8>>`.
3227 let (dek, wrapped) = kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
3228 if dek.len() != 32 {
3229 return Err(S3Error::with_message(
3230 S3ErrorCode::InternalError,
3231 format!(
3232 "KMS backend returned a DEK of {} bytes (expected 32)",
3233 dek.len()
3234 ),
3235 ));
3236 }
3237 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
3238 zeroize::Zeroizing::new([0u8; 32]);
3239 dek_arr.copy_from_slice(&dek);
3240 // `dek` (the `Zeroizing<Vec<u8>>`) is dropped at the
3241 // end of this scope, wiping the heap allocation.
3242 Some((dek_arr, wrapped))
3243 } else {
3244 None
3245 };
3246 // v0.7 #48 BUG-4 fix: stamp the SSE *type* into metadata
3247 // alongside `s4-encrypted` so HEAD (which doesn't fetch the
3248 // body) can echo the correct `x-amz-server-side-encryption`
3249 // value. Without this, HEAD on an SSE-KMS object would not
3250 // echo `aws:kms` because the frame magic is only available
3251 // on the body (which HEAD doesn't read).
3252 let body_to_send = if let Some(ref m) = sse_c_material {
3253 let meta = req.input.metadata.get_or_insert_with(Default::default);
3254 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
3255 meta.insert("s4-sse-type".into(), "AES256".into());
3256 meta.insert(
3257 "s4-sse-c-key-md5".into(),
3258 base64::engine::general_purpose::STANDARD.encode(m.key_md5),
3259 );
3260 crate::sse::encrypt_with_source(
3261 &compressed,
3262 crate::sse::SseSource::CustomerKey {
3263 key: &m.key,
3264 key_md5: &m.key_md5,
3265 },
3266 )
3267 } else if let Some((ref dek, ref wrapped)) = kms_wrap {
3268 let meta = req.input.metadata.get_or_insert_with(Default::default);
3269 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
3270 meta.insert("s4-sse-type".into(), "aws:kms".into());
3271 meta.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
3272 // v0.8.1 #58: `dek` is `&Zeroizing<[u8; 32]>`; `SseSource::Kms`
3273 // wants `&[u8; 32]`. Rust auto-derefs `&Zeroizing<T>` to
3274 // `&T` here via `Deref<Target=T>`, so the binding picks
3275 // up the inner array reference without copying. The array
3276 // stays in the `Zeroizing` wrapper that owns it and gets
3277 // wiped when `kms_wrap` drops at the end of `put_object`.
3278 let dek_ref: &[u8; 32] = dek;
3279 crate::sse::encrypt_with_source(
3280 &compressed,
3281 crate::sse::SseSource::Kms {
3282 dek: dek_ref,
3283 wrapped,
3284 },
3285 )
3286 } else if let Some(keyring) = self.sse_keyring.as_ref() {
3287 // SSE-S4 is server-driven transparent encryption; the
3288 // client didn't ask for SSE. We stamp `s4-encrypted`
3289 // (internal flag the GET path needs) but deliberately
3290 // do NOT stamp `s4-sse-type` — that lights up the HEAD
3291 // echo of `x-amz-server-side-encryption: AES256`,
3292 // which would falsely advertise AWS-style SSE-S3
3293 // semantics the operator didn't request.
3294 let meta = req.input.metadata.get_or_insert_with(Default::default);
3295 meta.insert("s4-encrypted".into(), "aes-256-gcm".into());
3296 // v0.8 #52: when `--sse-chunk-size > 0` is configured,
3297 // emit the chunked S4E6 frame so the matching GET can
3298 // stream-decrypt instead of buffering 5 GiB before
3299 // emitting a byte. Falls back to the buffered S4E2
3300 // frame at chunk_size=0 (default) so existing
3301 // deployments are bit-for-bit unchanged.
3302 if self.sse_chunk_size > 0 {
3303 crate::sse::encrypt_v2_chunked(&compressed, keyring, self.sse_chunk_size)
3304 .map_err(|e| {
3305 S3Error::with_message(
3306 S3ErrorCode::InternalError,
3307 format!("SSE-S4 chunked encrypt failed: {e}"),
3308 )
3309 })?
3310 } else {
3311 crate::sse::encrypt_v2(&compressed, keyring)
3312 }
3313 } else {
3314 compressed.clone()
3315 };
3316 // v0.9 #106: when the SSE-S4 chunked path ran (and only
3317 // that path — SSE-KMS / SSE-C / S4E2 buffered keep the
3318 // buffered fallback), parse the S4E6 header bytes back
3319 // out of `body_to_send` to recover the per-PUT salt /
3320 // key_id / chunk_count and stamp them onto the sidecar's
3321 // SSE binding. The salt isn't secret (it lives in the
3322 // encrypted body's plaintext header) so duplicating it
3323 // in the sidecar saves the GET path an extra HEAD/GET to
3324 // re-derive it. `parse_s4e6_header` reads the fixed-
3325 // layout fields only — any failure leaves `sse_binding`
3326 // as `None`, which falls through to the legacy buffered
3327 // fallback on GET (= safe degradation, not corruption).
3328 let sse_binding: Option<s4_codec::index::SseChunkBinding> = if sse_s4_chunked_path {
3329 match crate::sse::parse_s4e6_header(&body_to_send) {
3330 Ok(hdr) => Some(s4_codec::index::SseChunkBinding {
3331 enc_chunk_size: hdr.chunk_size,
3332 enc_chunk_count: hdr.chunk_count,
3333 enc_key_id: hdr.key_id,
3334 enc_salt: *hdr.salt,
3335 enc_plaintext_len: compressed.len() as u64,
3336 // S4E6_HEADER_BYTES = 24 today; carried
3337 // explicitly so a future bump (e.g. S4E7
3338 // with a different fixed-header size) can't
3339 // silently break v3 sidecar decode.
3340 enc_header_bytes: crate::sse::S4E6_HEADER_BYTES as u32,
3341 }),
3342 Err(e) => {
3343 tracing::warn!(
3344 bucket = %put_bucket,
3345 key = %put_key,
3346 "S4 sidecar SSE-binding stamp failed (Range GET will fall back \
3347 to buffered): {e}"
3348 );
3349 None
3350 }
3351 }
3352 } else {
3353 None
3354 };
3355 // v0.6 #40: capture the about-to-be-sent body + metadata so
3356 // the replication dispatcher (run after the source PUT
3357 // succeeds) can hand the same backend bytes to the
3358 // destination bucket. `Bytes` clone is cheap (refcounted).
3359 let replication_body = body_to_send.clone();
3360 let replication_metadata = req.input.metadata.clone();
3361 // v0.7 #48 BUG-1 fix: SSE encryption (S4E1/E2/E3/E4 frames)
3362 // makes the body longer than the post-compression bytes
3363 // (header + nonce + tag overhead). The earlier
3364 // content_length stamp at compressed.len() is now stale, so
3365 // re-stamp from the actual bytes about to be sent or the
3366 // backend (real S3 / MinIO) rejects with
3367 // `StreamLengthMismatch`. MemoryBackend never validated
3368 // this, which is why mock-only tests passed.
3369 req.input.content_length = Some(body_to_send.len() as i64);
3370 req.input.body = Some(bytes_to_blob(body_to_send));
3371 // v0.5 #34: pre-allocate a version-id when the bucket is
3372 // Enabled, then redirect the backend storage key to the
3373 // shadow path so older versions survive newer PUTs.
3374 // Suspended / Unversioned buckets keep using the plain
3375 // `<key>` (S3 spec: Suspended overwrites the same backend
3376 // object). Pre-allocation (instead of recording after PUT)
3377 // ensures the shadow key + the response's
3378 // `x-amz-version-id` use the same vid.
3379 let pending_version: Option<crate::versioning::PutOutcome> = self
3380 .versioning
3381 .as_ref()
3382 .map(|mgr| mgr.state(&put_bucket))
3383 .map(|state| match state {
3384 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3385 version_id: crate::versioning::VersioningManager::new_version_id(),
3386 versioned_response: true,
3387 },
3388 crate::versioning::VersioningState::Suspended
3389 | crate::versioning::VersioningState::Unversioned => {
3390 crate::versioning::PutOutcome {
3391 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3392 versioned_response: false,
3393 }
3394 }
3395 });
3396 if let Some(ref pv) = pending_version
3397 && pv.versioned_response
3398 {
3399 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3400 }
3401 // v0.8.4 #73 H-2: capture the to-be-stored body length BEFORE
3402 // the move into `req.input` is consumed by the backend call.
3403 // The sidecar's `source_compressed_size` is checked against
3404 // the live HEAD `Content-Length` on Range GET to detect a
3405 // backend-side mutation.
3406 let backend_object_size = req.input.content_length.and_then(|n| u64::try_from(n).ok());
3407 let mut backend_resp = self.backend.put_object(req).await;
3408 // v0.9 #106 (Codex P2): on the SSE-S4 chunked PUT path,
3409 // if we *couldn't* recover the per-PUT salt / key_id /
3410 // chunk_count (= `sse_binding.is_none()`), we MUST NOT
3411 // emit any sidecar — the bytes on disk are S4E6-encrypted
3412 // and the offsets in `sidecar_index` are pre-encrypt. A
3413 // v2 sidecar (sans SSE binding) would skip the encryption-
3414 // aware GET fast-path AND skip the v0.8.12 #120 buffered
3415 // fallback (the GET path treats a present sidecar as
3416 // "use partial_range_get on the backend body"), so it
3417 // would slice ciphertext at plaintext offsets, hand wrong
3418 // bytes to the frame parser, and 500 (or worse, return
3419 // garbage that decodes by accident). Drop the sidecar so
3420 // the GET falls back to buffered = correct.
3421 let suppress_sidecar_for_failed_sse_binding =
3422 sse_s4_chunked_path && sse_binding.is_none();
3423 if let Some(mut idx) = sidecar_index
3424 && let Ok(ref resp) = backend_resp
3425 && idx.entries.len() > 1
3426 && !suppress_sidecar_for_failed_sse_binding
3427 {
3428 // 1 chunk しかない (small object) なら sidecar は意味がない (=
3429 // partial fetch しても full body と同じ範囲) ので省略。
3430 // Sidecar は user-visible key で書く (latest version の
3431 // partial fetch path 用)。Old versions の Range GET は今 task
3432 // の scope 外 (full read fallback でも意味的には正しい)。
3433 //
3434 // v0.8.4 #73 H-2: stamp the version-binding fields the
3435 // GET path needs to detect a stale / attacker-written
3436 // sidecar. ETag comes from the backend's PUT response —
3437 // when missing (some backends don't return an ETag)
3438 // `source_etag` stays `None`; no identifier is synthesized
3439 // in this block, so the sidecar then binds only via
3440 // `source_compressed_size`. The GET-side check is expected
3441 // to treat the pair (None vs None) as consistent.
3442 let source_etag = resp.output.e_tag.as_ref().map(|t| t.value().to_string());
3443 idx.source_etag = source_etag;
3444 idx.source_compressed_size = backend_object_size;
3445 // v0.9 #106: stamp the SSE chunked binding so the GET
3446 // path can run the encrypted Range partial-fetch
3447 // fast-path. `None` keeps the sidecar at v2 layout
3448 // (= existing behaviour for non-SSE-S4-chunked PUTs).
3449 idx.sse_v3 = sse_binding;
3450 self.write_sidecar(&put_bucket, &put_key, &idx).await;
3451 }
3452 // v0.5 #34: commit the new version into the manager only on
3453 // backend success. Use the pre-allocated vid so the response
3454 // header and the chain entry agree.
3455 if let (Some(mgr), Some(pv), Ok(resp)) = (
3456 self.versioning.as_ref(),
3457 pending_version.as_ref(),
3458 backend_resp.as_mut(),
3459 ) {
3460 let etag = resp
3461 .output
3462 .e_tag
3463 .clone()
3464 .map(ETag::into_value)
3465 .unwrap_or_else(|| format!("\"crc32c-{}\"", manifest.crc32c));
3466 let now = chrono::Utc::now();
3467 mgr.commit_put_with_version(
3468 &put_bucket,
3469 &put_key,
3470 crate::versioning::VersionEntry {
3471 version_id: pv.version_id.clone(),
3472 etag,
3473 size: original_size,
3474 is_delete_marker: false,
3475 created_at: now,
3476 },
3477 );
3478 if pv.versioned_response {
3479 resp.output.version_id = Some(pv.version_id.clone());
3480 }
3481 }
3482 // v0.5 #27: AWS S3 echoes the SSE-C headers back on success
3483 // so the client knows the server actually applied the
3484 // requested algorithm and which key fingerprint matched.
3485 if let (Some(m), Ok(resp)) = (sse_c_material.as_ref(), backend_resp.as_mut()) {
3486 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
3487 resp.output.sse_customer_key_md5 =
3488 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
3489 }
3490 // v0.5 #28: SSE-KMS echo — `aws:kms` + the canonical key id
3491 // the backend returned (AWS KMS returns the ARN even when
3492 // the request used an alias).
3493 if let (Some((_, wrapped)), Ok(resp)) = (kms_wrap.as_ref(), backend_resp.as_mut()) {
3494 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
3495 ServerSideEncryption::AWS_KMS,
3496 ));
3497 resp.output.ssekms_key_id = Some(wrapped.key_id.clone());
3498 }
3499 // v0.5 #30: persist any per-PUT explicit retention / legal
3500 // hold the client supplied, then auto-apply the bucket
3501 // default (no-op when state is already populated). The
3502 // explicit fields take precedence — the bucket-default
3503 // helper bails out as soon as it sees any retention.
3504 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3505 if explicit_lock_mode.is_some()
3506 || explicit_retain_until.is_some()
3507 || explicit_legal_hold_on.is_some()
3508 {
3509 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3510 if let Some(m) = explicit_lock_mode {
3511 state.mode = Some(m);
3512 }
3513 if let Some(u) = explicit_retain_until {
3514 state.retain_until = Some(u);
3515 }
3516 if let Some(lh) = explicit_legal_hold_on {
3517 state.legal_hold_on = lh;
3518 }
3519 mgr.set(&put_bucket, &put_key, state);
3520 }
3521 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3522 }
3523 let _ = (original_size, compressed_size); // mute unused warnings
3524 let elapsed = put_start.elapsed();
3525 crate::metrics::record_put(
3526 codec_label,
3527 original_size,
3528 compressed_size,
3529 elapsed.as_secs_f64(),
3530 backend_resp.is_ok(),
3531 );
3532 // v0.4 #20: structured access-log entry (best-effort).
3533 self.record_access(
3534 access_preamble,
3535 "REST.PUT.OBJECT",
3536 &put_bucket,
3537 Some(&put_key),
3538 if backend_resp.is_ok() { 200 } else { 500 },
3539 compressed_size,
3540 original_size,
3541 elapsed.as_millis() as u64,
3542 backend_resp.as_ref().err().map(|e| e.code().as_str()),
3543 )
3544 .await;
3545 info!(
3546 op = "put_object",
3547 bucket = %put_bucket,
3548 key = %put_key,
3549 codec = codec_label,
3550 bytes_in = original_size,
3551 bytes_out = compressed_size,
3552 ratio = format!(
3553 "{:.3}",
3554 if original_size == 0 { 1.0 } else { compressed_size as f64 / original_size as f64 }
3555 ),
3556 latency_ms = elapsed.as_millis() as u64,
3557 ok = backend_resp.is_ok(),
3558 "S4 put completed"
3559 );
3560 // v0.6 #35: fire bucket-notification destinations (best-effort,
3561 // detached). Skipped when no manager is attached or when the
3562 // bucket has no rule matching `s3:ObjectCreated:Put` for this
3563 // key.
3564 if backend_resp.is_ok()
3565 && let Some(mgr) = self.notifications.as_ref()
3566 {
3567 let dests = mgr.match_destinations(
3568 &put_bucket,
3569 &crate::notifications::EventType::ObjectCreatedPut,
3570 &put_key,
3571 );
3572 if !dests.is_empty() {
3573 let etag = backend_resp
3574 .as_ref()
3575 .ok()
3576 .and_then(|r| r.output.e_tag.clone())
3577 .map(ETag::into_value);
3578 let version_id = pending_version
3579 .as_ref()
3580 .filter(|pv| pv.versioned_response)
3581 .map(|pv| pv.version_id.clone());
3582 tokio::spawn(crate::notifications::dispatch_event(
3583 Arc::clone(mgr),
3584 put_bucket.clone(),
3585 put_key.clone(),
3586 crate::notifications::EventType::ObjectCreatedPut,
3587 Some(original_size),
3588 etag,
3589 version_id,
3590 format!("S4-{}", uuid::Uuid::new_v4()),
3591 ));
3592 }
3593 }
3594 // v0.6 #39: persist parsed `x-amz-tagging` tags into the
3595 // tagging manager on a successful PUT. AWS PutObject's
3596 // tagging is a full-replace operation (not a merge), so
3597 // any pre-existing entry for `(bucket, key)` is overwritten.
3598 if backend_resp.is_ok()
3599 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3600 {
3601 mgr.put_object_tags(&put_bucket, &put_key, tags);
3602 }
3603 // v0.6 #40: cross-bucket replication fire-point. On
3604 // successful source PUT, consult the replication manager;
3605 // when an enabled rule matches, mark the source key
3606 // `Pending` and spawn a detached task that PUTs the same
3607 // backend bytes + metadata to the rule's destination
3608 // bucket. The dispatcher itself records `Completed` /
3609 // `Failed` and bumps the drop counter on retry-budget
3610 // exhaustion.
3611 self.spawn_replication_if_matched(
3612 &put_bucket,
3613 &put_key,
3614 &request_tags,
3615 &replication_body,
3616 &replication_metadata,
3617 backend_resp.is_ok(),
3618 pending_version.as_ref(),
3619 );
3620 return backend_resp;
3621 }
3622 // Body-less PUT (rare: zero-length object). Mirror the body-full
3623 // versioning hooks so list_object_versions / GET-by-version still see
3624 // empty-body objects in the chain.
3625 let pending_version: Option<crate::versioning::PutOutcome> = self
3626 .versioning
3627 .as_ref()
3628 .map(|mgr| mgr.state(&put_bucket))
3629 .map(|state| match state {
3630 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
3631 version_id: crate::versioning::VersioningManager::new_version_id(),
3632 versioned_response: true,
3633 },
3634 _ => crate::versioning::PutOutcome {
3635 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
3636 versioned_response: false,
3637 },
3638 });
3639 if let Some(ref pv) = pending_version
3640 && pv.versioned_response
3641 {
3642 req.input.key = versioned_shadow_key(&put_key, &pv.version_id);
3643 }
3644 let mut backend_resp = self.backend.put_object(req).await;
3645 if let (Some(mgr), Some(pv), Ok(resp)) = (
3646 self.versioning.as_ref(),
3647 pending_version.as_ref(),
3648 backend_resp.as_mut(),
3649 ) {
3650 let etag = resp
3651 .output
3652 .e_tag
3653 .clone()
3654 .map(ETag::into_value)
3655 .unwrap_or_default();
3656 let now = chrono::Utc::now();
3657 mgr.commit_put_with_version(
3658 &put_bucket,
3659 &put_key,
3660 crate::versioning::VersionEntry {
3661 version_id: pv.version_id.clone(),
3662 etag,
3663 size: 0,
3664 is_delete_marker: false,
3665 created_at: now,
3666 },
3667 );
3668 if pv.versioned_response {
3669 resp.output.version_id = Some(pv.version_id.clone());
3670 }
3671 }
3672 // v0.5 #30: same explicit-then-default lock-state commit as the
3673 // body-bearing branch above, so a zero-length PUT also picks up
3674 // bucket-default retention.
3675 if let (Some(mgr), Ok(_)) = (self.object_lock.as_ref(), backend_resp.as_ref()) {
3676 if explicit_lock_mode.is_some()
3677 || explicit_retain_until.is_some()
3678 || explicit_legal_hold_on.is_some()
3679 {
3680 let mut state = mgr.get(&put_bucket, &put_key).unwrap_or_default();
3681 if let Some(m) = explicit_lock_mode {
3682 state.mode = Some(m);
3683 }
3684 if let Some(u) = explicit_retain_until {
3685 state.retain_until = Some(u);
3686 }
3687 if let Some(lh) = explicit_legal_hold_on {
3688 state.legal_hold_on = lh;
3689 }
3690 mgr.set(&put_bucket, &put_key, state);
3691 }
3692 mgr.apply_default_on_put(&put_bucket, &put_key, chrono::Utc::now());
3693 }
3694 // v0.6 #35: same notification fire-point as the body-bearing PUT
3695 // branch above (zero-length objects still match `ObjectCreated:Put`
3696 // rules per the AWS event taxonomy).
3697 if backend_resp.is_ok()
3698 && let Some(mgr) = self.notifications.as_ref()
3699 {
3700 let dests = mgr.match_destinations(
3701 &put_bucket,
3702 &crate::notifications::EventType::ObjectCreatedPut,
3703 &put_key,
3704 );
3705 if !dests.is_empty() {
3706 let etag = backend_resp
3707 .as_ref()
3708 .ok()
3709 .and_then(|r| r.output.e_tag.clone())
3710 .map(ETag::into_value);
3711 let version_id = pending_version
3712 .as_ref()
3713 .filter(|pv| pv.versioned_response)
3714 .map(|pv| pv.version_id.clone());
3715 tokio::spawn(crate::notifications::dispatch_event(
3716 Arc::clone(mgr),
3717 put_bucket.clone(),
3718 put_key.clone(),
3719 crate::notifications::EventType::ObjectCreatedPut,
3720 Some(0),
3721 etag,
3722 version_id,
3723 format!("S4-{}", uuid::Uuid::new_v4()),
3724 ));
3725 }
3726 }
3727 // v0.6 #39: persist parsed `x-amz-tagging` for the body-less
3728 // (zero-length) PUT branch too — same shape as the body-bearing
3729 // branch above.
3730 if backend_resp.is_ok()
3731 && let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), request_tags.clone())
3732 {
3733 mgr.put_object_tags(&put_bucket, &put_key, tags);
3734 }
3735 // v0.6 #40: cross-bucket replication for the zero-length PUT
3736 // branch — same shape as the body-bearing branch above.
3737 // v0.8.2 #61: pass `pending_version` so a versioned source's
3738 // destination receives the same shadow-key path.
3739 self.spawn_replication_if_matched(
3740 &put_bucket,
3741 &put_key,
3742 &request_tags,
3743 &bytes::Bytes::new(),
3744 &None,
3745 backend_resp.is_ok(),
3746 pending_version.as_ref(),
3747 );
3748 backend_resp
3749 }
3750
3751 // === Decompression path (GET) ===
3752 #[tracing::instrument(
3753 name = "s4.get_object",
3754 skip(self, req),
3755 fields(bucket = %req.input.bucket, key = %req.input.key, codec, bytes_out, range, path)
3756 )]
3757 async fn get_object(
3758 &self,
3759 mut req: S3Request<GetObjectInput>,
3760 ) -> S3Result<S3Response<GetObjectOutput>> {
3761 let get_start = Instant::now();
3762 let get_bucket = req.input.bucket.clone();
3763 let get_key = req.input.key.clone();
3764 // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
3765 self.check_not_reserved_key(&get_key, ReservedKeyMode::Read)?;
3766 self.enforce_rate_limit(&req, &get_bucket)?;
3767 self.enforce_policy(&req, "s3:GetObject", &get_bucket, Some(&get_key))?;
3768 // Range request の事前検出 (decompress 後 slice する path に使う)。
3769 let range_request = req.input.range.take();
3770 // v0.5 #27: pull SSE-C material from the input headers before
3771 // the request is moved into the backend. A header parse error
3772 // fails fast (no body fetch). The material is consumed below
3773 // when decrypting an S4E3-framed body; the SSE-C headers on
3774 // `req.input` are cleared so the backend doesn't see them.
3775 let sse_c_alg = req.input.sse_customer_algorithm.take();
3776 let sse_c_key = req.input.sse_customer_key.take();
3777 let sse_c_md5 = req.input.sse_customer_key_md5.take();
3778 let get_sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
3779
3780 // v0.5 #34: route the GET through the VersioningManager when
3781 // attached AND the bucket is in a versioning-aware state.
3782 // Resolves which version to fetch (explicit `?versionId=` query
3783 // param vs. chain latest), translates a delete-marker into 404
3784 // NoSuchKey, and rewrites the backend storage key to the shadow
3785 // path (`<key>.__s4ver__/<vid>`) for non-null Enabled-bucket
3786 // versions. `resolved_version_id` is stamped onto the response
3787 // so clients see a coherent `x-amz-version-id` header.
3788 //
3789 // When the bucket is Unversioned (or no manager attached), the
3790 // chain-resolution step is skipped and the request flows
3791 // through the existing single-key path unchanged.
3792 let resolved_version_id: Option<String> = match self.versioning.as_ref() {
3793 Some(mgr)
3794 if mgr.state(&get_bucket) != crate::versioning::VersioningState::Unversioned =>
3795 {
3796 let req_vid = req.input.version_id.take();
3797 let entry = match req_vid.as_deref() {
3798 Some(vid) => {
3799 mgr.lookup_version(&get_bucket, &get_key, vid)
3800 .ok_or_else(|| {
3801 S3Error::with_message(
3802 S3ErrorCode::NoSuchVersion,
3803 format!("no such version: {vid}"),
3804 )
3805 })?
3806 }
3807 None => mgr.lookup_latest(&get_bucket, &get_key).ok_or_else(|| {
3808 S3Error::with_message(
3809 S3ErrorCode::NoSuchKey,
3810 format!("no such key: {get_key}"),
3811 )
3812 })?,
3813 };
3814 if entry.is_delete_marker {
3815 // S3 spec: GET without versionId on a
3816 // delete-marker latest → 404 NoSuchKey + the
3817 // response carries `x-amz-delete-marker: true`.
3818 // GET with explicit versionId pointing at a delete
3819 // marker → 405 MethodNotAllowed; we surface
3820 // NoSuchKey here for both since s3s collapses them
3821 // into the same not-found error path.
3822 return Err(S3Error::with_message(
3823 S3ErrorCode::NoSuchKey,
3824 format!("delete marker is the current version of {get_key}"),
3825 ));
3826 }
3827 if entry.version_id != crate::versioning::NULL_VERSION_ID {
3828 req.input.key = versioned_shadow_key(&get_key, &entry.version_id);
3829 }
3830 Some(entry.version_id)
3831 }
3832 _ => None,
3833 };
3834
3835 // ====== Range GET の partial-fetch fast path (sidecar index 利用) ======
3836 // sidecar `<key>.s4index` が存在し、multipart-framed object であれば
3837 // 必要 frame だけを backend に Range GET し帯域節約する。
3838 //
3839 // v0.8.4 #73 H-2: BEFORE trusting the sidecar's frame offsets,
3840 // verify the source object hasn't been overwritten / mutated since
3841 // the sidecar was stamped. The sidecar carries the backend ETag
3842 // captured at PUT time (`source_etag`); a HEAD against the current
3843 // backend object tells us the live ETag. If they disagree we treat
3844 // the sidecar as stale and fall through to the full-GET path —
3845 // returning the wrong frames for a Range request would surface as
3846 // a CRC mismatch deeper in the stack but would also potentially
3847 // disclose unrelated frames if a hostile operator wrote the
3848 // sidecar themselves. Fail-open to "full read" is the safe default.
3849 //
3850 // Legacy v1 sidecars (no `source_etag` populated) keep the old
3851 // best-effort behaviour so existing on-disk indexes don't suddenly
3852 // start missing the partial-fetch path.
3853 if let Some(ref r) = range_request
3854 && let Some(index) = self.read_sidecar(&req.input.bucket, &req.input.key).await
3855 && self
3856 .sidecar_version_binding_ok(&req.input.bucket, &req.input.key, &index)
3857 .await
3858 {
3859 let total = index.total_original_size();
3860 let (start, end_exclusive) = match resolve_range(r, total) {
3861 Ok(v) => v,
3862 Err(e) => {
3863 return Err(S3Error::with_message(S3ErrorCode::InvalidRange, e));
3864 }
3865 };
3866 if let Some(plan) = index.lookup_range(start, end_exclusive) {
3867 // v0.9 #106: v3 sidecar with an SSE chunked binding →
3868 // encrypted partial-fetch fast-path. SSE-S4 chunked
3869 // (S4E6) is the only scope-in encryption mode; for
3870 // every other case (v1 / v2 sidecar) we fall through
3871 // to the existing pre-encrypt `partial_range_get`.
3872 // SSE-KMS / SSE-C / S4E2 buffered never get a
3873 // sidecar emitted (see PUT path `sidecar_index`
3874 // condition), so they trivially take the existing
3875 // buffered fallback further down.
3876 //
3877 // Codex P2 (round 2): when the sidecar HAS an SSE
3878 // binding but `encrypted_lookup` returns `None` (=
3879 // stale / corrupted chunk geometry, or a Range that
3880 // falls outside the declared `enc_plaintext_len`),
3881 // we must NOT fall through to `partial_range_get`
3882 // — that would slice the S4E6 ciphertext at
3883 // pre-encrypt offsets and either 500 or return
3884 // garbage. Skip the fast-path entirely so the
3885 // buffered fallback below decrypts + frame-parses
3886 // correctly.
3887 if let Some(sse) = index.sse_v3.as_ref() {
3888 if let Some(enc_plan) = index.encrypted_lookup(&plan) {
3889 return self
3890 .partial_range_get_encrypted(
3891 &req,
3892 plan,
3893 enc_plan,
3894 *sse,
3895 start,
3896 end_exclusive,
3897 total,
3898 get_start,
3899 )
3900 .await;
3901 }
3902 // Encrypted body + binding present but
3903 // `encrypted_lookup` refused (= sidecar /
3904 // body mismatch). Fall through to the buffered
3905 // full-GET below — safer than slicing
3906 // ciphertext with pre-encrypt offsets.
3907 //
3908 // Data-flow note: `req.input.range` was
3909 // already `.take()`-ed into `range_request` at
3910 // L3695, so the subsequent
3911 // `self.backend.get_object(req)` carries no
3912 // Range header (= full body fetch). The local
3913 // `range_request` is then re-applied to the
3914 // *decrypted + decompressed* plaintext by the
3915 // buffered slice path further down. Without
3916 // the `.take()` above, we'd have to clear it
3917 // explicitly here or we'd slice ciphertext.
3918 } else {
3919 return self
3920 .partial_range_get(&req, plan, start, end_exclusive, total, get_start)
3921 .await;
3922 }
3923 }
3924 }
3925 let mut resp = self.backend.get_object(req).await?;
3926 // v0.5 #34: stamp the resolved version-id so the client sees a
3927 // coherent `x-amz-version-id` header (only for chains owned by
3928 // the manager — Unversioned buckets / no-manager paths never
3929 // set this).
3930 if let Some(ref vid) = resolved_version_id {
3931 resp.output.version_id = Some(vid.clone());
3932 }
3933 let is_multipart = is_multipart_object(&resp.output.metadata);
3934 let is_framed_v2 = is_framed_v2_object(&resp.output.metadata);
3935 // v0.2 #4: framed-v2 single-PUT は多 frame parse が必要なので
3936 // multipart と同じ path に流す。
3937 let needs_frame_parse = is_multipart || is_framed_v2;
3938 let manifest_opt = extract_manifest(&resp.output.metadata);
3939
3940 if !needs_frame_parse && manifest_opt.is_none() {
3941 // S4 が書いていないオブジェクトは透過 (raw bucket pre-existing object 等)
3942 debug!("S4 get_object: object lacks s4-codec metadata, returning as-is");
3943 return Ok(resp);
3944 }
3945
3946 if let Some(blob) = resp.output.body.take() {
3947 // v0.4 #21 / v0.5 #27: if the object was stored under SSE
3948 // (metadata flag `s4-encrypted: aes-256-gcm`), decrypt
3949 // before any frame parse / streaming decompress. Encrypted
3950 // bodies are opaque to the codec; this also forces the
3951 // buffered path because AES-GCM needs the full body for tag
3952 // verify. SSE-C uses the per-request customer key, SSE-S4
3953 // falls back to the configured keyring.
3954 let blob = if is_sse_encrypted(&resp.output.metadata) {
3955 let body = collect_blob(blob, self.max_body_bytes)
3956 .await
3957 .map_err(internal("collect SSE-encrypted body"))?;
3958 // v0.5 #28: peek the frame magic to route the right
3959 // decrypt path. S4E4 means SSE-KMS — unwrap the DEK
3960 // through the KMS backend (async). S4E1/E2/E3 take
3961 // the sync path (keyring or customer key).
3962 //
3963 // v0.8 #52 (S4E5) / v0.8.1 #57 (S4E6): the chunked
3964 // SSE-S4 frames take the *streaming* path — we hand
3965 // the response body a per-chunk verify-and-emit
3966 // Stream so the client sees chunk 0 plaintext after
3967 // one chunk-worth of AES-GCM verify (vs. waiting
3968 // for the whole body's tag), and the gateway no
3969 // longer needs to materialize the full plaintext
3970 // in memory before responding. SSE-C is out of
3971 // scope for the chunked path (chunked S4E3 is a
3972 // follow-up), so this branch requires the SSE-S4
3973 // keyring to be wired and `get_sse_c_material` to
3974 // be absent — otherwise we surface a clear
3975 // misconfiguration error instead of silently
3976 // falling through to the buffered chunked path.
3977 // v0.8.11 CRIT-1 fix: the chunked stream early-return is
3978 // only correct when the decrypted body IS the user's
3979 // plaintext as-stored. If the object went through the
3980 // codec (compressed) or carries S4F2 frames, returning
3981 // the decrypt stream directly hands the client
3982 // compressed / framed bytes. Restrict the early-return
3983 // to codec=Passthrough + non-framed objects; everything
3984 // else falls through to the buffered path, which
3985 // decrypt-buffers S4E5/S4E6 via
3986 // `decrypt_chunked_buffered_default` and then runs the
3987 // existing decompress pipeline.
3988 let chunked_streaming_safe = !needs_frame_parse
3989 && manifest_opt
3990 .as_ref()
3991 .map(|m| m.codec == CodecKind::Passthrough)
3992 .unwrap_or(false);
3993 if matches!(crate::sse::peek_magic(&body), Some("S4E5") | Some("S4E6"))
3994 && get_sse_c_material.is_none()
3995 && chunked_streaming_safe
3996 {
3997 let keyring_arc = self.sse_keyring.clone().ok_or_else(|| {
3998 S3Error::with_message(
3999 S3ErrorCode::InvalidRequest,
4000 "object is SSE-S4 encrypted (S4E5/S4E6) but no --sse-s4-key is configured on this gateway",
4001 )
4002 })?;
4003 let body_len = body.len() as u64;
4004 let stream = crate::sse::decrypt_chunked_stream(body, keyring_arc.as_ref());
4005 // Stream is `'static` (the keyring borrow is
4006 // consumed up front; the cipher lives inside
4007 // the stream state — see decrypt_chunked_stream
4008 // doc), so we can move it straight into a
4009 // StreamingBlob without lifetime gymnastics.
4010 use futures::StreamExt;
4011 let mapped = stream.map(|r| {
4012 r.map_err(|e| std::io::Error::other(format!("SSE-S4 chunked decrypt: {e}")))
4013 });
4014 use s3s::dto::StreamingBlob;
4015 resp.output.body = Some(StreamingBlob::wrap(mapped));
4016 // Plaintext content_length is unknown until all
4017 // chunks have been verified; null it out so the
4018 // ByteStream wrapper reports `unknown` to the
4019 // HTTP layer (which then emits chunked transfer-
4020 // encoding) rather than lying about the size.
4021 resp.output.content_length = None;
4022 // The backend's checksums + ETag describe the
4023 // encrypted body (S4E5/S4E6 wire format), not
4024 // the plaintext we're about to stream — clear them
4025 // so the AWS SDK doesn't fail the GET with a
4026 // ChecksumMismatch on a successful round-trip.
4027 // Mirrors the streaming-zstd path at L1180-1185.
4028 resp.output.checksum_crc32 = None;
4029 resp.output.checksum_crc32c = None;
4030 resp.output.checksum_crc64nvme = None;
4031 resp.output.checksum_sha1 = None;
4032 resp.output.checksum_sha256 = None;
4033 resp.output.e_tag = None;
4034 let elapsed = get_start.elapsed();
4035 crate::metrics::record_get(
4036 "sse-s4-chunked",
4037 body_len,
4038 body_len,
4039 elapsed.as_secs_f64(),
4040 true,
4041 );
4042 return Ok(resp);
4043 }
4044 let plain = match crate::sse::peek_magic(&body) {
4045 Some("S4E4") => {
4046 let kms = self.kms.as_ref().ok_or_else(|| {
4047 S3Error::with_message(
4048 S3ErrorCode::InvalidRequest,
4049 "object is SSE-KMS encrypted but no --kms-local-dir / --kms-aws-region is configured on this gateway",
4050 )
4051 })?;
4052 let kms_ref: &dyn crate::kms::KmsBackend = kms.as_ref();
4053 crate::sse::decrypt_with_kms(&body, kms_ref)
4054 .await
4055 .map_err(|e| match e {
4056 crate::sse::SseError::KmsBackend(k) => kms_error_to_s3(k),
4057 other => S3Error::with_message(
4058 S3ErrorCode::InternalError,
4059 format!("SSE-KMS decrypt failed: {other}"),
4060 ),
4061 })?
4062 }
4063 _ => {
4064 if let Some(ref m) = get_sse_c_material {
4065 crate::sse::decrypt(
4066 &body,
4067 crate::sse::SseSource::CustomerKey {
4068 key: &m.key,
4069 key_md5: &m.key_md5,
4070 },
4071 )
4072 .map_err(sse_c_error_to_s3)?
4073 } else {
4074 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
4075 S3Error::with_message(
4076 S3ErrorCode::InvalidRequest,
4077 "object is SSE-S4 encrypted but no --sse-s4-key is configured on this gateway",
4078 )
4079 })?;
4080 crate::sse::decrypt(&body, keyring).map_err(|e| {
4081 S3Error::with_message(
4082 S3ErrorCode::InternalError,
4083 format!("SSE-S4 decrypt failed: {e}"),
4084 )
4085 })?
4086 }
4087 }
4088 };
4089 // v0.5 #28: parse out the on-disk wrapped DEK's key id
4090 // so the GET response can echo `x-amz-server-side-encryption-aws-kms-key-id`.
4091 if matches!(crate::sse::peek_magic(&body), Some("S4E4"))
4092 && let Ok(hdr) = crate::sse::parse_s4e4_header(&body)
4093 {
4094 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
4095 ServerSideEncryption::AWS_KMS,
4096 ));
4097 resp.output.ssekms_key_id = Some(hdr.key_id.to_string());
4098 }
4099 bytes_to_blob(plain)
4100 } else if let Some(ref m) = get_sse_c_material {
4101 // Client sent SSE-C headers for an unencrypted object —
4102 // mirror AWS S3's 400 InvalidRequest.
4103 let _ = m;
4104 return Err(sse_c_error_to_s3(
4105 crate::sse::SseError::CustomerKeyUnexpected,
4106 ));
4107 } else {
4108 blob
4109 };
4110 // v0.5 #27: SSE-C echo on success — algorithm + key MD5
4111 // tell the client that the supplied key was the one used.
4112 if let Some(ref m) = get_sse_c_material {
4113 resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
4114 resp.output.sse_customer_key_md5 =
4115 Some(base64::engine::general_purpose::STANDARD.encode(m.key_md5));
4116 }
4117 // ====== Streaming fast path (CpuZstd, non-multipart, codec supports it) ======
4118 // 大規模 object (e.g. 5 GB) を memory に collect すると OOM するので、
4119 // codec が streaming-aware なら body を chunk-by-chunk で decompress して
4120 // 即座に client に流す。
4121 //
4122 // ただし Range request 時は streaming できない (slice するため total bytes
4123 // が必要) → buffered path に fall through。
4124 if range_request.is_none()
4125 && !needs_frame_parse
4126 && let Some(ref m) = manifest_opt
4127 && supports_streaming_decompress(m.codec)
4128 && m.codec == CodecKind::CpuZstd
4129 {
4130 // v0.8.4 #73 H-1: wrap the decompressor output in a
4131 // rolling-CRC32C verifier so a tampered ciphertext (or a
4132 // backend-side corruption that the zstd decoder happens
4133 // to "successfully" decode into wrong bytes) surfaces as
4134 // a streaming error tail at EOF instead of silently
4135 // delivering corrupt plaintext to the client. The wrap
4136 // is a pure pass-through during the body — no extra
4137 // buffering, TTFB unaffected — and the integrity
4138 // decision lands at the last chunk.
4139 let decompressed_blob = cpu_zstd_decompress_stream(blob);
4140 let verified_reader = Crc32cVerifyingReader::new(
4141 blob_to_async_read(decompressed_blob),
4142 m.crc32c,
4143 m.original_size,
4144 );
4145 let verified_blob = async_read_to_blob(verified_reader);
4146 resp.output.content_length = Some(m.original_size as i64);
4147 resp.output.checksum_crc32 = None;
4148 resp.output.checksum_crc32c = None;
4149 resp.output.checksum_crc64nvme = None;
4150 resp.output.checksum_sha1 = None;
4151 resp.output.checksum_sha256 = None;
4152 resp.output.e_tag = None;
4153 resp.output.body = Some(verified_blob);
4154 let elapsed = get_start.elapsed();
4155 crate::metrics::record_get(
4156 m.codec.as_str(),
4157 m.compressed_size,
4158 m.original_size,
4159 elapsed.as_secs_f64(),
4160 true,
4161 );
4162 info!(
4163 op = "get_object",
4164 bucket = %get_bucket,
4165 key = %get_key,
4166 codec = m.codec.as_str(),
4167 bytes_in = m.compressed_size,
4168 bytes_out = m.original_size,
4169 path = "streaming",
4170 setup_latency_ms = elapsed.as_millis() as u64,
4171 "S4 get started (streaming)"
4172 );
4173 return Ok(resp);
4174 }
4175 // Passthrough: そのまま流す (Range なしの場合のみ streaming)
4176 if range_request.is_none()
4177 && !needs_frame_parse
4178 && let Some(ref m) = manifest_opt
4179 && m.codec == CodecKind::Passthrough
4180 {
4181 resp.output.content_length = Some(m.original_size as i64);
4182 resp.output.checksum_crc32 = None;
4183 resp.output.checksum_crc32c = None;
4184 resp.output.checksum_crc64nvme = None;
4185 resp.output.checksum_sha1 = None;
4186 resp.output.checksum_sha256 = None;
4187 resp.output.e_tag = None;
4188 resp.output.body = Some(blob);
4189 debug!("S4 get_object: passthrough streaming");
4190 return Ok(resp);
4191 }
4192
4193 // ====== Buffered slow path (multipart frame parser, GPU codecs) ======
4194 let bytes = collect_blob(blob, self.max_body_bytes)
4195 .await
4196 .map_err(internal("collect get body"))?;
4197
4198 let decompressed = if needs_frame_parse {
4199 // multipart objects と framed-v2 single-PUT objects は同じ
4200 // S4F2 frame 列なので decompress_multipart で統一処理
4201 self.decompress_multipart(bytes).await?
4202 } else {
4203 let manifest = manifest_opt.as_ref().expect("non-multipart guarded above");
4204 self.registry
4205 .decompress(bytes, manifest)
4206 .await
4207 .map_err(internal("registry decompress"))?
4208 };
4209
4210 // Range request があれば slice。なければ full body を返す。
4211 let total_size = decompressed.len() as u64;
4212 let (final_bytes, status_override) = if let Some(r) = range_request.as_ref() {
4213 let (start, end) = resolve_range(r, total_size)
4214 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
4215 let sliced = decompressed.slice(start as usize..end as usize);
4216 resp.output.content_range = Some(format!(
4217 "bytes {start}-{}/{total_size}",
4218 end.saturating_sub(1)
4219 ));
4220 (sliced, Some(http::StatusCode::PARTIAL_CONTENT))
4221 } else {
4222 (decompressed, None)
4223 };
4224 // 解凍後の真のサイズを返す (S3 client は content_length を信頼するので
4225 // 圧縮 size のままだと downstream が body を途中で切ってしまう)
4226 resp.output.content_length = Some(final_bytes.len() as i64);
4227 // 圧縮済 bytes の checksum を返すと AWS SDK 側で StreamingError
4228 // (ChecksumMismatch) になる。ETag も backend が返した「圧縮済 bytes の
4229 // MD5/checksum」なので意味的にズレる — クリアして S4 自身の crc32c
4230 // (manifest 内 / frame 内) で integrity を保証する設計にする。
4231 resp.output.checksum_crc32 = None;
4232 resp.output.checksum_crc32c = None;
4233 resp.output.checksum_crc64nvme = None;
4234 resp.output.checksum_sha1 = None;
4235 resp.output.checksum_sha256 = None;
4236 resp.output.e_tag = None;
4237 let returned_size = final_bytes.len() as u64;
4238 let codec_label = manifest_opt
4239 .as_ref()
4240 .map(|m| m.codec.as_str())
4241 .unwrap_or("multipart");
4242 resp.output.body = Some(bytes_to_blob(final_bytes));
4243 if let Some(status) = status_override {
4244 resp.status = Some(status);
4245 }
4246 let elapsed = get_start.elapsed();
4247 crate::metrics::record_get(codec_label, 0, returned_size, elapsed.as_secs_f64(), true);
4248 info!(
4249 op = "get_object",
4250 bucket = %get_bucket,
4251 key = %get_key,
4252 codec = codec_label,
4253 bytes_out = returned_size,
4254 total_object_size = total_size,
4255 range = range_request.is_some(),
4256 path = "buffered",
4257 latency_ms = elapsed.as_millis() as u64,
4258 "S4 get completed (buffered)"
4259 );
4260 }
4261 // v0.6 #40: echo the recorded `x-amz-replication-status` so
4262 // consumers can poll progress (PENDING / COMPLETED / FAILED).
4263 if let Some(mgr) = self.replication.as_ref()
4264 && let Some(status) = mgr.lookup_status(&get_bucket, &get_key)
4265 {
4266 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
4267 status.as_aws_str().to_owned(),
4268 ));
4269 }
4270 Ok(resp)
4271 }
4272
4273 // === passthrough delegations ===
4274 async fn head_bucket(
4275 &self,
4276 req: S3Request<HeadBucketInput>,
4277 ) -> S3Result<S3Response<HeadBucketOutput>> {
4278 self.backend.head_bucket(req).await
4279 }
4280 async fn list_buckets(
4281 &self,
4282 req: S3Request<ListBucketsInput>,
4283 ) -> S3Result<S3Response<ListBucketsOutput>> {
4284 self.backend.list_buckets(req).await
4285 }
4286 async fn create_bucket(
4287 &self,
4288 req: S3Request<CreateBucketInput>,
4289 ) -> S3Result<S3Response<CreateBucketOutput>> {
4290 self.backend.create_bucket(req).await
4291 }
4292 async fn delete_bucket(
4293 &self,
4294 req: S3Request<DeleteBucketInput>,
4295 ) -> S3Result<S3Response<DeleteBucketOutput>> {
4296 self.backend.delete_bucket(req).await
4297 }
4298 async fn head_object(
4299 &self,
4300 req: S3Request<HeadObjectInput>,
4301 ) -> S3Result<S3Response<HeadObjectOutput>> {
4302 // v0.6 #40: capture bucket/key before req is consumed so the
4303 // replication-status echo can look the entry up.
4304 let head_bucket = req.input.bucket.clone();
4305 let head_key = req.input.key.clone();
4306 // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
4307 self.check_not_reserved_key(&head_key, ReservedKeyMode::Read)?;
4308 let mut resp = self.backend.head_object(req).await?;
4309 if let Some(manifest) = extract_manifest(&resp.output.metadata) {
4310 // 客側には decompress 後の意味のある content_length / checksum を返す。
4311 // backend が返す圧縮済 bytes の checksum / e_tag は意味が違うため除去
4312 // (S4 は manifest 内の crc32c で integrity を担保する)。
4313 resp.output.content_length = Some(manifest.original_size as i64);
4314 resp.output.checksum_crc32 = None;
4315 resp.output.checksum_crc32c = None;
4316 resp.output.checksum_crc64nvme = None;
4317 resp.output.checksum_sha1 = None;
4318 resp.output.checksum_sha256 = None;
4319 resp.output.e_tag = None;
4320 }
4321 // v0.6 #40: echo `x-amz-replication-status` (PENDING / COMPLETED
4322 // / FAILED) so consumers can poll progress without a GET.
4323 if let Some(mgr) = self.replication.as_ref()
4324 && let Some(status) = mgr.lookup_status(&head_bucket, &head_key)
4325 {
4326 resp.output.replication_status = Some(s3s::dto::ReplicationStatus::from(
4327 status.as_aws_str().to_owned(),
4328 ));
4329 }
4330 // v0.7 #48 BUG-4 fix: HEAD must echo SSE indicators so SDKs
4331 // and pipelines see the same posture they got on PUT. The PUT
4332 // path stamps `s4-sse-type` metadata for exactly this — HEAD
4333 // doesn't fetch the body, so it can't peek frame magic.
4334 if let Some(meta) = resp.output.metadata.as_ref()
4335 && let Some(sse_type) = meta.get("s4-sse-type")
4336 {
4337 {
4338 match sse_type.as_str() {
4339 "aws:kms" => {
4340 resp.output.server_side_encryption = Some(
4341 ServerSideEncryption::from_static(ServerSideEncryption::AWS_KMS),
4342 );
4343 if let Some(key_id) = meta.get("s4-sse-kms-key-id") {
4344 resp.output.ssekms_key_id = Some(key_id.clone());
4345 }
4346 }
4347 _ => {
4348 resp.output.server_side_encryption = Some(
4349 ServerSideEncryption::from_static(ServerSideEncryption::AES256),
4350 );
4351 if let Some(md5) = meta.get("s4-sse-c-key-md5") {
4352 resp.output.sse_customer_algorithm =
4353 Some(crate::sse::SSE_C_ALGORITHM.into());
4354 resp.output.sse_customer_key_md5 = Some(md5.clone());
4355 }
4356 }
4357 }
4358 }
4359 }
4360 Ok(resp)
4361 }
4362 async fn delete_object(
4363 &self,
4364 mut req: S3Request<DeleteObjectInput>,
4365 ) -> S3Result<S3Response<DeleteObjectOutput>> {
4366 let bucket = req.input.bucket.clone();
4367 let key = req.input.key.clone();
4368 // v0.8.16 F-13 / v0.8.17 G-2: shared reserved-name guard.
4369 // The S4 internal sidecar cleanup path
4370 // (`write_sidecar` and friends) talks to
4371 // `self.backend.delete_object(...)` directly, NOT through
4372 // this trait method, so the guard doesn't break
4373 // legitimate sidecar cleanup.
4374 self.check_not_reserved_key(&key, ReservedKeyMode::Mutating)?;
4375 self.enforce_rate_limit(&req, &bucket)?;
4376 self.enforce_policy(&req, "s3:DeleteObject", &bucket, Some(&key))?;
4377 // v0.6 #42: MFA Delete enforcement. When the bucket has
4378 // MFA-Delete = Enabled, every DELETE / DELETE-version /
4379 // delete-marker form needs `x-amz-mfa: <serial> <code>` (RFC 6238
4380 // 6-digit TOTP). Runs *before* the WORM / versioning routers so
4381 // a missing token is denied for free regardless of which delete
4382 // path the request would otherwise take.
4383 if let Some(mgr) = self.mfa_delete.as_ref()
4384 && mgr.is_enabled(&bucket)
4385 {
4386 let header = req.input.mfa.as_deref();
4387 if let Err(e) = crate::mfa::check_mfa(&bucket, header, mgr, current_unix_secs()) {
4388 crate::metrics::record_mfa_delete_denial(&bucket);
4389 return Err(mfa_error_to_s3(e));
4390 }
4391 }
4392 // v0.5 #30: refuse the delete while a WORM lock is in effect.
4393 // Compliance can never be bypassed; Governance can be overridden
4394 // via `x-amz-bypass-governance-retention: true`; legal hold
4395 // never. The check happens before the versioning router so a
4396 // locked object can't be soft-deleted (delete-marker push) on an
4397 // Enabled bucket either — S3 spec says lock applies to all
4398 // delete forms.
4399 if let Some(mgr) = self.object_lock.as_ref()
4400 && let Some(state) = mgr.get(&bucket, &key)
4401 {
4402 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
4403 // v0.8.12 HIGH-7 fix: the bypass header alone used to be
4404 // enough to override Governance retention. AWS spec
4405 // requires the caller hold `s3:BypassGovernanceRetention`
4406 // for the target ARN; without that, the header is
4407 // silently ignored (not an error — it lines up with how
4408 // AWS' canonical behaviour treats unprivileged callers).
4409 let bypass_allowed = if bypass_header {
4410 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
4411 .is_ok()
4412 } else {
4413 false
4414 };
4415 let now = chrono::Utc::now();
4416 if !state.can_delete(now, bypass_allowed) {
4417 crate::metrics::record_policy_denial("s3:DeleteObject", &bucket);
4418 return Err(S3Error::with_message(
4419 S3ErrorCode::AccessDenied,
4420 "Access Denied because object protected by object lock",
4421 ));
4422 }
4423 }
4424 // v0.5 #34: route DELETE through the VersioningManager when the
4425 // bucket is in a versioning-aware state.
4426 //
4427 // - Enabled bucket, no version_id → push a delete marker into
4428 // the chain. NO backend object is touched (older versions
4429 // stay reachable via specific-version GET).
4430 // - Enabled / Suspended bucket, with version_id → physical
4431 // delete. Backend bytes at the shadow key (or `<key>` for
4432 // `null`) are removed; chain entry is dropped. If the deleted
4433 // entry was a delete marker, no backend bytes exist for it
4434 // (record-only).
4435 // - Suspended bucket, no version_id → push a "null" delete
4436 // marker (S3 spec); backend bytes at `<key>` are physically
4437 // removed (same as legacy).
4438 // - Unversioned bucket → fall through to legacy passthrough.
4439 if let Some(mgr) = self.versioning.as_ref() {
4440 let state = mgr.state(&bucket);
4441 if state != crate::versioning::VersioningState::Unversioned {
4442 let req_vid = req.input.version_id.take();
4443 if let Some(vid) = req_vid {
4444 // Specific-version DELETE: touch backend bytes only
4445 // when the entry was a real version (not a delete
4446 // marker, which has no backend bytes).
4447 let outcome = mgr.record_delete_specific(&bucket, &key, &vid);
4448 let backend_target = if vid == crate::versioning::NULL_VERSION_ID {
4449 key.clone()
4450 } else {
4451 versioned_shadow_key(&key, &vid)
4452 };
4453 let was_real_version = outcome
4454 .as_ref()
4455 .map(|o| !o.is_delete_marker)
4456 .unwrap_or(false);
4457 if was_real_version {
4458 // Best-effort backend cleanup; missing bytes
4459 // are not an error (e.g. shadow key already
4460 // GC'd).
4461 let backend_input = DeleteObjectInput {
4462 bucket: bucket.clone(),
4463 key: backend_target,
4464 ..Default::default()
4465 };
4466 let backend_req = S3Request {
4467 input: backend_input,
4468 method: http::Method::DELETE,
4469 uri: req.uri.clone(),
4470 headers: req.headers.clone(),
4471 extensions: http::Extensions::new(),
4472 credentials: req.credentials.clone(),
4473 region: req.region.clone(),
4474 service: req.service.clone(),
4475 trailing_headers: None,
4476 };
4477 let _ = self.backend.delete_object(backend_req).await;
4478 }
4479 let mut output = DeleteObjectOutput {
4480 version_id: Some(vid.clone()),
4481 ..Default::default()
4482 };
4483 if let Some(o) = outcome.as_ref()
4484 && o.is_delete_marker
4485 {
4486 output.delete_marker = Some(true);
4487 }
4488 // v0.6 #35: specific-version DELETE always counts as
4489 // a hard `ObjectRemoved:Delete` event (the chain
4490 // entry, marker or not, is gone after this call).
4491 self.fire_delete_notification(
4492 &bucket,
4493 &key,
4494 crate::notifications::EventType::ObjectRemovedDelete,
4495 Some(vid.clone()),
4496 );
4497 return Ok(S3Response::new(output));
4498 }
4499 // No version_id: record a delete marker (state-aware).
4500 let outcome = mgr.record_delete(&bucket, &key);
4501 if state == crate::versioning::VersioningState::Suspended {
4502 // Suspended buckets also evict the prior `<key>`
4503 // bytes (the previous null version is gone too).
4504 let backend_input = DeleteObjectInput {
4505 bucket: bucket.clone(),
4506 key: key.clone(),
4507 ..Default::default()
4508 };
4509 let backend_req = S3Request {
4510 input: backend_input,
4511 method: http::Method::DELETE,
4512 uri: req.uri.clone(),
4513 headers: req.headers.clone(),
4514 extensions: http::Extensions::new(),
4515 credentials: req.credentials.clone(),
4516 region: req.region.clone(),
4517 service: req.service.clone(),
4518 trailing_headers: None,
4519 };
4520 let _ = self.backend.delete_object(backend_req).await;
4521 }
4522 let output = DeleteObjectOutput {
4523 delete_marker: Some(true),
4524 version_id: outcome.version_id.clone(),
4525 ..Default::default()
4526 };
4527 // v0.6 #35: versioned bucket DELETE without a version-id
4528 // creates a delete marker — the dedicated AWS event
4529 // taxonomy entry. Suspended-state buckets also push a
4530 // (null) marker, so the same event fires there.
4531 self.fire_delete_notification(
4532 &bucket,
4533 &key,
4534 crate::notifications::EventType::ObjectRemovedDeleteMarker,
4535 outcome.version_id,
4536 );
4537 return Ok(S3Response::new(output));
4538 }
4539 }
4540 // Legacy / Unversioned path: physical delete on the backend +
4541 // best-effort sidecar cleanup (mirrors v0.4 behaviour).
4542 let resp = self.backend.delete_object(req).await?;
4543 // v0.5 #30: drop any per-object lock state once the delete has
4544 // succeeded so the freed key can be re-armed by a future PUT
4545 // under the bucket default. Reaching here implies the lock had
4546 // already passed `can_delete` above, so this is purely cleanup.
4547 if let Some(mgr) = self.object_lock.as_ref() {
4548 mgr.clear(&bucket, &key);
4549 }
4550 // v0.6 #39: drop any object-level tag set on physical delete —
4551 // the freed key starts a fresh tag history if a future PUT
4552 // re-creates it. (Versioned-delete branches above return early
4553 // and do NOT touch tags, mirroring AWS where tag state is
4554 // attached to the logical key, not the version chain.)
4555 if let Some(mgr) = self.tagging.as_ref() {
4556 mgr.delete_object_tags(&bucket, &key);
4557 }
4558 let sidecar = sidecar_key(&key);
4559 // v0.7 #49: skip the sidecar DELETE if the key + sidecar suffix
4560 // can't be encoded into a request URI — the primary delete
4561 // already succeeded and a stale sidecar is harmless (Range GET
4562 // re-validates the underlying object on next read).
4563 if let Ok(uri) = safe_object_uri(&bucket, &sidecar) {
4564 let sidecar_input = DeleteObjectInput {
4565 bucket: bucket.clone(),
4566 key: sidecar,
4567 ..Default::default()
4568 };
4569 let sidecar_req = S3Request {
4570 input: sidecar_input,
4571 method: http::Method::DELETE,
4572 uri,
4573 headers: http::HeaderMap::new(),
4574 extensions: http::Extensions::new(),
4575 credentials: None,
4576 region: None,
4577 service: None,
4578 trailing_headers: None,
4579 };
4580 let _ = self.backend.delete_object(sidecar_req).await;
4581 }
4582 // v0.6 #35: legacy unversioned-bucket hard delete fires the
4583 // canonical `ObjectRemoved:Delete` event.
4584 self.fire_delete_notification(
4585 &bucket,
4586 &key,
4587 crate::notifications::EventType::ObjectRemovedDelete,
4588 None,
4589 );
4590 Ok(resp)
4591 }
4592 async fn delete_objects(
4593 &self,
4594 req: S3Request<DeleteObjectsInput>,
4595 ) -> S3Result<S3Response<DeleteObjectsOutput>> {
4596 // v0.6 #42: MFA Delete applies once to the whole batch (S3 spec:
4597 // when MFA-Delete is on the bucket, a missing / invalid token
4598 // fails the entire DeleteObjects request, not per-object).
4599 if let Some(mgr) = self.mfa_delete.as_ref()
4600 && mgr.is_enabled(&req.input.bucket)
4601 {
4602 let header = req.input.mfa.as_deref();
4603 if let Err(e) =
4604 crate::mfa::check_mfa(&req.input.bucket, header, mgr, current_unix_secs())
4605 {
4606 crate::metrics::record_mfa_delete_denial(&req.input.bucket);
4607 return Err(mfa_error_to_s3(e));
4608 }
4609 }
4610 // v0.8.11 CRIT-3 fix: route every entry through the gated
4611 // per-object `delete_object` path so Object Lock, IAM policy,
4612 // versioning, tagging, sidecar cleanup and notification fan-
4613 // out all fire for batch DELETE. The previous
4614 // `self.backend.delete_objects(req).await` straight-through
4615 // bypassed every gate, so a `legal_hold=on` key listed inside
4616 // a DeleteObjects XML was happily removed.
4617 //
4618 // S3 spec note: DeleteObjects is "best-effort per object" —
4619 // a failure on one key surfaces as an `Errors` entry without
4620 // aborting the rest of the batch. Quiet-mode suppresses the
4621 // `Deleted` list (errors are still reported). We honour both.
4622 let bucket = req.input.bucket.clone();
4623 let bypass_governance = req.input.bypass_governance_retention.unwrap_or(false);
4624 let mfa_header = req.input.mfa.clone();
4625 let quiet = req.input.delete.quiet.unwrap_or(false);
4626 let mut deleted: Vec<DeletedObject> = Vec::new();
4627 let mut errors: Vec<s3s::dto::Error> = Vec::new();
4628 for ident in req.input.delete.objects.iter() {
4629 let key = ident.key.clone();
4630 let version_id = ident.version_id.clone();
4631 let per_input = DeleteObjectInput {
4632 bucket: bucket.clone(),
4633 key: key.clone(),
4634 version_id: version_id.clone(),
4635 bypass_governance_retention: Some(bypass_governance),
4636 mfa: mfa_header.clone(),
4637 ..Default::default()
4638 };
4639 let per_uri = match safe_object_uri(&bucket, &key) {
4640 Ok(u) => u,
4641 Err(_) => {
4642 errors.push(s3s::dto::Error {
4643 code: Some("InvalidArgument".to_owned()),
4644 key: Some(key),
4645 message: Some("object key is not URI-encodable".to_owned()),
4646 version_id,
4647 });
4648 continue;
4649 }
4650 };
4651 let per_req = S3Request {
4652 input: per_input,
4653 method: http::Method::DELETE,
4654 uri: per_uri,
4655 headers: req.headers.clone(),
4656 extensions: http::Extensions::new(),
4657 credentials: req.credentials.clone(),
4658 region: req.region.clone(),
4659 service: req.service.clone(),
4660 trailing_headers: None,
4661 };
4662 match self.delete_object(per_req).await {
4663 Ok(resp) => {
4664 let out = resp.output;
4665 // DeleteObjectOutput doesn't surface a separate
4666 // `delete_marker_version_id`; the marker's version
4667 // id is whatever `version_id` carries (when the
4668 // versioning manager pushed a delete-marker, that
4669 // field already holds the marker's vid).
4670 let vid = out.version_id.clone().or(version_id);
4671 deleted.push(DeletedObject {
4672 key: Some(key),
4673 version_id: vid.clone(),
4674 delete_marker: out.delete_marker,
4675 delete_marker_version_id: vid,
4676 });
4677 }
4678 Err(e) => {
4679 let code_str = e.code().as_str().to_owned();
4680 let msg = e.message().unwrap_or(code_str.as_str()).to_owned();
4681 errors.push(s3s::dto::Error {
4682 code: Some(code_str),
4683 key: Some(key),
4684 message: Some(msg),
4685 version_id,
4686 });
4687 }
4688 }
4689 }
4690 let output = DeleteObjectsOutput {
4691 deleted: if quiet || deleted.is_empty() {
4692 None
4693 } else {
4694 Some(deleted)
4695 },
4696 errors: if errors.is_empty() {
4697 None
4698 } else {
4699 Some(errors)
4700 },
4701 ..Default::default()
4702 };
4703 Ok(S3Response::new(output))
4704 }
4705 async fn copy_object(
4706 &self,
4707 mut req: S3Request<CopyObjectInput>,
4708 ) -> S3Result<S3Response<CopyObjectOutput>> {
4709 // copy is conceptually "GetObject src + PutObject dst" — enforce both.
4710 let dst_bucket = req.input.bucket.clone();
4711 let dst_key = req.input.key.clone();
4712 // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
4713 self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
4714 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
4715 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4716 // v0.8.17 G-2: source `<key>.s4index` would let
4717 // CopyObject expose the raw sidecar (frame layout +
4718 // source ETag) into a writable destination, bypassing
4719 // the F-13 GET reject. Same guard, Read mode (returns
4720 // NoSuchKey to match listing semantics).
4721 self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
4722 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
4723 }
4724 // S4-aware copy: source object に s4-* metadata がある場合、それを
4725 // destination に確実に preserve する。
4726 //
4727 // - MetadataDirective::COPY (default): backend が source metadata を
4728 // そのまま copy するので S4 metadata も自動で渡る。介入不要
4729 // - MetadataDirective::REPLACE: 客が指定した metadata で source を
4730 // 上書き → s4-* metadata が消えると destination は decompress 不能に
4731 // なる (silent corruption)。S4 が source metadata を HEAD で取得し、
4732 // s4-* fields を input.metadata に強制 merge する
4733 let needs_merge = req
4734 .input
4735 .metadata_directive
4736 .as_ref()
4737 .map(|d| d.as_str() == MetadataDirective::REPLACE)
4738 .unwrap_or(false);
4739 if needs_merge && let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
4740 // v0.8.16 F-8: strip the client-supplied `s4-*` keys
4741 // *unconditionally* — the v0.8.15 M-2 fix only ran the
4742 // strip inside the `if let Ok(head) = ...` block, so a
4743 // backend HEAD failure (transient 5xx, NoSuchKey on a
4744 // racing delete) left attacker-injected `s4-*` /
4745 // `S4-*` metadata intact on the destination. Now we
4746 // strip first, then re-populate from the source HEAD
4747 // when available — HEAD failure simply means the
4748 // destination loses the codec markers (correct: a
4749 // CopyObject without the source's codec metadata
4750 // produces an unreadable object, but doesn't allow
4751 // injection).
4752 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4753 dest_meta.retain(|k, _| !k.to_ascii_lowercase().starts_with("s4-"));
4754 let head_input = HeadObjectInput {
4755 bucket: bucket.to_string(),
4756 key: key.to_string(),
4757 ..Default::default()
4758 };
4759 let head_req = S3Request {
4760 input: head_input,
4761 method: req.method.clone(),
4762 uri: req.uri.clone(),
4763 headers: req.headers.clone(),
4764 extensions: http::Extensions::new(),
4765 credentials: req.credentials.clone(),
4766 region: req.region.clone(),
4767 service: req.service.clone(),
4768 trailing_headers: None,
4769 };
4770 if let Ok(head) = self.backend.head_object(head_req).await
4771 && let Some(src_meta) = head.output.metadata.as_ref()
4772 {
4773 let dest_meta = req.input.metadata.get_or_insert_with(Default::default);
4774 for key in [
4775 META_CODEC,
4776 META_ORIGINAL_SIZE,
4777 META_COMPRESSED_SIZE,
4778 META_CRC32C,
4779 META_MULTIPART,
4780 META_FRAMED,
4781 ] {
4782 if let Some(v) = src_meta.get(key) {
4783 dest_meta.insert(key.to_string(), v.clone());
4784 }
4785 }
4786 // SSE markers are equally reserved — propagate any
4787 // source flags so a copy of an encrypted object stays
4788 // marked as encrypted at the destination.
4789 for sse_key in [
4790 "s4-encrypted",
4791 "s4-sse-type",
4792 "s4-sse-c-key-md5",
4793 "s4-sse-kms-key-id",
4794 ] {
4795 if let Some(v) = src_meta.get(sse_key) {
4796 dest_meta.insert(sse_key.to_string(), v.clone());
4797 }
4798 }
4799 debug!(
4800 src_bucket = %bucket,
4801 src_key = %key,
4802 "S4 copy_object: replaced client s4-* metadata with source values across REPLACE directive (v0.8.15 M-2)"
4803 );
4804 }
4805 }
4806 self.backend.copy_object(req).await
4807 }
4808 async fn list_objects(
4809 &self,
4810 req: S3Request<ListObjectsInput>,
4811 ) -> S3Result<S3Response<ListObjectsOutput>> {
4812 self.enforce_rate_limit(&req, &req.input.bucket)?;
4813 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4814 let mut resp = self.backend.list_objects(req).await?;
4815 // S4 内部 object (`*.s4index` sidecar、`.__s4ver__/` shadow versions
4816 // — v0.5 #34) を顧客から隠す。
4817 if let Some(contents) = resp.output.contents.as_mut() {
4818 contents.retain(|o| {
4819 o.key
4820 .as_ref()
4821 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4822 .unwrap_or(true)
4823 });
4824 }
4825 Ok(resp)
4826 }
4827 async fn list_objects_v2(
4828 &self,
4829 req: S3Request<ListObjectsV2Input>,
4830 ) -> S3Result<S3Response<ListObjectsV2Output>> {
4831 self.enforce_rate_limit(&req, &req.input.bucket)?;
4832 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4833 let mut resp = self.backend.list_objects_v2(req).await?;
4834 if let Some(contents) = resp.output.contents.as_mut() {
4835 let before = contents.len();
4836 contents.retain(|o| {
4837 o.key
4838 .as_ref()
4839 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4840 .unwrap_or(true)
4841 });
4842 // key_count も補正 (S3 spec compliance)
4843 if let Some(kc) = resp.output.key_count.as_mut() {
4844 *kc -= (before - contents.len()) as i32;
4845 }
4846 }
4847 Ok(resp)
4848 }
4849 /// v0.4 #17: filter S4-internal sidecars from versioned listings.
4850 /// v0.5 #34: when a [`crate::versioning::VersioningManager`] is
4851 /// attached AND the bucket is in a versioning-aware state, build
4852 /// the `Versions` / `DeleteMarkers` arrays directly from the
4853 /// in-memory chain (paginated + ordered the S3 way: key asc,
4854 /// version newest-first inside each key). Otherwise fall back to
4855 /// passthrough + sidecar-filter (legacy v0.4 behaviour).
4856 async fn list_object_versions(
4857 &self,
4858 req: S3Request<ListObjectVersionsInput>,
4859 ) -> S3Result<S3Response<ListObjectVersionsOutput>> {
4860 self.enforce_rate_limit(&req, &req.input.bucket)?;
4861 self.enforce_policy(&req, "s3:ListBucket", &req.input.bucket, None)?;
4862 // v0.5 #34: VersioningManager-owned path.
4863 if let Some(mgr) = self.versioning.as_ref()
4864 && mgr.state(&req.input.bucket) != crate::versioning::VersioningState::Unversioned
4865 {
4866 let max_keys = req.input.max_keys.unwrap_or(1000) as usize;
4867 let page = mgr.list_versions(
4868 &req.input.bucket,
4869 req.input.prefix.as_deref(),
4870 req.input.key_marker.as_deref(),
4871 req.input.version_id_marker.as_deref(),
4872 max_keys,
4873 );
4874 let versions: Vec<ObjectVersion> = page
4875 .versions
4876 .into_iter()
4877 .map(|e| ObjectVersion {
4878 key: Some(e.key),
4879 version_id: Some(e.version_id),
4880 is_latest: Some(e.is_latest),
4881 e_tag: Some(ETag::Strong(e.etag)),
4882 size: Some(e.size as i64),
4883 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4884 ..Default::default()
4885 })
4886 .collect();
4887 let delete_markers: Vec<DeleteMarkerEntry> = page
4888 .delete_markers
4889 .into_iter()
4890 .map(|e| DeleteMarkerEntry {
4891 key: Some(e.key),
4892 version_id: Some(e.version_id),
4893 is_latest: Some(e.is_latest),
4894 last_modified: Some(std::time::SystemTime::from(e.last_modified).into()),
4895 ..Default::default()
4896 })
4897 .collect();
4898 let output = ListObjectVersionsOutput {
4899 name: Some(req.input.bucket.clone()),
4900 prefix: req.input.prefix.clone(),
4901 key_marker: req.input.key_marker.clone(),
4902 version_id_marker: req.input.version_id_marker.clone(),
4903 max_keys: req.input.max_keys,
4904 versions: if versions.is_empty() {
4905 None
4906 } else {
4907 Some(versions)
4908 },
4909 delete_markers: if delete_markers.is_empty() {
4910 None
4911 } else {
4912 Some(delete_markers)
4913 },
4914 is_truncated: Some(page.is_truncated),
4915 next_key_marker: page.next_key_marker,
4916 next_version_id_marker: page.next_version_id_marker,
4917 ..Default::default()
4918 };
4919 return Ok(S3Response::new(output));
4920 }
4921 // Legacy passthrough path (v0.4 #17 sidecar filter retained).
4922 let mut resp = self.backend.list_object_versions(req).await?;
4923 if let Some(versions) = resp.output.versions.as_mut() {
4924 versions.retain(|v| {
4925 v.key
4926 .as_ref()
4927 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4928 .unwrap_or(true)
4929 });
4930 }
4931 if let Some(markers) = resp.output.delete_markers.as_mut() {
4932 markers.retain(|m| {
4933 m.key
4934 .as_ref()
4935 .map(|k| !k.ends_with(".s4index") && !is_versioning_shadow_key(k))
4936 .unwrap_or(true)
4937 });
4938 }
4939 Ok(resp)
4940 }
4941
/// `CreateMultipartUpload` handler.
///
/// Steps, in order:
/// 1. Reserved-key guard, `s3:PutObject` policy check, rate limit.
/// 2. Stamp the multipart flag and the registry's default codec into the
///    object metadata.
/// 3. Take the SSE request fields off the input (so they are not forwarded
///    to the backend) and derive the upload's SSE mode from them.
/// 4. Parse the `Tagging` header and the Object Lock fields.
/// 5. Forward to the backend; when it returns an `upload_id`, store the
///    captured recipe in `multipart_state` under that id.
/// 6. Echo the SSE-C / SSE-KMS response fields.
///
/// # Errors
/// - `InvalidArgument` when SSE-C and SSE-KMS are both requested, or the
///   `Tagging` header fails to parse.
/// - `InvalidRequest` when SSE-KMS is requested but `self.kms` is `None`.
/// - Errors from the guards, from `extract_sse_c_material`, or from the
///   backend call.
async fn create_multipart_upload(
    &self,
    mut req: S3Request<CreateMultipartUploadInput>,
) -> S3Result<S3Response<CreateMultipartUploadOutput>> {
    // v0.8.12 HIGH-9 fix: gate multipart Create on `s3:PutObject` —
    // the destination is conceptually about to host a new object,
    // matching what `put_object` enforces. Without this, a bucket
    // policy denying `s3:PutObject` was bypassable simply by switching
    // the client to the multipart wire path.
    let mp_bucket = req.input.bucket.clone();
    let mp_key = req.input.key.clone();
    // v0.8.15 M-1 / v0.8.17 G-2: shared reserved-name guard.
    self.check_not_reserved_key(&mp_key, ReservedKeyMode::Mutating)?;
    self.enforce_policy(&req, "s3:PutObject", &mp_bucket, Some(&mp_key))?;
    self.enforce_rate_limit(&req, &mp_bucket)?;
    // Multipart objects are written with per-part compression in the frame
    // format. A flag is set in the object metadata so that the GET path
    // starts frame parsing. The codec recorded here is the registry's
    // default kind (per-part codec metadata is Phase 2).
    let codec_kind = self.registry.default_kind();
    let meta = req.input.metadata.get_or_insert_with(Default::default);
    meta.insert(META_MULTIPART.into(), "true".into());
    meta.insert(META_CODEC.into(), codec_kind.as_str().into());
    // v0.8 #54 BUG-10 fix: take() the SSE request fields off
    // `req.input` so they are NOT forwarded to the backend on
    // CreateMultipartUpload. Same root cause as v0.7 #48 BUG-2/3 on
    // single-PUT — MinIO rejects SSE-C with "HTTPS required" and
    // SSE-KMS with "KMS not configured" when the headers reach it.
    // S4 owns the encrypt-then-store contract; we capture the
    // recipe in `multipart_state` here and apply it on Complete.
    let sse_c_alg = req.input.sse_customer_algorithm.take();
    let sse_c_key = req.input.sse_customer_key.take();
    let sse_c_md5 = req.input.sse_customer_key_md5.take();
    let sse_header = req.input.server_side_encryption.take();
    let sse_kms_key = req.input.ssekms_key_id.take();
    // Strip the encryption-context too — leaving it would make
    // MinIO try to validate it against a non-existent KMS key.
    let _ = req.input.ssekms_encryption_context.take();
    let sse_c_material = extract_sse_c_material(&sse_c_alg, &sse_c_key, &sse_c_md5)?;
    let kms_key_id = extract_kms_key_id(
        &sse_header,
        &sse_kms_key,
        self.kms_default_key_id.as_deref(),
    );
    // SSE-C / SSE-KMS exclusivity (mirrors the put_object check).
    if sse_c_material.is_some() && kms_key_id.is_some() {
        return Err(S3Error::with_message(
            S3ErrorCode::InvalidArgument,
            "SSE-C and SSE-KMS cannot be used together on the same multipart upload",
        ));
    }
    // Pick the SSE mode by priority: SSE-C, then SSE-KMS, then SSE-S4
    // (keyring configured), otherwise none.
    let sse_mode = if let Some(ref m) = sse_c_material {
        // v0.8.2 #62 (H-6 audit fix): wrap the customer-supplied
        // 32-byte key in `Zeroizing` so abandoned uploads (or
        // normal Complete/Abort) wipe the key bytes on drop. The
        // `key_md5` is the public fingerprint and stays as a
        // bare `[u8; 16]`.
        crate::multipart_state::MultipartSseMode::SseC {
            key: zeroize::Zeroizing::new(m.key),
            key_md5: m.key_md5,
        }
    } else if let Some(ref kid) = kms_key_id {
        // KMS pre-flight: fail at Create rather than at Complete if
        // the gateway has no KMS backend wired (mirrors the
        // put_object check).
        if self.kms.is_none() {
            return Err(S3Error::with_message(
                S3ErrorCode::InvalidRequest,
                "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
            ));
        }
        crate::multipart_state::MultipartSseMode::SseKms {
            key_id: kid.clone(),
        }
    } else if self.sse_keyring.is_some() {
        // SSE-S4: server-driven transparent encryption. Activates
        // whenever the gateway has a keyring configured AND the
        // client didn't pick a different SSE mode.
        crate::multipart_state::MultipartSseMode::SseS4
    } else {
        crate::multipart_state::MultipartSseMode::None
    };
    // v0.8 #54 BUG-9 fix: parse the Tagging header on Create. The
    // single-PUT path does this on PutObject; the multipart path
    // captures it now and commits via TagManager on Complete.
    let request_tags: Option<crate::tagging::TagSet> = req
        .input
        .tagging
        .as_deref()
        .map(crate::tagging::parse_tagging_header)
        .transpose()
        .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
    // Strip the `Tagging` field off the input so the backend
    // doesn't try to apply it (no-op on MinIO but keeps the wire
    // clean).
    let _ = req.input.tagging.take();
    // Object Lock recipe (BUG-7 — captured here, applied on Complete).
    // A lock mode or retain-until date that fails conversion becomes `None`.
    let explicit_lock_mode: Option<crate::object_lock::LockMode> = req
        .input
        .object_lock_mode
        .as_ref()
        .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
    let explicit_retain_until: Option<chrono::DateTime<chrono::Utc>> = req
        .input
        .object_lock_retain_until_date
        .as_ref()
        .and_then(timestamp_to_chrono_utc);
    // Legal hold is on only for an explicit (case-insensitive) "ON".
    let explicit_legal_hold_on: bool = req
        .input
        .object_lock_legal_hold_status
        .as_ref()
        .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
        .unwrap_or(false);
    // Owned copies for the upload context; `req` is moved into the backend
    // call below.
    let bucket = req.input.bucket.clone();
    let key = req.input.key.clone();
    debug!(
        bucket = %bucket,
        key = %key,
        codec = codec_kind.as_str(),
        sse = ?sse_mode,
        "S4 create_multipart_upload: marking object for per-part compression"
    );
    let mut resp = self.backend.create_multipart_upload(req).await?;
    // Stash the per-upload context only after the backend handed
    // us an upload_id (failed Creates leave nothing in the store).
    if let Some(upload_id) = resp.output.upload_id.as_ref() {
        self.multipart_state.put(
            upload_id,
            crate::multipart_state::MultipartUploadContext {
                bucket,
                key,
                sse: sse_mode.clone(),
                tags: request_tags,
                object_lock_mode: explicit_lock_mode,
                object_lock_retain_until: explicit_retain_until,
                object_lock_legal_hold: explicit_legal_hold_on,
            },
        );
    }
    // SSE-C / SSE-KMS response echo (mirrors put_object). The other modes
    // add nothing to the response.
    match &sse_mode {
        crate::multipart_state::MultipartSseMode::SseC { key_md5, .. } => {
            resp.output.sse_customer_algorithm = Some(crate::sse::SSE_C_ALGORITHM.into());
            resp.output.sse_customer_key_md5 =
                Some(base64::engine::general_purpose::STANDARD.encode(key_md5));
        }
        crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
            resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
                ServerSideEncryption::AWS_KMS,
            ));
            resp.output.ssekms_key_id = Some(key_id.clone());
        }
        _ => {}
    }
    Ok(resp)
}
5097
/// `UploadPart` handler.
///
/// Steps, in order:
/// 1. `s3:PutObject` policy check and rate limit.
/// 2. Look up the upload context stored by `create_multipart_upload` and
///    validate / strip the SSE-C request headers against it.
/// 3. If the request has a body: collect it, verify the client-supplied
///    checksums, pick a codec, compress, wrap the result in a frame, pad it
///    to the multipart minimum unless the part looks like the final one,
///    and replace the request body (clearing the now-stale length and
///    checksum fields).
/// 4. Forward the request to the backend.
///
/// # Errors
/// - `InvalidArgument` when the part's SSE-C key differs from the key
///   captured on Create.
/// - `InvalidRequest` when SSE-C headers are missing or partial on an SSE-C
///   upload, or present on a non-SSE-C upload.
/// - Errors from the policy / rate-limit gates, SSE-C header parsing, body
///   collection, checksum verification, compression, or the backend call.
async fn upload_part(
    &self,
    mut req: S3Request<UploadPartInput>,
) -> S3Result<S3Response<UploadPartOutput>> {
    // v0.8.12 HIGH-9 fix: same `s3:PutObject` gate as
    // `put_object` / `create_multipart_upload`. Even though
    // Create already passed the gate, a bucket policy that
    // *revokes* `s3:PutObject` mid-flight should stop further
    // parts.
    let part_bucket = req.input.bucket.clone();
    let part_key = req.input.key.clone();
    self.enforce_policy(&req, "s3:PutObject", &part_bucket, Some(&part_key))?;
    self.enforce_rate_limit(&req, &part_bucket)?;
    // Each part is compressed and forwarded with a frame header; on GET the
    // frames are decoded in order.
    // **Per-part codec dispatch**: the dispatcher picks the codec from a
    // sample at the head of the body, so a mixed-content multipart upload
    // can use the best codec for each part.
    //
    // v0.8 #54 BUG-5/BUG-10 fix: lookup the per-upload SSE
    // context captured by `create_multipart_upload` and (a) strip
    // any SSE-C request headers off `req.input` so the backend
    // doesn't see them — same root cause as v0.7 #48 BUG-2/3 on
    // single-PUT; MinIO refuses SSE-C parts over HTTP — and (b)
    // observe that an upload context exists for `upload_id`. The
    // actual encrypt happens once at `complete_multipart_upload`
    // time on the assembled body (the per-part-encrypt approach
    // would require a matching multi-segment decrypt path on GET;
    // encrypting the whole assembled body keeps the GET path's
    // `is_sse_encrypted` branch working unchanged).
    let sse_ctx = self.multipart_state.get(req.input.upload_id.as_str());
    // v0.8.2 #62 (H-1 audit fix): SSE-C key consistency check.
    // The AWS S3 spec requires the same SSE-C key headers on
    // every UploadPart and rejects mismatches with 400. Prior to
    // #62 we silently stripped the headers (BUG-10 fix) without
    // validating them, allowing a client to send part 1 under
    // key-A and part 2 under key-B; both got stored, then
    // re-encrypted with key-A on Complete, so the client lost track
    // of which key encrypts what. Validate the per-part headers
    // now and reject on mismatch / omission / partial supply,
    // matching real-S3 behaviour.
    if let Some(ref ctx) = sse_ctx {
        if let crate::multipart_state::MultipartSseMode::SseC {
            key_md5: ctx_md5, ..
        } = &ctx.sse
        {
            // Take the triple off the request up front so it can never
            // reach the backend, whatever the validation outcome.
            let alg = req.input.sse_customer_algorithm.take();
            let key_b64 = req.input.sse_customer_key.take();
            let md5_b64 = req.input.sse_customer_key_md5.take();
            match (alg, key_b64, md5_b64) {
                (Some(a), Some(k), Some(m)) => {
                    // Parse + validate; if the per-part headers
                    // are themselves malformed (algorithm not
                    // AES256, MD5 mismatch, key not 32 bytes)
                    // surface the same 400 the single-PUT path
                    // would. Then compare the parsed MD5 to the
                    // upload-context's MD5; mismatch is a
                    // different-key UploadPart and must reject.
                    let part_material = crate::sse::parse_customer_key_headers(&a, &k, &m)
                        .map_err(sse_c_error_to_s3)?;
                    if part_material.key_md5 != *ctx_md5 {
                        return Err(S3Error::with_message(
                            S3ErrorCode::InvalidArgument,
                            "SSE-C key on UploadPart does not match the key supplied on CreateMultipartUpload",
                        ));
                    }
                    // OK — same key as Create. Headers are
                    // already taken off `req.input` so the
                    // backend never sees them.
                }
                (None, None, None) => {
                    // AWS S3 spec: SSE-C headers MUST be replayed
                    // on every UploadPart of an SSE-C multipart.
                    // Real-S3 returns 400 InvalidRequest in this
                    // case; mirror that.
                    return Err(S3Error::with_message(
                        S3ErrorCode::InvalidRequest,
                        "SSE-C requires customer-key headers on every UploadPart (CreateMultipartUpload was SSE-C)",
                    ));
                }
                _ => {
                    // Partial header set (e.g. algorithm + key
                    // but no MD5) — same handling as the
                    // single-PUT `extract_sse_c_material` helper.
                    return Err(S3Error::with_message(
                        S3ErrorCode::InvalidRequest,
                        "SSE-C requires all three of: x-amz-server-side-encryption-customer-{algorithm,key,key-MD5}",
                    ));
                }
            }
        } else {
            // CreateMultipartUpload was non-SSE-C (None / SseS4 /
            // SseKms). A part that arrives carrying SSE-C headers
            // is either a confused client or an attempt to
            // smuggle SSE-C around the gateway-internal SSE
            // recipe. Reject with 400 InvalidRequest rather than
            // silently strip — the strip would let the client
            // believe the part was encrypted under their key
            // when in fact the upload's encryption recipe is
            // whatever the Create captured.
            if req.input.sse_customer_algorithm.is_some()
                || req.input.sse_customer_key.is_some()
                || req.input.sse_customer_key_md5.is_some()
            {
                return Err(S3Error::with_message(
                    S3ErrorCode::InvalidRequest,
                    "UploadPart sent SSE-C headers but CreateMultipartUpload was not SSE-C",
                ));
            }
        }
    } else {
        // No upload context registered (gateway crashed between
        // Create and Part, or pre-#62 abandoned-upload restore).
        // We can't check key consistency in this case — strip
        // the headers and let the request through unchanged so
        // the backend's `NoSuchUpload` reply (or whatever it
        // chooses to do) flows back to the client.
        let _ = req.input.sse_customer_algorithm.take();
        let _ = req.input.sse_customer_key.take();
        let _ = req.input.sse_customer_key_md5.take();
    }
    // The context is not consulted again below; the underscore-prefixed
    // rebinding only keeps it alive to the end of the function.
    let _sse_ctx = sse_ctx;
    // A request without a body skips compression and is forwarded as-is.
    if let Some(blob) = req.input.body.take() {
        let bytes = collect_blob(blob, self.max_body_bytes)
            .await
            .map_err(internal("collect upload_part body"))?;
        // v0.8.12 HIGH-12 / #128 MED-C: verify the client-supplied
        // Content-MD5 and checksum headers against the received part body
        // (before the body is replaced by the compressed frame).
        verify_client_body_checksums(
            &bytes,
            req.input.content_md5.as_deref(),
            req.input.checksum_crc32.as_deref(),
            req.input.checksum_crc32c.as_deref(),
            req.input.checksum_sha1.as_deref(),
            req.input.checksum_sha256.as_deref(),
            req.input.checksum_crc64nvme.as_deref(),
        )?;
        // The dispatcher only sees the first `SAMPLE_BYTES` of the part.
        let sample_len = bytes.len().min(SAMPLE_BYTES);
        // v0.8 #56: full part body is already in memory here; use its
        // length as the size hint so the dispatcher can promote to GPU
        // if it's big enough.
        let codec_kind = self
            .dispatcher
            .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
            .await;
        // Captured before `bytes` is moved into the compressor.
        let original_size = bytes.len() as u64;
        // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
        // Telemetry is stamped before the compress result is unwrapped, so
        // it is recorded on failure too.
        let (compress_res, tel) = self
            .registry
            .compress_with_telemetry(bytes, codec_kind)
            .await;
        stamp_gpu_compress_telemetry(&tel);
        let (compressed, manifest) =
            compress_res.map_err(internal("registry compress part"))?;
        // Frame = header (codec, sizes, CRC32C from the manifest) followed
        // by the compressed payload.
        let header = FrameHeader {
            codec: codec_kind,
            original_size,
            compressed_size: compressed.len() as u64,
            crc32c: manifest.crc32c,
        };
        let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
        write_frame(&mut framed, header, &compressed);
        // v0.2 #5: heuristic-based padding skip for likely-final parts.
        //
        // AWS SDK / aws-cli / boto3 always send the final (and only the
        // final) part below the configured part_size. So if the raw user
        // part is already smaller than S3's 5 MiB multipart minimum, this
        // is overwhelmingly likely to be the final part — and the final
        // part is exempt from S3's size constraint. Skipping padding here
        // saves up to ~5 MiB per object on highly compressible workloads.
        //
        // If a misbehaving client sends a tiny **non-final** part, S3
        // itself rejects with EntityTooSmall at CompleteMultipartUpload —
        // identical outcome to a vanilla S3 PUT, just earlier than
        // padding-then-complete would catch it.
        let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
        if !likely_final {
            pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
        }
        let framed_bytes = framed.freeze();
        let new_len = framed_bytes.len() as i64;
        // The same wire-compatibility issue as single PUT applies to
        // multipart: the client's content-length and checksums describe
        // the original body, not the framed one, so rewrite the length and
        // clear every checksum field.
        req.input.content_length = Some(new_len);
        req.input.checksum_algorithm = None;
        req.input.checksum_crc32 = None;
        req.input.checksum_crc32c = None;
        req.input.checksum_crc64nvme = None;
        req.input.checksum_sha1 = None;
        req.input.checksum_sha256 = None;
        req.input.content_md5 = None;
        req.input.body = Some(bytes_to_blob(framed_bytes));
        debug!(
            part_number = ?req.input.part_number,
            upload_id = ?req.input.upload_id,
            original_size,
            framed_size = new_len,
            "S4 upload_part: framed compressed payload"
        );
    }
    self.backend.upload_part(req).await
}
5304 async fn complete_multipart_upload(
5305 &self,
5306 mut req: S3Request<CompleteMultipartUploadInput>,
5307 ) -> S3Result<S3Response<CompleteMultipartUploadOutput>> {
5308 let bucket = req.input.bucket.clone();
5309 let key = req.input.key.clone();
5310 let upload_id = req.input.upload_id.clone();
5311 // v0.8.12 HIGH-9 fix: gate Complete on `s3:PutObject` (the
5312 // commit point for the multipart-assembled object).
5313 self.enforce_policy(&req, "s3:PutObject", &bucket, Some(&key))?;
5314 self.enforce_rate_limit(&req, &bucket)?;
5315 // v0.8.12 HIGH-6 fix: re-verify Object Lock on the target key
5316 // at Complete time. Without this an attacker with PutObject
5317 // permission could `CreateMultipartUpload` against a key
5318 // that's currently under retention / legal hold and silently
5319 // overwrite it on Complete (the single-PUT path runs the
5320 // same check at L2007). Compliance retention is never
5321 // bypassable; Governance only with explicit IAM permission
5322 // (HIGH-7 gate below).
5323 if let Some(mgr) = self.object_lock.as_ref()
5324 && let Some(state) = mgr.get(&bucket, &key)
5325 {
5326 // CompleteMultipartUpload doesn't carry the bypass header
5327 // (the s3s DTO matches AWS' wire schema). A locked key
5328 // therefore cannot be overwritten by Complete regardless
5329 // of caller permission — operators who need to break a
5330 // Governance lock do it via PutObjectRetention before
5331 // calling Complete.
5332 let now = chrono::Utc::now();
5333 if !state.can_delete(now, false) {
5334 crate::metrics::record_policy_denial("s3:PutObject", &bucket);
5335 return Err(S3Error::with_message(
5336 S3ErrorCode::AccessDenied,
5337 "Access Denied because target key is protected by object lock",
5338 ));
5339 }
5340 }
5341 // v0.8.1 #59: serialise concurrent Complete invocations on the
5342 // same `(bucket, key)`. The race window the lock closes is the
5343 // GET-assembled-body → encrypt → PUT-encrypted-body triple
5344 // below (BUG-5 fix); without serialisation, two Completes for
5345 // different `upload_id` but the same logical key could each
5346 // read the other's plaintext assembled body and overwrite the
5347 // peer's encrypted result. The guard is held to function exit
5348 // (drop on `Ok` / `Err`), covering version-id mint, object-
5349 // lock apply, tagging persist, and replication enqueue too.
5350 let completion_lock = self.multipart_state.completion_lock(&bucket, &key);
5351 let _completion_guard = completion_lock.lock().await;
5352 // v0.8 #54 — fetch the per-upload context captured on Create.
5353 // `None` means an abandoned / unknown upload_id (gateway
5354 // crashed between Create and Complete, or pre-v0.8 state
5355 // restore); we still let the backend do its thing for
5356 // transparency, but we can't apply any SSE / version / lock /
5357 // tag / replication post-processing because we never captured
5358 // the recipe.
5359 let ctx = self.multipart_state.get(upload_id.as_str());
5360 // v0.8 #54 BUG-10 fix: same SSE-C header strip as upload_part
5361 // — some clients (boto3 / aws-sdk-cpp older versions) replay
5362 // the SSE-C triple on Complete too, and MinIO will choke if
5363 // they reach the backend.
5364 let _ = req.input.sse_customer_algorithm.take();
5365 let _ = req.input.sse_customer_key.take();
5366 let _ = req.input.sse_customer_key_md5.take();
5367 let mut resp = self.backend.complete_multipart_upload(req).await?;
5368 // CompleteMultipartUpload 成功 → 完成した object を full fetch して frame
5369 // index を build、`<key>.s4index` sidecar として保存。これで Range GET の
5370 // partial fetch path が利用可能になる (Range request の帯域節約)。
5371 // 注: 巨大 object の場合この pass は重いが、Range query は一度 sidecar が
5372 // できれば爆速になるので 1 回の cost は payback される
5373 //
5374 // v0.8 #54 BUG-5..9: this same fetch is the choke-point for
5375 // the SSE encrypt re-PUT + versioning shadow-key rewrite +
5376 // replication source-bytes capture, so we GET once and reuse
5377 // the bytes for every post-processing step.
5378 let assembled_body: Option<bytes::Bytes> = if let Ok(uri) = safe_object_uri(&bucket, &key) {
5379 let get_input = GetObjectInput {
5380 bucket: bucket.clone(),
5381 key: key.clone(),
5382 ..Default::default()
5383 };
5384 let get_req = S3Request {
5385 input: get_input,
5386 method: http::Method::GET,
5387 uri,
5388 headers: http::HeaderMap::new(),
5389 extensions: http::Extensions::new(),
5390 credentials: None,
5391 region: None,
5392 service: None,
5393 trailing_headers: None,
5394 };
5395 match self.backend.get_object(get_req).await {
5396 Ok(get_resp) => match get_resp.output.body {
5397 Some(blob) => collect_blob(blob, self.max_body_bytes).await.ok(),
5398 None => None,
5399 },
5400 Err(e) => {
5401 // v0.8.4 #71 (C-1 audit fix): a silent
5402 // `Err(_) => None` here is a SSE plaintext
5403 // leak. The post-processing block below only
5404 // runs the SSE re-encrypt branch when
5405 // `assembled_body.is_some()`, so swallowing a
5406 // backend error skipped the encrypt step and
5407 // left the multipart object on disk as
5408 // plaintext, even on SSE-S4 / SSE-C / SSE-KMS
5409 // configured buckets. Same root-cause family
5410 // as v0.8 BUG-5; this branch closes the
5411 // remaining read-side window.
5412 //
5413 // We distinguish two cases:
5414 // - `NoSuchKey`: the object is genuinely
5415 // missing post-Complete. This is rare and
5416 // typically races with a concurrent
5417 // DeleteObject; there is nothing to re-
5418 // encrypt and no SSE markers to honour, so
5419 // falling through to the legacy
5420 // `assembled_body = None` path is safe.
5421 // - everything else (5xx, network, auth,
5422 // etc.): we must FAIL the Complete so the
5423 // client can retry. Returning Ok with
5424 // `assembled_body = None` would silently
5425 // skip the SSE re-encrypt and leave the
5426 // backend bytes plaintext.
5427 if matches!(e.code(), &S3ErrorCode::NoSuchKey) {
5428 tracing::warn!(
5429 bucket = %bucket,
5430 key = %key,
5431 "multipart Complete: backend GET returned NoSuchKey; \
5432 skipping post-processing (object likely raced with DeleteObject)"
5433 );
5434 None
5435 } else {
5436 tracing::error!(
5437 bucket = %bucket,
5438 key = %key,
5439 error = %e,
5440 "multipart Complete: backend GET failed; failing the Complete \
5441 so the client retries (silent fall-through would skip SSE \
5442 re-encrypt and store plaintext)"
5443 );
5444 return Err(internal("multipart Complete: backend body fetch failed")(e));
5445 }
5446 }
5447 }
5448 } else {
5449 None
5450 };
5451 // Sidecar build (existing behaviour, gated on assembled body).
5452 //
5453 // v0.8.12 HIGH-10 fix: skip the sidecar when the Complete is
5454 // going to SSE-encrypt the assembled body before re-PUT (the
5455 // single-PUT path applies the same suppression at L2271).
5456 // Stale offsets into the pre-encrypt body would break Range
5457 // GET on the encrypted on-disk bytes. `ctx.sse != None`
5458 // covers all three SSE modes captured at Create time.
5459 let mp_will_encrypt = ctx
5460 .as_ref()
5461 .map(|c| !matches!(c.sse, crate::multipart_state::MultipartSseMode::None))
5462 .unwrap_or(false);
5463 // v0.8.16 F-7: versioned multipart writes the assembled body
5464 // under `versioned_shadow_key(&key, vid)` *after* this
5465 // sidecar block, then deletes the original `<key>`. Stamping
5466 // the sidecar against the to-be-deleted `<key>` (which is
5467 // what H-g did) leaves an orphan `<key>.s4index` whose
5468 // source-ETag binding can never match the live shadow body
5469 // — the Range GET fast-path's stale-sidecar check then
5470 // falls through to a full read on every request, silently
5471 // disabling partial fetch. Skip the sidecar build entirely
5472 // for versioned buckets; a follow-up issue tracks writing
5473 // the sidecar under the shadow key with the shadow's ETag.
5474 let mp_skip_sidecar_for_versioning = self
5475 .versioning
5476 .as_ref()
5477 .map(|mgr| mgr.state(&bucket))
5478 .map(|state| state == crate::versioning::VersioningState::Enabled)
5479 .unwrap_or(false);
5480 if let Some(ref body) = assembled_body
5481 && !mp_will_encrypt
5482 && !mp_skip_sidecar_for_versioning
5483 && let Ok(mut index) = build_index_from_body(body)
5484 {
5485 // v0.8.15 H-g: stamp the source-ETag / source-compressed-size
5486 // binding on the multipart sidecar. The single-PUT path
5487 // does this at L2519-L2521 via the backend's PUT response,
5488 // but Complete returns its own ETag (an opaque manifest
5489 // hash) so we have to HEAD the freshly-completed object
5490 // to pick up what backend actually wrote, then bind the
5491 // sidecar to those values. Without the binding, a
5492 // subsequent backend-side mutation (lifecycle rewrite,
5493 // out-of-band CopyObject) wouldn't trip the staleness
5494 // check on the next Range GET — the GET would happily
5495 // slice the new bytes at the old sidecar offsets, with
5496 // silent data corruption.
5497 if let Ok(uri) = safe_object_uri(&bucket, &key) {
5498 let head_req = S3Request {
5499 input: HeadObjectInput {
5500 bucket: bucket.clone(),
5501 key: key.clone(),
5502 ..Default::default()
5503 },
5504 method: http::Method::HEAD,
5505 uri,
5506 headers: http::HeaderMap::new(),
5507 extensions: http::Extensions::new(),
5508 credentials: None,
5509 region: None,
5510 service: None,
5511 trailing_headers: None,
5512 };
5513 if let Ok(head) = self.backend.head_object(head_req).await {
5514 index.source_etag = head.output.e_tag.as_ref().map(|t| t.value().to_string());
5515 index.source_compressed_size = head
5516 .output
5517 .content_length
5518 .and_then(|n| u64::try_from(n).ok());
5519 }
5520 // HEAD failure is non-fatal — the sidecar still works
5521 // as a v1-style best-effort fast path; the Range GET
5522 // simply falls back to a full read on any consistency
5523 // signal.
5524 }
5525 self.write_sidecar(&bucket, &key, &index).await;
5526 }
5527 // From here on, post-processing depends on the context —
5528 // short-circuit when the upload had no captured recipe
5529 // (legacy / crashed-Create / pre-v0.8 state restore).
5530 if let Some(ctx) = ctx {
5531 // v0.8 #54 BUG-6 fix: mint a version-id when the bucket
5532 // is versioning-Enabled. The single-PUT path does this in
5533 // `put_object` ~L1968; multipart was the missing branch.
5534 // We mint here (post-Complete, before any re-PUT) so the
5535 // same vid threads into both the shadow-key rewrite and
5536 // the VersionEntry the manager records.
5537 let pending_version: Option<crate::versioning::PutOutcome> = self
5538 .versioning
5539 .as_ref()
5540 .map(|mgr| mgr.state(&bucket))
5541 .map(|state| match state {
5542 crate::versioning::VersioningState::Enabled => crate::versioning::PutOutcome {
5543 version_id: crate::versioning::VersioningManager::new_version_id(),
5544 versioned_response: true,
5545 },
5546 crate::versioning::VersioningState::Suspended
5547 | crate::versioning::VersioningState::Unversioned => {
5548 crate::versioning::PutOutcome {
5549 version_id: crate::versioning::NULL_VERSION_ID.to_owned(),
5550 versioned_response: false,
5551 }
5552 }
5553 });
5554 // v0.8 #54 BUG-5 fix: encrypt the assembled framed body
5555 // and re-PUT it to the backend so the on-disk bytes are
5556 // SSE-encrypted. The single-PUT path does this body-by-
5557 // body inside `put_object` (L1907-L1942); for multipart,
5558 // encrypt-per-part would require a multi-segment decrypt
5559 // path on GET — we instead do a single encrypt over the
5560 // assembled framed body so the existing GET decrypt
5561 // branch (`is_sse_encrypted` → `decrypt(body, source)` →
5562 // FrameIter) handles it unchanged.
5563 //
5564 // The cost is one extra round-trip per Complete for SSE-
5565 // enabled multipart (already-paid for the sidecar build).
5566 // For single-instance gateways pointing at a co-located
5567 // backend this is negligible; cross-region operators
5568 // would benefit from per-part encrypt + multi-segment
5569 // decrypt as a follow-up.
5570 let needs_re_put = matches!(
5571 ctx.sse,
5572 crate::multipart_state::MultipartSseMode::SseS4
5573 | crate::multipart_state::MultipartSseMode::SseC { .. }
5574 | crate::multipart_state::MultipartSseMode::SseKms { .. }
5575 ) || pending_version
5576 .as_ref()
5577 .map(|pv| pv.versioned_response)
5578 .unwrap_or(false);
5579 // v0.8.11 CRIT-2 fix: seed the replication body with the
5580 // pre-encrypt assembled bytes, but overwrite it with the
5581 // post-encrypt `new_body` once the re-PUT branch lands.
5582 // The previous "snapshot in advance" pattern shipped the
5583 // *plaintext* framed body to the destination bucket even
5584 // when SSE-S4 / SSE-C / SSE-KMS was active — the GET on
5585 // the destination would then fail to decrypt (or, worse,
5586 // succeed in handing out plaintext that the source had
5587 // promised was encrypted at rest). When `needs_re_put`
5588 // is false (no SSE, no versioning), the backend still
5589 // holds the original plaintext-framed bytes, and the
5590 // seed value is what the destination should receive.
5591 let mut replication_body = assembled_body.clone();
5592 let mut applied_metadata: Option<std::collections::HashMap<String, String>> = None;
5593 if needs_re_put && let Some(body) = assembled_body {
5594 // v0.8.1 #58: same Zeroizing pattern as put_object's
5595 // single-PUT KMS branch — DEK plaintext lives in
5596 // `Zeroizing<[u8; 32]>` for the lifetime of this
5597 // Complete handler, then is wiped on drop.
5598 let kms_wrap: Option<(zeroize::Zeroizing<[u8; 32]>, crate::kms::WrappedDek)> =
5599 if let crate::multipart_state::MultipartSseMode::SseKms { ref key_id } = ctx.sse
5600 {
5601 let kms = self.kms.as_ref().ok_or_else(|| {
5602 S3Error::with_message(
5603 S3ErrorCode::InvalidRequest,
5604 "SSE-KMS requested but no --kms-local-dir / --kms-aws-region is configured on this gateway",
5605 )
5606 })?;
5607 let (dek, wrapped) =
5608 kms.generate_dek(key_id).await.map_err(kms_error_to_s3)?;
5609 if dek.len() != 32 {
5610 return Err(S3Error::with_message(
5611 S3ErrorCode::InternalError,
5612 format!(
5613 "KMS backend returned a DEK of {} bytes (expected 32)",
5614 dek.len()
5615 ),
5616 ));
5617 }
5618 let mut dek_arr: zeroize::Zeroizing<[u8; 32]> =
5619 zeroize::Zeroizing::new([0u8; 32]);
5620 dek_arr.copy_from_slice(&dek);
5621 // `dek` (Zeroizing<Vec<u8>>) is dropped at scope end.
5622 Some((dek_arr, wrapped))
5623 } else {
5624 None
5625 };
5626 // Build the new metadata map: re-fetch via HEAD so
5627 // the multipart / codec markers the backend stamped
5628 // on Create flow through unchanged, then layer the
5629 // SSE markers on top.
5630 let head_req = S3Request {
5631 input: HeadObjectInput {
5632 bucket: bucket.clone(),
5633 key: key.clone(),
5634 ..Default::default()
5635 },
5636 method: http::Method::HEAD,
5637 uri: safe_object_uri(&bucket, &key)?,
5638 headers: http::HeaderMap::new(),
5639 extensions: http::Extensions::new(),
5640 credentials: None,
5641 region: None,
5642 service: None,
5643 trailing_headers: None,
5644 };
5645 let mut new_metadata: std::collections::HashMap<String, String> =
5646 match self.backend.head_object(head_req).await {
5647 Ok(h) => h.output.metadata.unwrap_or_default(),
5648 Err(_) => std::collections::HashMap::new(),
5649 };
5650 let new_body = match &ctx.sse {
5651 crate::multipart_state::MultipartSseMode::SseC { key, key_md5 } => {
5652 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5653 new_metadata.insert("s4-sse-type".into(), "AES256".into());
5654 new_metadata.insert(
5655 "s4-sse-c-key-md5".into(),
5656 base64::engine::general_purpose::STANDARD.encode(key_md5),
5657 );
5658 // v0.8.2 #62: `key` is `&Zeroizing<[u8; 32]>`;
5659 // auto-deref through one explicit binding so
5660 // `SseSource::CustomerKey` gets the `&[u8; 32]`
5661 // it expects (mirrors the SSE-KMS DEK shape
5662 // a few lines down).
5663 let key_ref: &[u8; 32] = key;
5664 crate::sse::encrypt_with_source(
5665 &body,
5666 crate::sse::SseSource::CustomerKey {
5667 key: key_ref,
5668 key_md5,
5669 },
5670 )
5671 }
5672 crate::multipart_state::MultipartSseMode::SseKms { .. } => {
5673 let (dek, wrapped) = kms_wrap
5674 .as_ref()
5675 .expect("SseKms branch implies kms_wrap is Some");
5676 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5677 new_metadata.insert("s4-sse-type".into(), "aws:kms".into());
5678 new_metadata.insert("s4-sse-kms-key-id".into(), wrapped.key_id.clone());
5679 // v0.8.1 #58: auto-deref from `&Zeroizing<[u8; 32]>`
5680 // to `&[u8; 32]` (same shape as the put_object
5681 // single-PUT branch).
5682 let dek_ref: &[u8; 32] = dek;
5683 crate::sse::encrypt_with_source(
5684 &body,
5685 crate::sse::SseSource::Kms {
5686 dek: dek_ref,
5687 wrapped,
5688 },
5689 )
5690 }
5691 crate::multipart_state::MultipartSseMode::SseS4 => {
5692 let keyring = self.sse_keyring.as_ref().ok_or_else(|| {
5693 S3Error::with_message(
5694 S3ErrorCode::InternalError,
5695 "SSE-S4 captured at Create but keyring missing at Complete",
5696 )
5697 })?;
5698 new_metadata.insert("s4-encrypted".into(), "aes-256-gcm".into());
5699 // SSE-S4 deliberately omits `s4-sse-type` so
5700 // HEAD doesn't falsely advertise AWS-style
5701 // SSE-S3 (matches the put_object L1929-L1939
5702 // comment).
5703 // v0.8 #52: same chunk_size dispatch as the
5704 // single-PUT branch — multipart Complete
5705 // re-encrypts the assembled body, so honoring
5706 // the chunked path here is required to keep
5707 // GET streaming on multipart-uploaded objects.
5708 if self.sse_chunk_size > 0 {
5709 crate::sse::encrypt_v2_chunked(&body, keyring, self.sse_chunk_size)
5710 .map_err(|e| {
5711 S3Error::with_message(
5712 S3ErrorCode::InternalError,
5713 format!("SSE-S4 chunked encrypt failed at Complete: {e}"),
5714 )
5715 })?
5716 } else {
5717 crate::sse::encrypt_v2(&body, keyring)
5718 }
5719 }
5720 crate::multipart_state::MultipartSseMode::None => body.clone(),
5721 };
5722 // v0.8 #54 BUG-6 fix: write the re-PUT under the
5723 // shadow key so the version chain doesn't overwrite
5724 // the previous version on a versioned bucket. The
5725 // original (unshadowed) key was assembled by the
5726 // backend on Complete; we delete it after the shadow
5727 // PUT lands.
5728 let put_target_key = if let Some(pv) = pending_version.as_ref() {
5729 if pv.versioned_response {
5730 versioned_shadow_key(&key, &pv.version_id)
5731 } else {
5732 key.clone()
5733 }
5734 } else {
5735 key.clone()
5736 };
5737 let new_body_len = new_body.len() as i64;
5738 let put_req = S3Request {
5739 input: PutObjectInput {
5740 bucket: bucket.clone(),
5741 key: put_target_key.clone(),
5742 body: Some(bytes_to_blob(new_body.clone())),
5743 metadata: Some(new_metadata.clone()),
5744 content_length: Some(new_body_len),
5745 ..Default::default()
5746 },
5747 method: http::Method::PUT,
5748 uri: safe_object_uri(&bucket, &put_target_key)?,
5749 headers: http::HeaderMap::new(),
5750 extensions: http::Extensions::new(),
5751 credentials: None,
5752 region: None,
5753 service: None,
5754 trailing_headers: None,
5755 };
5756 self.backend.put_object(put_req).await?;
5757 // v0.8.11 CRIT-2 fix: refresh the replication snapshot
5758 // with the bytes that were actually persisted to the
5759 // backend (post-SSE-encrypt for SSE modes; identical to
5760 // `body` for `MultipartSseMode::None` + versioning-only
5761 // re-PUT). The destination then sees the same on-disk
5762 // shape the source does, and a destination GET decrypts
5763 // correctly when SSE is on.
5764 replication_body = Some(new_body.clone());
5765 // If we rewrote the storage key (versioning shadow),
5766 // we must drop the original (unshadowed) Complete-
5767 // assembled bytes so subsequent listings don't see a
5768 // duplicate.
5769 if put_target_key != key {
5770 let del_req = S3Request {
5771 input: DeleteObjectInput {
5772 bucket: bucket.clone(),
5773 key: key.clone(),
5774 ..Default::default()
5775 },
5776 method: http::Method::DELETE,
5777 uri: safe_object_uri(&bucket, &key)?,
5778 headers: http::HeaderMap::new(),
5779 extensions: http::Extensions::new(),
5780 credentials: None,
5781 region: None,
5782 service: None,
5783 trailing_headers: None,
5784 };
5785 let _ = self.backend.delete_object(del_req).await;
5786 }
5787 applied_metadata = Some(new_metadata);
5788 }
5789 // v0.8 #54 BUG-6 commit: register the new version with
5790 // the VersioningManager so list_object_versions /
5791 // GET ?versionId= see it.
5792 if let (Some(mgr), Some(pv)) = (self.versioning.as_ref(), pending_version.as_ref()) {
5793 let etag = resp
5794 .output
5795 .e_tag
5796 .clone()
5797 .map(ETag::into_value)
5798 .unwrap_or_default();
5799 let now = chrono::Utc::now();
5800 mgr.commit_put_with_version(
5801 &bucket,
5802 &key,
5803 crate::versioning::VersionEntry {
5804 version_id: pv.version_id.clone(),
5805 etag,
5806 size: replication_body
5807 .as_ref()
5808 .map(|b| b.len() as u64)
5809 .unwrap_or(0),
5810 is_delete_marker: false,
5811 created_at: now,
5812 },
5813 );
5814 if pv.versioned_response {
5815 resp.output.version_id = Some(pv.version_id.clone());
5816 }
5817 }
5818 // v0.8 #54 BUG-7 fix: persist any per-upload Object Lock
5819 // recipe + auto-apply the bucket default. Mirrors the
5820 // put_object L2057-L2074 block.
5821 if let Some(mgr) = self.object_lock.as_ref() {
5822 if ctx.object_lock_mode.is_some()
5823 || ctx.object_lock_retain_until.is_some()
5824 || ctx.object_lock_legal_hold
5825 {
5826 let mut state = mgr.get(&bucket, &key).unwrap_or_default();
5827 if let Some(m) = ctx.object_lock_mode {
5828 state.mode = Some(m);
5829 }
5830 if let Some(u) = ctx.object_lock_retain_until {
5831 state.retain_until = Some(u);
5832 }
5833 if ctx.object_lock_legal_hold {
5834 state.legal_hold_on = true;
5835 }
5836 mgr.set(&bucket, &key, state);
5837 }
5838 mgr.apply_default_on_put(&bucket, &key, chrono::Utc::now());
5839 }
5840 // v0.8 #54 BUG-9 fix: persist the captured tags via the
5841 // TagManager so GetObjectTagging returns them.
5842 if let (Some(mgr), Some(tags)) = (self.tagging.as_ref(), ctx.tags.as_ref()) {
5843 mgr.put_object_tags(&bucket, &key, tags.clone());
5844 }
5845 // SSE-C / SSE-KMS response echo. The
5846 // CompleteMultipartUploadOutput only exposes
5847 // `server_side_encryption` + `ssekms_key_id` (no
5848 // sse_customer_* — those round-tripped on Create / parts).
5849 match &ctx.sse {
5850 crate::multipart_state::MultipartSseMode::SseC { .. } => {
5851 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5852 ServerSideEncryption::AES256,
5853 ));
5854 }
5855 crate::multipart_state::MultipartSseMode::SseKms { key_id } => {
5856 resp.output.server_side_encryption = Some(ServerSideEncryption::from_static(
5857 ServerSideEncryption::AWS_KMS,
5858 ));
5859 resp.output.ssekms_key_id = Some(key_id.clone());
5860 }
5861 _ => {}
5862 }
5863 // v0.8 #54 BUG-8 fix: fire cross-bucket replication just
5864 // like put_object L2165 does. We hand the dispatcher the
5865 // assembled body bytes (post-encrypt where applicable, so
5866 // the destination ends up byte-identical to the source's
5867 // on-disk shape) plus the metadata that was actually
5868 // committed.
5869 let replication_body_bytes = replication_body.unwrap_or_default();
5870 // v0.8.2 #61: thread the multipart-Complete `pending_version`
5871 // through so a versioning-Enabled source's destination
5872 // receives the same shadow-key path (mirror of the
5873 // single-PUT branch above).
5874 self.spawn_replication_if_matched(
5875 &bucket,
5876 &key,
5877 &ctx.tags,
5878 &replication_body_bytes,
5879 &applied_metadata,
5880 true,
5881 pending_version.as_ref(),
5882 );
5883 self.multipart_state.remove(upload_id.as_str());
5884 }
5885 // v0.8.1 #59 janitor: best-effort sweep of stale completion
5886 // locks while we are still on the critical path of a single
5887 // Complete (so steady-state workloads of unique keys don't
5888 // accumulate `DashMap` entries). The sweep only retires
5889 // entries whose `Arc::strong_count == 1`, so any other in-
5890 // flight Complete on a different key keeps its lock alive.
5891 // Our own `_completion_guard` keeps `bucket`/`key`'s entry
5892 // alive across this call; it's reaped on the next Complete or
5893 // the next caller-driven prune.
5894 self.multipart_state.prune_completion_locks();
5895 Ok(resp)
5896 }
5897 async fn abort_multipart_upload(
5898 &self,
5899 req: S3Request<AbortMultipartUploadInput>,
5900 ) -> S3Result<S3Response<AbortMultipartUploadOutput>> {
5901 // v0.8.12 HIGH-9 fix: gate Abort on `s3:AbortMultipartUpload`
5902 // — the AWS-spec action verb for this operation. Without the
5903 // gate, anyone who could guess an upload_id could throw away
5904 // someone else's in-flight multipart upload.
5905 let abort_bucket = req.input.bucket.clone();
5906 let abort_key = req.input.key.clone();
5907 self.enforce_policy(
5908 &req,
5909 "s3:AbortMultipartUpload",
5910 &abort_bucket,
5911 Some(&abort_key),
5912 )?;
5913 // v0.8 #54: drop the per-upload state (SSE-C key bytes / tag
5914 // set) promptly so an aborted upload doesn't leak the
5915 // customer's key into a long-running gateway's RSS.
5916 //
5917 // v0.8.4 #71 (H-7 audit fix): backend.abort_multipart_upload
5918 // FIRST, then drop in-process state ONLY on success. The
5919 // previous order ("remove → call backend") meant a transient
5920 // backend abort failure (5xx, network) wiped the SSE-C key
5921 // bytes locally while leaving the parts on the backend, so a
5922 // client retry would have to re-validate the SSE-C key against
5923 // a context the gateway no longer has — and the retried abort
5924 // would still hit the unaborted backend parts. Calling the
5925 // backend first lets the failure propagate to the client with
5926 // state intact for a clean retry; only on success do we wipe
5927 // the local state.
5928 let upload_id = req.input.upload_id.as_str().to_owned();
5929 let resp = self.backend.abort_multipart_upload(req).await?;
5930 self.multipart_state.remove(&upload_id);
5931 Ok(resp)
5932 }
5933 async fn list_multipart_uploads(
5934 &self,
5935 req: S3Request<ListMultipartUploadsInput>,
5936 ) -> S3Result<S3Response<ListMultipartUploadsOutput>> {
5937 self.backend.list_multipart_uploads(req).await
5938 }
5939 async fn list_parts(
5940 &self,
5941 req: S3Request<ListPartsInput>,
5942 ) -> S3Result<S3Response<ListPartsOutput>> {
5943 self.backend.list_parts(req).await
5944 }
5945
5946 // =========================================================================
5947 // Phase 2 — pure passthrough delegations。S4 はこれらに対して圧縮 hook を
5948 // 持たないので、backend (= AWS S3) の動作と完全に同一。
5949 //
5950 // 既知の制限事項:
5951 // - copy_object / upload_part_copy: source object が S4-compressed の場合、
5952 // backend が bytes を copy するだけなので metadata (s4-codec etc) も一緒に
// copied される (AWS S3 default = MetadataDirective COPY)。GET は manifest
5954 // 経由で正しく decompress できる。MetadataDirective REPLACE で上書き
5955 // されると圧縮 metadata が消えて壊れる — 顧客側の運用で注意
5956 // - list_object_versions: versioning enabled bucket では各 version も S4
5957 // metadata を維持する。古い version も S4 経由で正しく GET できる。
5958 // =========================================================================
5959
5960 // ---- Object ACL / tagging / attributes ----
5961 async fn get_object_acl(
5962 &self,
5963 req: S3Request<GetObjectAclInput>,
5964 ) -> S3Result<S3Response<GetObjectAclOutput>> {
5965 // v0.8.17 G-2: reserved-name guard. Without it a hostile
5966 // client can `GetObjectAcl(<key>.s4index)` to confirm the
5967 // sidecar exists, an information leak the F-13 GET reject
5968 // closed for the same object.
5969 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5970 self.backend.get_object_acl(req).await
5971 }
5972 async fn put_object_acl(
5973 &self,
5974 req: S3Request<PutObjectAclInput>,
5975 ) -> S3Result<S3Response<PutObjectAclOutput>> {
5976 // v0.8.17 G-2: reserved-name guard. `put-object-acl
5977 // --acl public-read` against `<key>.s4index` would grant
5978 // external read access to the internal sidecar, bypassing
5979 // the F-13 GET reject via the backend's public-URL path.
5980 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
5981 self.backend.put_object_acl(req).await
5982 }
5983 // v0.6 #39: object tagging — when a `TagManager` is attached the
5984 // configuration / per-(bucket, key) state lives in the manager and
5985 // these handlers serve directly from it; when no manager is
5986 // attached they fall back to the backend (legacy passthrough so
5987 // v0.5 deployments are unaffected).
5988 async fn get_object_tagging(
5989 &self,
5990 req: S3Request<GetObjectTaggingInput>,
5991 ) -> S3Result<S3Response<GetObjectTaggingOutput>> {
5992 // v0.8.17 G-2: reserved-name guard.
5993 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
5994 let Some(mgr) = self.tagging.as_ref() else {
5995 return self.backend.get_object_tagging(req).await;
5996 };
5997 let tags = mgr
5998 .get_object_tags(&req.input.bucket, &req.input.key)
5999 .unwrap_or_default();
6000 Ok(S3Response::new(GetObjectTaggingOutput {
6001 tag_set: tagset_to_aws(&tags),
6002 ..Default::default()
6003 }))
6004 }
6005 async fn put_object_tagging(
6006 &self,
6007 req: S3Request<PutObjectTaggingInput>,
6008 ) -> S3Result<S3Response<PutObjectTaggingOutput>> {
6009 // v0.8.17 G-2: reserved-name guard.
6010 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
6011 let Some(mgr) = self.tagging.as_ref() else {
6012 return self.backend.put_object_tagging(req).await;
6013 };
6014 let bucket = req.input.bucket.clone();
6015 let key = req.input.key.clone();
6016 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6017 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6018 // v0.6 #39: gate via IAM policy with both the request tags
6019 // (`s3:RequestObjectTag/<key>`) and any existing tags on the
6020 // target object (`s3:ExistingObjectTag/<key>`).
6021 let existing = mgr.get_object_tags(&bucket, &key);
6022 self.enforce_policy_with_extra(
6023 &req,
6024 "s3:PutObjectTagging",
6025 &bucket,
6026 Some(&key),
6027 Some(&parsed),
6028 existing.as_ref(),
6029 )?;
6030 mgr.put_object_tags(&bucket, &key, parsed);
6031 Ok(S3Response::new(PutObjectTaggingOutput::default()))
6032 }
6033 async fn delete_object_tagging(
6034 &self,
6035 req: S3Request<DeleteObjectTaggingInput>,
6036 ) -> S3Result<S3Response<DeleteObjectTaggingOutput>> {
6037 // v0.8.17 G-2: reserved-name guard.
6038 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
6039 let Some(mgr) = self.tagging.as_ref() else {
6040 return self.backend.delete_object_tagging(req).await;
6041 };
6042 let bucket = req.input.bucket.clone();
6043 let key = req.input.key.clone();
6044 let existing = mgr.get_object_tags(&bucket, &key);
6045 self.enforce_policy_with_extra(
6046 &req,
6047 "s3:DeleteObjectTagging",
6048 &bucket,
6049 Some(&key),
6050 None,
6051 existing.as_ref(),
6052 )?;
6053 mgr.delete_object_tags(&bucket, &key);
6054 Ok(S3Response::new(DeleteObjectTaggingOutput::default()))
6055 }
6056 async fn get_object_attributes(
6057 &self,
6058 req: S3Request<GetObjectAttributesInput>,
6059 ) -> S3Result<S3Response<GetObjectAttributesOutput>> {
6060 // v0.8.17 G-2: reserved-name guard. Attributes leak the
6061 // sidecar's size + ETag, same shape as F-13's GET concern.
6062 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Read)?;
6063 self.backend.get_object_attributes(req).await
6064 }
6065 async fn restore_object(
6066 &self,
6067 req: S3Request<RestoreObjectInput>,
6068 ) -> S3Result<S3Response<RestoreObjectOutput>> {
6069 // v0.8.17 G-2: reserved-name guard.
6070 self.check_not_reserved_key(&req.input.key, ReservedKeyMode::Mutating)?;
6071 self.backend.restore_object(req).await
6072 }
6073 async fn upload_part_copy(
6074 &self,
6075 req: S3Request<UploadPartCopyInput>,
6076 ) -> S3Result<S3Response<UploadPartCopyOutput>> {
6077 // v0.8.12 HIGH-9 fix: same per-action gates as `copy_object` —
6078 // destination PUT + source GET.
6079 let dst_bucket = req.input.bucket.clone();
6080 let dst_key = req.input.key.clone();
6081 // v0.8.17 G-2: reserved-name guard on both destination
6082 // and source. Mirrors what `copy_object` enforces.
6083 self.check_not_reserved_key(&dst_key, ReservedKeyMode::Mutating)?;
6084 if let CopySource::Bucket { key, .. } = &req.input.copy_source {
6085 self.check_not_reserved_key(key, ReservedKeyMode::Read)?;
6086 }
6087 self.enforce_policy(&req, "s3:PutObject", &dst_bucket, Some(&dst_key))?;
6088 if let CopySource::Bucket { bucket, key, .. } = &req.input.copy_source {
6089 self.enforce_policy(&req, "s3:GetObject", bucket, Some(key))?;
6090 }
6091 self.enforce_rate_limit(&req, &dst_bucket)?;
6092 // v0.2 #6: byte-range aware copy when the source is S4-framed.
6093 //
6094 // For a framed source (multipart upload OR single-PUT framed-v2),
6095 // a naive byte-range passthrough would copy compressed bytes that
6096 // don't align with S4 frame boundaries — silently corrupting the
6097 // result. Instead we GET the source through S4 (which handles
6098 // decompression + Range), re-compress + re-frame as a new part,
6099 // and forward as upload_part. For non-framed sources (S4-untouched
6100 // raw objects), passthrough is correct and we keep the original
6101 // (cheaper) code path.
6102 // v0.8.4 #74: propagate the optional `?versionId=<vid>` from the
6103 // copy-source header. Without this, a versioned source bucket
6104 // copy that pins a specific old version would silently fall
6105 // back to "latest", assembling wrong bytes into the destination
6106 // multipart object (silent data corruption).
6107 let CopySource::Bucket {
6108 bucket: src_bucket,
6109 key: src_key,
6110 version_id: src_version_id,
6111 } = &req.input.copy_source
6112 else {
6113 return self.backend.upload_part_copy(req).await;
6114 };
6115 let src_bucket = src_bucket.to_string();
6116 let src_key = src_key.to_string();
6117 let src_version_id: Option<String> = src_version_id.as_deref().map(str::to_owned);
6118
6119 // Probe metadata to decide whether the source needs S4-aware copy.
6120 let head_input = HeadObjectInput {
6121 bucket: src_bucket.clone(),
6122 key: src_key.clone(),
6123 version_id: src_version_id.clone(),
6124 ..Default::default()
6125 };
6126 let head_req = S3Request {
6127 input: head_input,
6128 method: http::Method::HEAD,
6129 uri: req.uri.clone(),
6130 headers: req.headers.clone(),
6131 extensions: http::Extensions::new(),
6132 credentials: req.credentials.clone(),
6133 region: req.region.clone(),
6134 service: req.service.clone(),
6135 trailing_headers: None,
6136 };
6137 let needs_s4_copy = match self.backend.head_object(head_req).await {
6138 Ok(h) => {
6139 is_multipart_object(&h.output.metadata) || is_framed_v2_object(&h.output.metadata)
6140 }
6141 Err(_) => false,
6142 };
6143 if !needs_s4_copy {
6144 return self.backend.upload_part_copy(req).await;
6145 }
6146
6147 // Resolve the optional source byte range to pass to GET.
6148 let source_range = req
6149 .input
6150 .copy_source_range
6151 .as_ref()
6152 .map(|r| parse_copy_source_range(r))
6153 .transpose()
6154 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidRange, e))?;
6155
6156 // GET source via S4 (handles decompression + sidecar partial fetch
6157 // when range is present). The result is the requested user-visible
6158 // byte range, fully decompressed. version_id is propagated so
6159 // pinned-version copies fetch the exact version requested.
6160 let mut get_input = GetObjectInput {
6161 bucket: src_bucket.clone(),
6162 key: src_key.clone(),
6163 version_id: src_version_id.clone(),
6164 ..Default::default()
6165 };
6166 get_input.range = source_range;
6167 let get_req = S3Request {
6168 input: get_input,
6169 method: http::Method::GET,
6170 uri: req.uri.clone(),
6171 headers: req.headers.clone(),
6172 extensions: http::Extensions::new(),
6173 credentials: req.credentials.clone(),
6174 region: req.region.clone(),
6175 service: req.service.clone(),
6176 trailing_headers: None,
6177 };
6178 let get_resp = self.get_object(get_req).await?;
6179 let blob = get_resp.output.body.ok_or_else(|| {
6180 S3Error::with_message(
6181 S3ErrorCode::InternalError,
6182 "upload_part_copy: empty body from source GET",
6183 )
6184 })?;
6185 let bytes = collect_blob(blob, self.max_body_bytes)
6186 .await
6187 .map_err(internal("collect upload_part_copy source body"))?;
6188
6189 // Compress + frame as a fresh part (mirrors upload_part path).
6190 let sample_len = bytes.len().min(SAMPLE_BYTES);
6191 // v0.8 #56: same size-hint promotion as the upload_part path.
6192 let codec_kind = self
6193 .dispatcher
6194 .pick_with_size_hint(&bytes[..sample_len], Some(bytes.len() as u64))
6195 .await;
6196 let original_size = bytes.len() as u64;
6197 // v0.8 #55: telemetry-returning compress (GPU metrics stamp).
6198 let (compress_res, tel) = self
6199 .registry
6200 .compress_with_telemetry(bytes, codec_kind)
6201 .await;
6202 stamp_gpu_compress_telemetry(&tel);
6203 let (compressed, manifest) =
6204 compress_res.map_err(internal("registry compress upload_part_copy"))?;
6205 let header = FrameHeader {
6206 codec: codec_kind,
6207 original_size,
6208 compressed_size: compressed.len() as u64,
6209 crc32c: manifest.crc32c,
6210 };
6211 let mut framed = BytesMut::with_capacity(FRAME_HEADER_BYTES + compressed.len());
6212 write_frame(&mut framed, header, &compressed);
6213 let likely_final = original_size < S3_MULTIPART_MIN_PART_BYTES as u64;
6214 if !likely_final {
6215 pad_to_minimum(&mut framed, S3_MULTIPART_MIN_PART_BYTES);
6216 }
6217 let framed_bytes = framed.freeze();
6218 let framed_len = framed_bytes.len() as i64;
6219
6220 // Forward as upload_part to the destination multipart upload.
6221 let part_input = UploadPartInput {
6222 bucket: req.input.bucket.clone(),
6223 key: req.input.key.clone(),
6224 part_number: req.input.part_number,
6225 upload_id: req.input.upload_id.clone(),
6226 body: Some(bytes_to_blob(framed_bytes)),
6227 content_length: Some(framed_len),
6228 ..Default::default()
6229 };
6230 let part_req = S3Request {
6231 input: part_input,
6232 method: http::Method::PUT,
6233 uri: req.uri.clone(),
6234 headers: req.headers.clone(),
6235 extensions: http::Extensions::new(),
6236 credentials: req.credentials.clone(),
6237 region: req.region.clone(),
6238 service: req.service.clone(),
6239 trailing_headers: None,
6240 };
6241 let upload_resp = self.backend.upload_part(part_req).await?;
6242
6243 let copy_output = UploadPartCopyOutput {
6244 copy_part_result: Some(CopyPartResult {
6245 e_tag: upload_resp.output.e_tag.clone(),
6246 ..Default::default()
6247 }),
6248 ..Default::default()
6249 };
6250 Ok(S3Response::new(copy_output))
6251 }
6252
6253 // ---- Object lock / retention / legal hold (v0.5 #30) ----
6254 //
6255 // When an `ObjectLockManager` is attached the configuration / per-object
6256 // state lives in the manager and these handlers serve directly from it;
6257 // when no manager is attached they fall back to the backend (legacy
6258 // passthrough so v0.4 deployments are unaffected).
6259 async fn get_object_lock_configuration(
6260 &self,
6261 req: S3Request<GetObjectLockConfigurationInput>,
6262 ) -> S3Result<S3Response<GetObjectLockConfigurationOutput>> {
6263 self.enforce_policy(
6264 &req,
6265 "s3:GetBucketObjectLockConfiguration",
6266 &req.input.bucket,
6267 None,
6268 )?;
6269 if let Some(mgr) = self.object_lock.as_ref() {
6270 let cfg = mgr
6271 .bucket_default(&req.input.bucket)
6272 .map(|d| ObjectLockConfiguration {
6273 object_lock_enabled: Some(ObjectLockEnabled::from_static(
6274 ObjectLockEnabled::ENABLED,
6275 )),
6276 rule: Some(ObjectLockRule {
6277 default_retention: Some(DefaultRetention {
6278 days: Some(d.retention_days as i32),
6279 mode: Some(ObjectLockRetentionMode::from_static(match d.mode {
6280 crate::object_lock::LockMode::Governance => {
6281 ObjectLockRetentionMode::GOVERNANCE
6282 }
6283 crate::object_lock::LockMode::Compliance => {
6284 ObjectLockRetentionMode::COMPLIANCE
6285 }
6286 })),
6287 years: None,
6288 }),
6289 }),
6290 });
6291 let output = GetObjectLockConfigurationOutput {
6292 object_lock_configuration: cfg,
6293 };
6294 return Ok(S3Response::new(output));
6295 }
6296 self.backend.get_object_lock_configuration(req).await
6297 }
6298 async fn put_object_lock_configuration(
6299 &self,
6300 req: S3Request<PutObjectLockConfigurationInput>,
6301 ) -> S3Result<S3Response<PutObjectLockConfigurationOutput>> {
6302 self.enforce_policy(
6303 &req,
6304 "s3:PutBucketObjectLockConfiguration",
6305 &req.input.bucket,
6306 None,
6307 )?;
6308 if let Some(mgr) = self.object_lock.as_ref() {
6309 let bucket = req.input.bucket.clone();
6310 if let Some(cfg) = req.input.object_lock_configuration.as_ref()
6311 && let Some(rule) = cfg.rule.as_ref()
6312 && let Some(d) = rule.default_retention.as_ref()
6313 {
6314 let mode = d
6315 .mode
6316 .as_ref()
6317 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()))
6318 .ok_or_else(|| {
6319 S3Error::with_message(
6320 S3ErrorCode::InvalidRequest,
6321 "Object Lock default retention requires a valid Mode (GOVERNANCE | COMPLIANCE)",
6322 )
6323 })?;
6324 // S3 spec: exactly one of Days / Years (we accept Days
6325 // outright and convert Years → Days for storage; Years
6326 // is just a UX shorthand on the wire).
6327 let days: u32 = match (d.days, d.years) {
6328 (Some(d), None) if d > 0 => d as u32,
6329 (None, Some(y)) if y > 0 => (y as u32).saturating_mul(365),
6330 _ => {
6331 return Err(S3Error::with_message(
6332 S3ErrorCode::InvalidRequest,
6333 "Object Lock default retention requires exactly one of Days or Years (positive integer)",
6334 ));
6335 }
6336 };
6337 mgr.set_bucket_default(
6338 &bucket,
6339 crate::object_lock::BucketObjectLockDefault {
6340 mode,
6341 retention_days: days,
6342 },
6343 );
6344 }
6345 return Ok(S3Response::new(PutObjectLockConfigurationOutput::default()));
6346 }
6347 self.backend.put_object_lock_configuration(req).await
6348 }
6349 async fn get_object_legal_hold(
6350 &self,
6351 req: S3Request<GetObjectLegalHoldInput>,
6352 ) -> S3Result<S3Response<GetObjectLegalHoldOutput>> {
6353 let key = req.input.key.clone();
6354 self.enforce_policy(&req, "s3:GetObjectLegalHold", &req.input.bucket, Some(&key))?;
6355 if let Some(mgr) = self.object_lock.as_ref() {
6356 let on = mgr
6357 .get(&req.input.bucket, &req.input.key)
6358 .map(|s| s.legal_hold_on)
6359 .unwrap_or(false);
6360 let status = ObjectLockLegalHoldStatus::from_static(if on {
6361 ObjectLockLegalHoldStatus::ON
6362 } else {
6363 ObjectLockLegalHoldStatus::OFF
6364 });
6365 let output = GetObjectLegalHoldOutput {
6366 legal_hold: Some(ObjectLockLegalHold {
6367 status: Some(status),
6368 }),
6369 };
6370 return Ok(S3Response::new(output));
6371 }
6372 self.backend.get_object_legal_hold(req).await
6373 }
6374 async fn put_object_legal_hold(
6375 &self,
6376 req: S3Request<PutObjectLegalHoldInput>,
6377 ) -> S3Result<S3Response<PutObjectLegalHoldOutput>> {
6378 let key = req.input.key.clone();
6379 self.enforce_policy(&req, "s3:PutObjectLegalHold", &req.input.bucket, Some(&key))?;
6380 if let Some(mgr) = self.object_lock.as_ref() {
6381 let on = req
6382 .input
6383 .legal_hold
6384 .as_ref()
6385 .and_then(|h| h.status.as_ref())
6386 .map(|s| s.as_str().eq_ignore_ascii_case("ON"))
6387 .unwrap_or(false);
6388 mgr.set_legal_hold(&req.input.bucket, &req.input.key, on);
6389 return Ok(S3Response::new(PutObjectLegalHoldOutput::default()));
6390 }
6391 self.backend.put_object_legal_hold(req).await
6392 }
6393 async fn get_object_retention(
6394 &self,
6395 req: S3Request<GetObjectRetentionInput>,
6396 ) -> S3Result<S3Response<GetObjectRetentionOutput>> {
6397 let key = req.input.key.clone();
6398 self.enforce_policy(&req, "s3:GetObjectRetention", &req.input.bucket, Some(&key))?;
6399 if let Some(mgr) = self.object_lock.as_ref() {
6400 let retention = mgr
6401 .get(&req.input.bucket, &req.input.key)
6402 .filter(|s| s.mode.is_some() || s.retain_until.is_some())
6403 .map(|s| {
6404 let mode = s.mode.map(|m| {
6405 ObjectLockRetentionMode::from_static(match m {
6406 crate::object_lock::LockMode::Governance => {
6407 ObjectLockRetentionMode::GOVERNANCE
6408 }
6409 crate::object_lock::LockMode::Compliance => {
6410 ObjectLockRetentionMode::COMPLIANCE
6411 }
6412 })
6413 });
6414 let until = s.retain_until.map(chrono_utc_to_timestamp);
6415 ObjectLockRetention {
6416 mode,
6417 retain_until_date: until,
6418 }
6419 });
6420 let output = GetObjectRetentionOutput { retention };
6421 return Ok(S3Response::new(output));
6422 }
6423 self.backend.get_object_retention(req).await
6424 }
6425 async fn put_object_retention(
6426 &self,
6427 req: S3Request<PutObjectRetentionInput>,
6428 ) -> S3Result<S3Response<PutObjectRetentionOutput>> {
6429 let key = req.input.key.clone();
6430 self.enforce_policy(&req, "s3:PutObjectRetention", &req.input.bucket, Some(&key))?;
6431 if let Some(mgr) = self.object_lock.as_ref() {
6432 let bucket = req.input.bucket.clone();
6433 let key = req.input.key.clone();
6434 // v0.8.12 HIGH-7 fix: the bypass header gates Governance
6435 // shortening only when the caller has the matching IAM
6436 // action explicitly allowed; otherwise it's silently
6437 // dropped to `false` and the "shortening Governance
6438 // requires bypass" branch below rejects.
6439 let bypass_header = req.input.bypass_governance_retention.unwrap_or(false);
6440 let bypass = if bypass_header {
6441 self.enforce_policy(&req, "s3:BypassGovernanceRetention", &bucket, Some(&key))
6442 .is_ok()
6443 } else {
6444 false
6445 };
6446 let retention = req.input.retention.as_ref().ok_or_else(|| {
6447 S3Error::with_message(
6448 S3ErrorCode::InvalidRequest,
6449 "PutObjectRetention requires a Retention element",
6450 )
6451 })?;
6452 let new_mode = retention
6453 .mode
6454 .as_ref()
6455 .and_then(|m| crate::object_lock::LockMode::from_aws_str(m.as_str()));
6456 let new_until = retention
6457 .retain_until_date
6458 .as_ref()
6459 .map(timestamp_to_chrono_utc)
6460 .unwrap_or(None);
6461 let now = chrono::Utc::now();
6462 let existing = mgr.get(&bucket, &key).unwrap_or_default();
6463 // S3 immutability rules:
6464 // - Compliance is one-way: once set, mode cannot move to
6465 // Governance, and retain-until cannot be shortened.
6466 // - Governance can be lengthened freely; shortened only
6467 // with bypass=true.
6468 if let Some(existing_mode) = existing.mode
6469 && existing_mode == crate::object_lock::LockMode::Compliance
6470 && existing.is_locked(now)
6471 {
6472 if matches!(new_mode, Some(crate::object_lock::LockMode::Governance)) {
6473 return Err(S3Error::with_message(
6474 S3ErrorCode::AccessDenied,
6475 "Cannot downgrade Compliance retention to Governance while lock is active",
6476 ));
6477 }
6478 if let (Some(prev), Some(next)) = (existing.retain_until, new_until)
6479 && next < prev
6480 {
6481 return Err(S3Error::with_message(
6482 S3ErrorCode::AccessDenied,
6483 "Cannot shorten Compliance retention while lock is active",
6484 ));
6485 }
6486 }
6487 if let Some(existing_mode) = existing.mode
6488 && existing_mode == crate::object_lock::LockMode::Governance
6489 && existing.is_locked(now)
6490 && !bypass
6491 && let (Some(prev), Some(next)) = (existing.retain_until, new_until)
6492 && next < prev
6493 {
6494 return Err(S3Error::with_message(
6495 S3ErrorCode::AccessDenied,
6496 "Shortening Governance retention requires x-amz-bypass-governance-retention: true",
6497 ));
6498 }
6499 let mut state = existing;
6500 if new_mode.is_some() {
6501 state.mode = new_mode;
6502 }
6503 if new_until.is_some() {
6504 state.retain_until = new_until;
6505 }
6506 mgr.set(&bucket, &key, state);
6507 return Ok(S3Response::new(PutObjectRetentionOutput::default()));
6508 }
6509 self.backend.put_object_retention(req).await
6510 }
6511
6512 // ---- Versioning ----
6513 // list_object_versions is implemented above in the compression-hook
6514 // section so it filters S4-internal sidecars (v0.4 #17) AND, when a
6515 // VersioningManager is attached (v0.5 #34), serves chains directly
6516 // from the in-memory index.
6517 async fn get_bucket_versioning(
6518 &self,
6519 req: S3Request<GetBucketVersioningInput>,
6520 ) -> S3Result<S3Response<GetBucketVersioningOutput>> {
6521 // v0.5 #34: when a VersioningManager is attached, the bucket's
6522 // versioning state lives in the manager (= S4-server's
6523 // authoritative source). Pass-through hits the backend only
6524 // when no manager is configured (legacy v0.4 behaviour).
6525 if let Some(mgr) = self.versioning.as_ref() {
6526 let output = match mgr.state(&req.input.bucket).as_aws_status() {
6527 Some(s) => GetBucketVersioningOutput {
6528 status: Some(BucketVersioningStatus::from(s.to_owned())),
6529 ..Default::default()
6530 },
6531 None => GetBucketVersioningOutput::default(),
6532 };
6533 return Ok(S3Response::new(output));
6534 }
6535 self.backend.get_bucket_versioning(req).await
6536 }
6537 async fn put_bucket_versioning(
6538 &self,
6539 req: S3Request<PutBucketVersioningInput>,
6540 ) -> S3Result<S3Response<PutBucketVersioningOutput>> {
6541 // v0.6 #42: MFA gating on the `PutBucketVersioning` request
6542 // itself. S3 spec: when the request body carries an
6543 // `MfaDelete` element (either `Enabled` or `Disabled`), the
6544 // request must include a valid `x-amz-mfa` token — both for
6545 // the *first* enable (so the operator can't quietly side-step
6546 // the gate by never enabling it) and for any subsequent
6547 // change (so a leaked credential alone can't disable MFA
6548 // Delete to bypass it on subsequent DELETEs). Requests that
6549 // omit the `MfaDelete` element entirely (i.e. they flip only
6550 // `Status`) skip this gate, matching AWS.
6551 if let Some(mgr) = self.mfa_delete.as_ref()
6552 && let Some(target_enabled) = req
6553 .input
6554 .versioning_configuration
6555 .mfa_delete
6556 .as_ref()
6557 .map(|m| m.as_str().eq_ignore_ascii_case("Enabled"))
6558 {
6559 let bucket = req.input.bucket.clone();
6560 let header = req.input.mfa.as_deref();
6561 let secret = mgr.lookup_secret(&bucket);
6562 let verified = match (header, secret.as_ref()) {
6563 (Some(h), Some(s)) => match crate::mfa::parse_mfa_header(h) {
6564 Ok((serial, code)) => {
6565 serial == s.serial
6566 && crate::mfa::verify_totp(&s.secret_base32, &code, current_unix_secs())
6567 }
6568 Err(_) => false,
6569 },
6570 _ => false,
6571 };
6572 if !verified {
6573 crate::metrics::record_mfa_delete_denial(&bucket);
6574 let err = if header.is_none() {
6575 crate::mfa::MfaError::Missing
6576 } else {
6577 crate::mfa::MfaError::InvalidCode
6578 };
6579 return Err(mfa_error_to_s3(err));
6580 }
6581 mgr.set_bucket_state(&bucket, target_enabled);
6582 }
6583 // v0.5 #34: stash the new state in the manager, then forward to
6584 // the backend so any downstream that *also* tracks state
6585 // (e.g. a real S3 backend) stays in sync. Manager-attached but
6586 // backend rejection is treated as a soft-fail (state is still
6587 // owned by the manager).
6588 if let Some(mgr) = self.versioning.as_ref() {
6589 let new_state = match req
6590 .input
6591 .versioning_configuration
6592 .status
6593 .as_ref()
6594 .map(|s| s.as_str())
6595 {
6596 Some(s) if s.eq_ignore_ascii_case("Enabled") => {
6597 crate::versioning::VersioningState::Enabled
6598 }
6599 Some(s) if s.eq_ignore_ascii_case("Suspended") => {
6600 crate::versioning::VersioningState::Suspended
6601 }
6602 _ => crate::versioning::VersioningState::Unversioned,
6603 };
6604 mgr.set_state(&req.input.bucket, new_state);
6605 return Ok(S3Response::new(PutBucketVersioningOutput::default()));
6606 }
6607 self.backend.put_bucket_versioning(req).await
6608 }
6609
6610 // ---- Bucket location ----
6611 async fn get_bucket_location(
6612 &self,
6613 req: S3Request<GetBucketLocationInput>,
6614 ) -> S3Result<S3Response<GetBucketLocationOutput>> {
6615 self.backend.get_bucket_location(req).await
6616 }
6617
6618 // ---- Bucket policy ----
6619 async fn get_bucket_policy(
6620 &self,
6621 req: S3Request<GetBucketPolicyInput>,
6622 ) -> S3Result<S3Response<GetBucketPolicyOutput>> {
6623 self.backend.get_bucket_policy(req).await
6624 }
6625 async fn put_bucket_policy(
6626 &self,
6627 req: S3Request<PutBucketPolicyInput>,
6628 ) -> S3Result<S3Response<PutBucketPolicyOutput>> {
6629 self.backend.put_bucket_policy(req).await
6630 }
6631 async fn delete_bucket_policy(
6632 &self,
6633 req: S3Request<DeleteBucketPolicyInput>,
6634 ) -> S3Result<S3Response<DeleteBucketPolicyOutput>> {
6635 self.backend.delete_bucket_policy(req).await
6636 }
6637 async fn get_bucket_policy_status(
6638 &self,
6639 req: S3Request<GetBucketPolicyStatusInput>,
6640 ) -> S3Result<S3Response<GetBucketPolicyStatusOutput>> {
6641 self.backend.get_bucket_policy_status(req).await
6642 }
6643
6644 // ---- Bucket ACL ----
6645 async fn get_bucket_acl(
6646 &self,
6647 req: S3Request<GetBucketAclInput>,
6648 ) -> S3Result<S3Response<GetBucketAclOutput>> {
6649 self.backend.get_bucket_acl(req).await
6650 }
6651 async fn put_bucket_acl(
6652 &self,
6653 req: S3Request<PutBucketAclInput>,
6654 ) -> S3Result<S3Response<PutBucketAclOutput>> {
6655 self.backend.put_bucket_acl(req).await
6656 }
6657
6658 // ---- Bucket CORS (v0.6 #38) ----
6659 async fn get_bucket_cors(
6660 &self,
6661 req: S3Request<GetBucketCorsInput>,
6662 ) -> S3Result<S3Response<GetBucketCorsOutput>> {
6663 if let Some(mgr) = self.cors.as_ref() {
6664 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6665 S3Error::with_message(
6666 S3ErrorCode::NoSuchCORSConfiguration,
6667 "The CORS configuration does not exist".to_string(),
6668 )
6669 })?;
6670 let rules: Vec<CORSRule> = cfg
6671 .rules
6672 .into_iter()
6673 .map(|r| CORSRule {
6674 allowed_headers: if r.allowed_headers.is_empty() {
6675 None
6676 } else {
6677 Some(r.allowed_headers)
6678 },
6679 allowed_methods: r.allowed_methods,
6680 allowed_origins: r.allowed_origins,
6681 expose_headers: if r.expose_headers.is_empty() {
6682 None
6683 } else {
6684 Some(r.expose_headers)
6685 },
6686 id: r.id,
6687 max_age_seconds: r.max_age_seconds.map(|s| s as i32),
6688 })
6689 .collect();
6690 return Ok(S3Response::new(GetBucketCorsOutput {
6691 cors_rules: Some(rules),
6692 }));
6693 }
6694 self.backend.get_bucket_cors(req).await
6695 }
6696 async fn put_bucket_cors(
6697 &self,
6698 req: S3Request<PutBucketCorsInput>,
6699 ) -> S3Result<S3Response<PutBucketCorsOutput>> {
6700 if let Some(mgr) = self.cors.as_ref() {
6701 let cfg = crate::cors::CorsConfig {
6702 rules: req
6703 .input
6704 .cors_configuration
6705 .cors_rules
6706 .into_iter()
6707 .map(|r| crate::cors::CorsRule {
6708 allowed_origins: r.allowed_origins,
6709 allowed_methods: r.allowed_methods,
6710 allowed_headers: r.allowed_headers.unwrap_or_default(),
6711 expose_headers: r.expose_headers.unwrap_or_default(),
6712 max_age_seconds: r
6713 .max_age_seconds
6714 .and_then(|s| if s < 0 { None } else { Some(s as u32) }),
6715 id: r.id,
6716 })
6717 .collect(),
6718 };
6719 // v0.8.15 M-3: AWS S3 rejects `AllowedMethods` outside
6720 // the canonical {GET,PUT,POST,DELETE,HEAD} set (including
6721 // the `*` wildcard). Validate at PutBucketCors time so
6722 // operators see the misconfiguration in the API response
6723 // instead of having silently-broken preflights at the
6724 // browser later.
6725 if let Err(e) = crate::cors::CorsManager::validate(&cfg) {
6726 return Err(S3Error::with_message(
6727 S3ErrorCode::InvalidArgument,
6728 e.to_string(),
6729 ));
6730 }
6731 mgr.put(&req.input.bucket, cfg);
6732 return Ok(S3Response::new(PutBucketCorsOutput::default()));
6733 }
6734 self.backend.put_bucket_cors(req).await
6735 }
6736 async fn delete_bucket_cors(
6737 &self,
6738 req: S3Request<DeleteBucketCorsInput>,
6739 ) -> S3Result<S3Response<DeleteBucketCorsOutput>> {
6740 if let Some(mgr) = self.cors.as_ref() {
6741 mgr.delete(&req.input.bucket);
6742 return Ok(S3Response::new(DeleteBucketCorsOutput::default()));
6743 }
6744 self.backend.delete_bucket_cors(req).await
6745 }
6746
6747 // ---- Bucket lifecycle (v0.6 #37) ----
6748 async fn get_bucket_lifecycle_configuration(
6749 &self,
6750 req: S3Request<GetBucketLifecycleConfigurationInput>,
6751 ) -> S3Result<S3Response<GetBucketLifecycleConfigurationOutput>> {
6752 if let Some(mgr) = self.lifecycle.as_ref() {
6753 let cfg = mgr.get(&req.input.bucket).ok_or_else(|| {
6754 S3Error::with_message(
6755 S3ErrorCode::NoSuchLifecycleConfiguration,
6756 "The lifecycle configuration does not exist".to_string(),
6757 )
6758 })?;
6759 let rules: Vec<LifecycleRule> = cfg.rules.iter().map(internal_rule_to_dto).collect();
6760 return Ok(S3Response::new(GetBucketLifecycleConfigurationOutput {
6761 rules: Some(rules),
6762 transition_default_minimum_object_size: None,
6763 }));
6764 }
6765 self.backend.get_bucket_lifecycle_configuration(req).await
6766 }
6767 async fn put_bucket_lifecycle_configuration(
6768 &self,
6769 req: S3Request<PutBucketLifecycleConfigurationInput>,
6770 ) -> S3Result<S3Response<PutBucketLifecycleConfigurationOutput>> {
6771 if let Some(mgr) = self.lifecycle.as_ref() {
6772 let bucket = req.input.bucket.clone();
6773 let dto_cfg = req.input.lifecycle_configuration.unwrap_or_default();
6774 let cfg = dto_lifecycle_to_internal(&dto_cfg);
6775 mgr.put(&bucket, cfg);
6776 return Ok(S3Response::new(
6777 PutBucketLifecycleConfigurationOutput::default(),
6778 ));
6779 }
6780 self.backend.put_bucket_lifecycle_configuration(req).await
6781 }
6782 async fn delete_bucket_lifecycle(
6783 &self,
6784 req: S3Request<DeleteBucketLifecycleInput>,
6785 ) -> S3Result<S3Response<DeleteBucketLifecycleOutput>> {
6786 if let Some(mgr) = self.lifecycle.as_ref() {
6787 mgr.delete(&req.input.bucket);
6788 return Ok(S3Response::new(DeleteBucketLifecycleOutput::default()));
6789 }
6790 self.backend.delete_bucket_lifecycle(req).await
6791 }
6792
6793 // ---- Bucket tagging (v0.6 #39) ----
6794 async fn get_bucket_tagging(
6795 &self,
6796 req: S3Request<GetBucketTaggingInput>,
6797 ) -> S3Result<S3Response<GetBucketTaggingOutput>> {
6798 let Some(mgr) = self.tagging.as_ref() else {
6799 return self.backend.get_bucket_tagging(req).await;
6800 };
6801 let tags = mgr.get_bucket_tags(&req.input.bucket).unwrap_or_default();
6802 Ok(S3Response::new(GetBucketTaggingOutput {
6803 tag_set: tagset_to_aws(&tags),
6804 }))
6805 }
6806 async fn put_bucket_tagging(
6807 &self,
6808 req: S3Request<PutBucketTaggingInput>,
6809 ) -> S3Result<S3Response<PutBucketTaggingOutput>> {
6810 let Some(mgr) = self.tagging.as_ref() else {
6811 return self.backend.put_bucket_tagging(req).await;
6812 };
6813 let bucket = req.input.bucket.clone();
6814 let parsed = aws_to_tagset(&req.input.tagging.tag_set)
6815 .map_err(|e| S3Error::with_message(S3ErrorCode::InvalidArgument, e.to_string()))?;
6816 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6817 mgr.put_bucket_tags(&bucket, parsed);
6818 Ok(S3Response::new(PutBucketTaggingOutput::default()))
6819 }
6820 async fn delete_bucket_tagging(
6821 &self,
6822 req: S3Request<DeleteBucketTaggingInput>,
6823 ) -> S3Result<S3Response<DeleteBucketTaggingOutput>> {
6824 let Some(mgr) = self.tagging.as_ref() else {
6825 return self.backend.delete_bucket_tagging(req).await;
6826 };
6827 let bucket = req.input.bucket.clone();
6828 self.enforce_policy(&req, "s3:PutBucketTagging", &bucket, None)?;
6829 mgr.delete_bucket_tags(&bucket);
6830 Ok(S3Response::new(DeleteBucketTaggingOutput::default()))
6831 }
6832
6833 // ---- Bucket encryption ----
6834 async fn get_bucket_encryption(
6835 &self,
6836 req: S3Request<GetBucketEncryptionInput>,
6837 ) -> S3Result<S3Response<GetBucketEncryptionOutput>> {
6838 self.backend.get_bucket_encryption(req).await
6839 }
6840 async fn put_bucket_encryption(
6841 &self,
6842 req: S3Request<PutBucketEncryptionInput>,
6843 ) -> S3Result<S3Response<PutBucketEncryptionOutput>> {
6844 self.backend.put_bucket_encryption(req).await
6845 }
6846 async fn delete_bucket_encryption(
6847 &self,
6848 req: S3Request<DeleteBucketEncryptionInput>,
6849 ) -> S3Result<S3Response<DeleteBucketEncryptionOutput>> {
6850 self.backend.delete_bucket_encryption(req).await
6851 }
6852
6853 // ---- Bucket logging ----
6854 async fn get_bucket_logging(
6855 &self,
6856 req: S3Request<GetBucketLoggingInput>,
6857 ) -> S3Result<S3Response<GetBucketLoggingOutput>> {
6858 self.backend.get_bucket_logging(req).await
6859 }
6860 async fn put_bucket_logging(
6861 &self,
6862 req: S3Request<PutBucketLoggingInput>,
6863 ) -> S3Result<S3Response<PutBucketLoggingOutput>> {
6864 self.backend.put_bucket_logging(req).await
6865 }
6866
6867 // ---- Bucket notification (v0.6 #35) ----
6868 //
6869 // When a `NotificationManager` is attached, S4 itself owns per-bucket
6870 // notification configurations and the PUT / GET handlers route through
6871 // the manager. The wire DTO's queue / topic configurations map onto
6872 // S4's `Destination::Sqs` / `Destination::Sns`; LambdaFunction and
6873 // EventBridge configurations are accepted on PUT but silently dropped
6874 // (out of scope for v0.6 #35). When no manager is attached the legacy
6875 // backend-passthrough behaviour applies.
6876 async fn get_bucket_notification_configuration(
6877 &self,
6878 req: S3Request<GetBucketNotificationConfigurationInput>,
6879 ) -> S3Result<S3Response<GetBucketNotificationConfigurationOutput>> {
6880 if let Some(mgr) = self.notifications.as_ref() {
6881 let cfg = mgr.get(&req.input.bucket).unwrap_or_default();
6882 let dto = notif_to_dto(&cfg);
6883 return Ok(S3Response::new(GetBucketNotificationConfigurationOutput {
6884 event_bridge_configuration: dto.event_bridge_configuration,
6885 lambda_function_configurations: dto.lambda_function_configurations,
6886 queue_configurations: dto.queue_configurations,
6887 topic_configurations: dto.topic_configurations,
6888 }));
6889 }
6890 self.backend
6891 .get_bucket_notification_configuration(req)
6892 .await
6893 }
6894 async fn put_bucket_notification_configuration(
6895 &self,
6896 req: S3Request<PutBucketNotificationConfigurationInput>,
6897 ) -> S3Result<S3Response<PutBucketNotificationConfigurationOutput>> {
6898 if let Some(mgr) = self.notifications.as_ref() {
6899 let cfg = notif_from_dto(&req.input.notification_configuration);
6900 mgr.put(&req.input.bucket, cfg);
6901 return Ok(S3Response::new(
6902 PutBucketNotificationConfigurationOutput::default(),
6903 ));
6904 }
6905 self.backend
6906 .put_bucket_notification_configuration(req)
6907 .await
6908 }
6909
6910 // ---- Bucket request payment ----
6911 async fn get_bucket_request_payment(
6912 &self,
6913 req: S3Request<GetBucketRequestPaymentInput>,
6914 ) -> S3Result<S3Response<GetBucketRequestPaymentOutput>> {
6915 self.backend.get_bucket_request_payment(req).await
6916 }
6917 async fn put_bucket_request_payment(
6918 &self,
6919 req: S3Request<PutBucketRequestPaymentInput>,
6920 ) -> S3Result<S3Response<PutBucketRequestPaymentOutput>> {
6921 self.backend.put_bucket_request_payment(req).await
6922 }
6923
6924 // ---- Bucket website ----
6925 async fn get_bucket_website(
6926 &self,
6927 req: S3Request<GetBucketWebsiteInput>,
6928 ) -> S3Result<S3Response<GetBucketWebsiteOutput>> {
6929 self.backend.get_bucket_website(req).await
6930 }
6931 async fn put_bucket_website(
6932 &self,
6933 req: S3Request<PutBucketWebsiteInput>,
6934 ) -> S3Result<S3Response<PutBucketWebsiteOutput>> {
6935 self.backend.put_bucket_website(req).await
6936 }
6937 async fn delete_bucket_website(
6938 &self,
6939 req: S3Request<DeleteBucketWebsiteInput>,
6940 ) -> S3Result<S3Response<DeleteBucketWebsiteOutput>> {
6941 self.backend.delete_bucket_website(req).await
6942 }
6943
6944 // ---- Bucket replication (v0.6 #40) ----
6945 async fn get_bucket_replication(
6946 &self,
6947 req: S3Request<GetBucketReplicationInput>,
6948 ) -> S3Result<S3Response<GetBucketReplicationOutput>> {
6949 if let Some(mgr) = self.replication.as_ref() {
6950 return match mgr.get(&req.input.bucket) {
6951 Some(cfg) => Ok(S3Response::new(GetBucketReplicationOutput {
6952 replication_configuration: Some(replication_to_dto(&cfg)),
6953 })),
6954 None => Err(S3Error::with_message(
6955 S3ErrorCode::Custom("ReplicationConfigurationNotFoundError".into()),
6956 format!(
6957 "no replication configuration on bucket {}",
6958 req.input.bucket
6959 ),
6960 )),
6961 };
6962 }
6963 self.backend.get_bucket_replication(req).await
6964 }
6965 async fn put_bucket_replication(
6966 &self,
6967 req: S3Request<PutBucketReplicationInput>,
6968 ) -> S3Result<S3Response<PutBucketReplicationOutput>> {
6969 if let Some(mgr) = self.replication.as_ref() {
6970 let cfg = replication_from_dto(&req.input.replication_configuration);
6971 mgr.put(&req.input.bucket, cfg);
6972 return Ok(S3Response::new(PutBucketReplicationOutput::default()));
6973 }
6974 self.backend.put_bucket_replication(req).await
6975 }
6976 async fn delete_bucket_replication(
6977 &self,
6978 req: S3Request<DeleteBucketReplicationInput>,
6979 ) -> S3Result<S3Response<DeleteBucketReplicationOutput>> {
6980 if let Some(mgr) = self.replication.as_ref() {
6981 mgr.delete(&req.input.bucket);
6982 return Ok(S3Response::new(DeleteBucketReplicationOutput::default()));
6983 }
6984 self.backend.delete_bucket_replication(req).await
6985 }
6986
6987 // ---- Bucket accelerate ----
6988 async fn get_bucket_accelerate_configuration(
6989 &self,
6990 req: S3Request<GetBucketAccelerateConfigurationInput>,
6991 ) -> S3Result<S3Response<GetBucketAccelerateConfigurationOutput>> {
6992 self.backend.get_bucket_accelerate_configuration(req).await
6993 }
6994 async fn put_bucket_accelerate_configuration(
6995 &self,
6996 req: S3Request<PutBucketAccelerateConfigurationInput>,
6997 ) -> S3Result<S3Response<PutBucketAccelerateConfigurationOutput>> {
6998 self.backend.put_bucket_accelerate_configuration(req).await
6999 }
7000
7001 // ---- Bucket ownership controls ----
7002 async fn get_bucket_ownership_controls(
7003 &self,
7004 req: S3Request<GetBucketOwnershipControlsInput>,
7005 ) -> S3Result<S3Response<GetBucketOwnershipControlsOutput>> {
7006 self.backend.get_bucket_ownership_controls(req).await
7007 }
7008 async fn put_bucket_ownership_controls(
7009 &self,
7010 req: S3Request<PutBucketOwnershipControlsInput>,
7011 ) -> S3Result<S3Response<PutBucketOwnershipControlsOutput>> {
7012 self.backend.put_bucket_ownership_controls(req).await
7013 }
7014 async fn delete_bucket_ownership_controls(
7015 &self,
7016 req: S3Request<DeleteBucketOwnershipControlsInput>,
7017 ) -> S3Result<S3Response<DeleteBucketOwnershipControlsOutput>> {
7018 self.backend.delete_bucket_ownership_controls(req).await
7019 }
7020
7021 // ---- Public access block ----
7022 async fn get_public_access_block(
7023 &self,
7024 req: S3Request<GetPublicAccessBlockInput>,
7025 ) -> S3Result<S3Response<GetPublicAccessBlockOutput>> {
7026 self.backend.get_public_access_block(req).await
7027 }
7028 async fn put_public_access_block(
7029 &self,
7030 req: S3Request<PutPublicAccessBlockInput>,
7031 ) -> S3Result<S3Response<PutPublicAccessBlockOutput>> {
7032 self.backend.put_public_access_block(req).await
7033 }
7034 async fn delete_public_access_block(
7035 &self,
7036 req: S3Request<DeletePublicAccessBlockInput>,
7037 ) -> S3Result<S3Response<DeletePublicAccessBlockOutput>> {
7038 self.backend.delete_public_access_block(req).await
7039 }
7040
7041 // ====================================================================
7042 // v0.6 #41: S3 Select — server-side SQL filter on object body.
7043 //
7044 // Fetch the object via the regular `get_object` path (so SSE-C /
7045 // SSE-S4 / SSE-KMS / S4 codec all decompress + decrypt transparently),
7046 // run a small SQL subset (CSV + JSON Lines, equality / inequality /
7047 // LIKE / AND / OR / NOT) over the in-memory body, and stream the
7048 // matched rows back as AWS event-stream `Records` + `Stats` + `End`
7049 // frames.
7050 //
7051 // Limitations (deliberate, documented):
7052 // - Parquet input is rejected with NotImplemented.
7053 // - Aggregates / GROUP BY / JOIN / ORDER BY / LIMIT are rejected at
7054 // parse time as InvalidRequest (s3s 0.13 doesn't expose AWS's
7055 // domain-specific `InvalidSqlExpression` code).
7056 // - The body is fully buffered before SQL evaluation (S3 Select
7057 // streaming-during-evaluation is v0.7 scope).
7058 // - GPU-accelerated WHERE evaluation is stubbed out (always None).
7059 async fn select_object_content(
7060 &self,
7061 req: S3Request<SelectObjectContentInput>,
7062 ) -> S3Result<S3Response<SelectObjectContentOutput>> {
7063 use crate::select::{
7064 EventStreamWriter, SelectInputFormat, SelectOutputFormat, run_select_csv,
7065 run_select_jsonlines,
7066 };
7067
7068 let select_bucket = req.input.bucket.clone();
7069 let select_key = req.input.key.clone();
7070 self.enforce_rate_limit(&req, &select_bucket)?;
7071 self.enforce_policy(&req, "s3:GetObject", &select_bucket, Some(&select_key))?;
7072
7073 let request = req.input.request;
7074 let sql = request.expression.clone();
7075 if request.expression_type.as_str() != "SQL" {
7076 return Err(S3Error::with_message(
7077 S3ErrorCode::InvalidExpressionType,
7078 format!(
7079 "ExpressionType must be SQL, got: {}",
7080 request.expression_type.as_str()
7081 ),
7082 ));
7083 }
7084
7085 let input_format = if let Some(_json) = request.input_serialization.json.as_ref() {
7086 SelectInputFormat::JsonLines
7087 } else if let Some(csv) = request.input_serialization.csv.as_ref() {
7088 let has_header = csv
7089 .file_header_info
7090 .as_ref()
7091 .map(|h| {
7092 let s = h.as_str();
7093 s.eq_ignore_ascii_case("USE") || s.eq_ignore_ascii_case("IGNORE")
7094 })
7095 .unwrap_or(false);
7096 let delim = csv
7097 .field_delimiter
7098 .as_deref()
7099 .and_then(|s| s.chars().next())
7100 .unwrap_or(',');
7101 SelectInputFormat::Csv {
7102 has_header,
7103 delimiter: delim,
7104 }
7105 } else if request.input_serialization.parquet.is_some() {
7106 return Err(S3Error::with_message(
7107 S3ErrorCode::NotImplemented,
7108 "Parquet input is not supported by this S3 Select implementation (v0.6: CSV / JSON Lines only)",
7109 ));
7110 } else {
7111 return Err(S3Error::with_message(
7112 S3ErrorCode::InvalidRequest,
7113 "InputSerialization requires exactly one of CSV / JSON / Parquet",
7114 ));
7115 };
7116 if let Some(ct) = request.input_serialization.compression_type.as_ref()
7117 && !ct.as_str().eq_ignore_ascii_case("NONE")
7118 {
7119 return Err(S3Error::with_message(
7120 S3ErrorCode::NotImplemented,
7121 format!(
7122 "InputSerialization CompressionType={} is not supported (v0.6: NONE only)",
7123 ct.as_str()
7124 ),
7125 ));
7126 }
7127
7128 let output_format = if request.output_serialization.json.is_some() {
7129 SelectOutputFormat::Json
7130 } else if request.output_serialization.csv.is_some() {
7131 SelectOutputFormat::Csv
7132 } else {
7133 return Err(S3Error::with_message(
7134 S3ErrorCode::InvalidRequest,
7135 "OutputSerialization requires exactly one of CSV / JSON",
7136 ));
7137 };
7138
7139 let get_input = GetObjectInput {
7140 bucket: select_bucket.clone(),
7141 key: select_key.clone(),
7142 sse_customer_algorithm: req.input.sse_customer_algorithm.clone(),
7143 sse_customer_key: req.input.sse_customer_key.clone(),
7144 sse_customer_key_md5: req.input.sse_customer_key_md5.clone(),
7145 ..Default::default()
7146 };
7147 let get_req = S3Request {
7148 input: get_input,
7149 method: http::Method::GET,
7150 uri: format!("/{}/{}", select_bucket, select_key)
7151 .parse()
7152 .map_err(|e| {
7153 S3Error::with_message(
7154 S3ErrorCode::InternalError,
7155 format!("constructing inner GET URI: {e}"),
7156 )
7157 })?,
7158 headers: http::HeaderMap::new(),
7159 extensions: http::Extensions::new(),
7160 credentials: req.credentials.clone(),
7161 region: req.region.clone(),
7162 service: req.service.clone(),
7163 trailing_headers: None,
7164 };
7165 let mut get_resp = self.get_object(get_req).await?;
7166 let blob = get_resp.output.body.take().ok_or_else(|| {
7167 S3Error::with_message(
7168 S3ErrorCode::InternalError,
7169 "Select: object body was empty after GET",
7170 )
7171 })?;
7172 let body_bytes = crate::blob::collect_blob(blob, self.max_body_bytes)
7173 .await
7174 .map_err(internal("collect Select body"))?;
7175 let scanned = body_bytes.len() as u64;
7176
7177 let matched_payload = match input_format {
7178 SelectInputFormat::JsonLines => run_select_jsonlines(&sql, &body_bytes, output_format)
7179 .map_err(|e| select_error_to_s3(e, "JSON Lines"))?,
7180 SelectInputFormat::Csv { .. } => {
7181 run_select_csv(&sql, &body_bytes, input_format, output_format)
7182 .map_err(|e| select_error_to_s3(e, "CSV"))?
7183 }
7184 };
7185
7186 let returned = matched_payload.len() as u64;
7187 let processed = scanned;
7188 let mut events: Vec<S3Result<SelectObjectContentEvent>> = Vec::with_capacity(3);
7189 if !matched_payload.is_empty() {
7190 events.push(Ok(SelectObjectContentEvent::Records(RecordsEvent {
7191 payload: Some(bytes::Bytes::from(matched_payload)),
7192 })));
7193 }
7194 events.push(Ok(SelectObjectContentEvent::Stats(StatsEvent {
7195 details: Some(Stats {
7196 bytes_scanned: Some(scanned as i64),
7197 bytes_processed: Some(processed as i64),
7198 bytes_returned: Some(returned as i64),
7199 }),
7200 })));
7201 events.push(Ok(SelectObjectContentEvent::End(EndEvent {})));
7202 // Touch EventStreamWriter so the public API stays linked into the
7203 // build (the actual wire framing is delegated to s3s).
7204 let _writer = EventStreamWriter::new();
7205
7206 let stream = SelectObjectContentEventStream::new(futures::stream::iter(events));
7207 let output = SelectObjectContentOutput {
7208 payload: Some(stream),
7209 };
7210 Ok(S3Response::new(output))
7211 }
7212
7213 // ---- Bucket Inventory configuration (v0.6 #36) ----
7214 //
7215 // When an `InventoryManager` is attached, S4-server owns the
7216 // configuration store and these handlers no longer pass through to
7217 // the backend. The mapping between the s3s-typed
7218 // `InventoryConfiguration` and the inventory module's internal
7219 // `InventoryConfig` is intentionally lossy: only the fields S4
7220 // actually uses for periodic CSV emission survive the round trip
7221 // (id, source bucket, destination bucket / prefix, format, included
7222 // versions, schedule frequency). Optional fields, encryption, and
7223 // filter prefixes are accepted on PUT and re-surfaced on GET via
7224 // a best-effort default-shape `InventoryConfiguration` so the
7225 // client sees a roundtrip-clean response.
7226 async fn put_bucket_inventory_configuration(
7227 &self,
7228 req: S3Request<PutBucketInventoryConfigurationInput>,
7229 ) -> S3Result<S3Response<PutBucketInventoryConfigurationOutput>> {
7230 if let Some(mgr) = self.inventory.as_ref() {
7231 let cfg = inv_from_dto(
7232 &req.input.bucket,
7233 &req.input.id,
7234 &req.input.inventory_configuration,
7235 );
7236 mgr.put(cfg);
7237 return Ok(S3Response::new(
7238 PutBucketInventoryConfigurationOutput::default(),
7239 ));
7240 }
7241 self.backend.put_bucket_inventory_configuration(req).await
7242 }
7243
7244 async fn get_bucket_inventory_configuration(
7245 &self,
7246 req: S3Request<GetBucketInventoryConfigurationInput>,
7247 ) -> S3Result<S3Response<GetBucketInventoryConfigurationOutput>> {
7248 if let Some(mgr) = self.inventory.as_ref() {
7249 let cfg = mgr.get(&req.input.bucket, &req.input.id);
7250 if let Some(cfg) = cfg {
7251 let out = GetBucketInventoryConfigurationOutput {
7252 inventory_configuration: Some(inv_to_dto(&cfg)),
7253 };
7254 return Ok(S3Response::new(out));
7255 }
7256 // AWS returns `NoSuchConfiguration` (404) when the id has no
7257 // matching inventory configuration on the bucket. The
7258 // generated `S3ErrorCode` enum doesn't expose a typed variant
7259 // for this code, so we round-trip through `from_bytes` which
7260 // wraps unknown codes as `Custom(...)` (= the AWS-canonical
7261 // error-code string survives into the XML response envelope).
7262 let code =
7263 S3ErrorCode::from_bytes(b"NoSuchConfiguration").unwrap_or(S3ErrorCode::NoSuchKey);
7264 return Err(S3Error::with_message(
7265 code,
7266 format!(
7267 "no inventory configuration with id={} on bucket={}",
7268 req.input.id, req.input.bucket
7269 ),
7270 ));
7271 }
7272 self.backend.get_bucket_inventory_configuration(req).await
7273 }
7274
7275 async fn list_bucket_inventory_configurations(
7276 &self,
7277 req: S3Request<ListBucketInventoryConfigurationsInput>,
7278 ) -> S3Result<S3Response<ListBucketInventoryConfigurationsOutput>> {
7279 if let Some(mgr) = self.inventory.as_ref() {
7280 let list = mgr.list_for_bucket(&req.input.bucket);
7281 let dto_list: Vec<InventoryConfiguration> = list.iter().map(inv_to_dto).collect();
7282 let out = ListBucketInventoryConfigurationsOutput {
7283 continuation_token: req.input.continuation_token.clone(),
7284 inventory_configuration_list: if dto_list.is_empty() {
7285 None
7286 } else {
7287 Some(dto_list)
7288 },
7289 is_truncated: Some(false),
7290 next_continuation_token: None,
7291 };
7292 return Ok(S3Response::new(out));
7293 }
7294 self.backend.list_bucket_inventory_configurations(req).await
7295 }
7296
7297 async fn delete_bucket_inventory_configuration(
7298 &self,
7299 req: S3Request<DeleteBucketInventoryConfigurationInput>,
7300 ) -> S3Result<S3Response<DeleteBucketInventoryConfigurationOutput>> {
7301 if let Some(mgr) = self.inventory.as_ref() {
7302 mgr.delete(&req.input.bucket, &req.input.id);
7303 return Ok(S3Response::new(
7304 DeleteBucketInventoryConfigurationOutput::default(),
7305 ));
7306 }
7307 self.backend
7308 .delete_bucket_inventory_configuration(req)
7309 .await
7310 }
7311}
7312
7313// ---------------------------------------------------------------------------
7314// v0.6 #36: Convert between the s3s-typed `InventoryConfiguration` (the wire
7315// surface) and our internal `crate::inventory::InventoryConfig`. Only the
7316// fields S4 actually uses for CSV emission survive the round trip; the
7317// missing fields (filter prefix, optional fields, encryption) are dropped on
7318// PUT and re-rendered as the AWS-default shape on GET so the client sees a
7319// well-formed `InventoryConfiguration`.
7320// ---------------------------------------------------------------------------
7321
7322fn inv_from_dto(
7323 bucket: &str,
7324 id: &str,
7325 dto: &InventoryConfiguration,
7326) -> crate::inventory::InventoryConfig {
7327 let frequency_hours = match dto.schedule.frequency.as_str() {
7328 "Weekly" => 24 * 7,
7329 // Daily is the default; anything S4 doesn't recognise (incl.
7330 // empty, which is the s3s-default) maps to Daily so the
7331 // operator's PUT doesn't silently turn into a no-op cadence.
7332 _ => 24,
7333 };
7334 // Parquet/ORC are not supported (issue #36 scope); we still accept
7335 // the PUT so callers don't fail-loud, but we record CSV and rely on
7336 // the operator catching the discrepancy on GET.
7337 let format = crate::inventory::InventoryFormat::Csv;
7338 crate::inventory::InventoryConfig {
7339 id: id.to_owned(),
7340 bucket: bucket.to_owned(),
7341 destination_bucket: dto.destination.s3_bucket_destination.bucket.clone(),
7342 destination_prefix: dto
7343 .destination
7344 .s3_bucket_destination
7345 .prefix
7346 .clone()
7347 .unwrap_or_default(),
7348 frequency_hours,
7349 format,
7350 included_object_versions: crate::inventory::IncludedVersions::from_aws_str(
7351 dto.included_object_versions.as_str(),
7352 ),
7353 }
7354}
7355
7356fn inv_to_dto(cfg: &crate::inventory::InventoryConfig) -> InventoryConfiguration {
7357 InventoryConfiguration {
7358 id: cfg.id.clone(),
7359 is_enabled: true,
7360 included_object_versions: InventoryIncludedObjectVersions::from(
7361 cfg.included_object_versions.as_aws_str().to_owned(),
7362 ),
7363 destination: InventoryDestination {
7364 s3_bucket_destination: InventoryS3BucketDestination {
7365 account_id: None,
7366 bucket: cfg.destination_bucket.clone(),
7367 encryption: None,
7368 format: InventoryFormat::from(cfg.format.as_aws_str().to_owned()),
7369 prefix: if cfg.destination_prefix.is_empty() {
7370 None
7371 } else {
7372 Some(cfg.destination_prefix.clone())
7373 },
7374 },
7375 },
7376 schedule: InventorySchedule {
7377 // `frequency_hours == 168` -> Weekly; everything else maps to
7378 // Daily for the wire response (the manager keeps the precise
7379 // hour count internally for due-checking).
7380 frequency: InventoryFrequency::from(
7381 if cfg.frequency_hours == 24 * 7 {
7382 "Weekly"
7383 } else {
7384 "Daily"
7385 }
7386 .to_owned(),
7387 ),
7388 },
7389 filter: None,
7390 optional_fields: None,
7391 }
7392}
7393
7394// ---------------------------------------------------------------------------
7395// v0.6 #35: Convert between the s3s-typed `NotificationConfiguration` (the
7396// wire surface) and our internal `crate::notifications::NotificationConfig`.
7397//
7398// We support TopicConfiguration (-> Destination::Sns) and QueueConfiguration
7399// (-> Destination::Sqs). LambdaFunction and EventBridge configurations are
7400// silently dropped on PUT (out of scope for v0.6 #35); the GET response only
7401// surfaces topic / queue rules.
7402//
7403// The webhook destination has no AWS-native wire form: operators configure
7404// webhooks via the JSON snapshot file (`--notifications-state-file`) or by
7405// poking `NotificationManager::put` directly from a custom binary. This
7406// keeps the wire surface AWS-compatible while still letting the always-
7407// available `Webhook` destination be reachable.
7408// ---------------------------------------------------------------------------
7409
7410fn notif_from_dto(dto: &NotificationConfiguration) -> crate::notifications::NotificationConfig {
7411 let mut rules: Vec<crate::notifications::NotificationRule> = Vec::new();
7412 if let Some(topics) = dto.topic_configurations.as_ref() {
7413 for (idx, t) in topics.iter().enumerate() {
7414 let events = events_from_dto(&t.events);
7415 let (prefix, suffix) = filter_from_dto(t.filter.as_ref());
7416 rules.push(crate::notifications::NotificationRule {
7417 id: t.id.clone().unwrap_or_else(|| format!("topic-{idx}")),
7418 events,
7419 destination: crate::notifications::Destination::Sns {
7420 topic_arn: t.topic_arn.clone(),
7421 },
7422 filter_prefix: prefix,
7423 filter_suffix: suffix,
7424 });
7425 }
7426 }
7427 if let Some(queues) = dto.queue_configurations.as_ref() {
7428 for (idx, q) in queues.iter().enumerate() {
7429 let events = events_from_dto(&q.events);
7430 let (prefix, suffix) = filter_from_dto(q.filter.as_ref());
7431 rules.push(crate::notifications::NotificationRule {
7432 id: q.id.clone().unwrap_or_else(|| format!("queue-{idx}")),
7433 events,
7434 destination: crate::notifications::Destination::Sqs {
7435 queue_arn: q.queue_arn.clone(),
7436 },
7437 filter_prefix: prefix,
7438 filter_suffix: suffix,
7439 });
7440 }
7441 }
7442 crate::notifications::NotificationConfig { rules }
7443}
7444
7445fn notif_to_dto(cfg: &crate::notifications::NotificationConfig) -> NotificationConfiguration {
7446 let mut topics: Vec<TopicConfiguration> = Vec::new();
7447 let mut queues: Vec<QueueConfiguration> = Vec::new();
7448 for rule in &cfg.rules {
7449 let events: Vec<Event> = rule
7450 .events
7451 .iter()
7452 .map(|e| Event::from(e.as_aws_str().to_owned()))
7453 .collect();
7454 let filter = filter_to_dto(rule.filter_prefix.as_deref(), rule.filter_suffix.as_deref());
7455 match &rule.destination {
7456 crate::notifications::Destination::Sns { topic_arn } => {
7457 topics.push(TopicConfiguration {
7458 events,
7459 filter,
7460 id: Some(rule.id.clone()),
7461 topic_arn: topic_arn.clone(),
7462 });
7463 }
7464 crate::notifications::Destination::Sqs { queue_arn } => {
7465 queues.push(QueueConfiguration {
7466 events,
7467 filter,
7468 id: Some(rule.id.clone()),
7469 queue_arn: queue_arn.clone(),
7470 });
7471 }
7472 // Webhook destinations have no AWS wire equivalent — they
7473 // round-trip through the JSON snapshot only. Skip them on the
7474 // GET surface (an SDK consumer wouldn't know what to do with
7475 // them anyway).
7476 crate::notifications::Destination::Webhook { .. } => {}
7477 }
7478 }
7479 NotificationConfiguration {
7480 event_bridge_configuration: None,
7481 lambda_function_configurations: None,
7482 queue_configurations: if queues.is_empty() {
7483 None
7484 } else {
7485 Some(queues)
7486 },
7487 topic_configurations: if topics.is_empty() {
7488 None
7489 } else {
7490 Some(topics)
7491 },
7492 }
7493}
7494
7495fn events_from_dto(events: &[Event]) -> Vec<crate::notifications::EventType> {
7496 events
7497 .iter()
7498 .filter_map(|e| crate::notifications::EventType::from_aws_str(e.as_ref()))
7499 .collect()
7500}
7501
7502fn filter_from_dto(
7503 f: Option<&NotificationConfigurationFilter>,
7504) -> (Option<String>, Option<String>) {
7505 let Some(f) = f else {
7506 return (None, None);
7507 };
7508 let Some(key) = f.key.as_ref() else {
7509 return (None, None);
7510 };
7511 let Some(rules) = key.filter_rules.as_ref() else {
7512 return (None, None);
7513 };
7514 let mut prefix = None;
7515 let mut suffix = None;
7516 for r in rules {
7517 let name = r.name.as_ref().map(|n| n.as_str().to_ascii_lowercase());
7518 let value = r.value.clone();
7519 match name.as_deref() {
7520 Some("prefix") => prefix = value,
7521 Some("suffix") => suffix = value,
7522 _ => {}
7523 }
7524 }
7525 (prefix, suffix)
7526}
7527
7528fn filter_to_dto(
7529 prefix: Option<&str>,
7530 suffix: Option<&str>,
7531) -> Option<NotificationConfigurationFilter> {
7532 if prefix.is_none() && suffix.is_none() {
7533 return None;
7534 }
7535 let mut rules: Vec<FilterRule> = Vec::new();
7536 if let Some(p) = prefix {
7537 rules.push(FilterRule {
7538 name: Some(FilterRuleName::from("prefix".to_owned())),
7539 value: Some(p.to_owned()),
7540 });
7541 }
7542 if let Some(s) = suffix {
7543 rules.push(FilterRule {
7544 name: Some(FilterRuleName::from("suffix".to_owned())),
7545 value: Some(s.to_owned()),
7546 });
7547 }
7548 Some(NotificationConfigurationFilter {
7549 key: Some(S3KeyFilter {
7550 filter_rules: Some(rules),
7551 }),
7552 })
7553}
7554
7555// ---------------------------------------------------------------------------
7556// v0.6 #40: Convert between the s3s-typed `ReplicationConfiguration` (the
7557// wire surface) and our internal `crate::replication::ReplicationConfig`.
7558// AWS's `ReplicationRuleFilter` is a sum type — `Prefix | Tag | And { Prefix,
7559// Tags }`; we flatten it into the single `(prefix, tag-vec)` representation
7560// the matcher needs. Sub-blocks v0.6 #40 does not implement
7561// (DeleteMarkerReplication / SourceSelectionCriteria / ReplicationTime /
7562// Metrics / EncryptionConfiguration) round-trip as `None` on GET — operators
7563// who set them on PUT see them silently dropped, mirroring "feature not
7564// supported in this release" semantics.
7565// ---------------------------------------------------------------------------
7566
7567fn replication_from_dto(dto: &ReplicationConfiguration) -> crate::replication::ReplicationConfig {
7568 let rules = dto
7569 .rules
7570 .iter()
7571 .enumerate()
7572 .map(|(idx, r)| {
7573 let id =
7574 r.id.as_ref()
7575 .map(|s| s.as_str().to_owned())
7576 .unwrap_or_else(|| format!("rule-{idx}"));
7577 let priority = r.priority.unwrap_or(0).max(0) as u32;
7578 let status_enabled = r.status.as_str() == ReplicationRuleStatus::ENABLED;
7579 let filter = replication_filter_from_dto(r.filter.as_ref(), r.prefix.as_deref());
7580 let destination_bucket = r.destination.bucket.clone();
7581 let destination_storage_class = r
7582 .destination
7583 .storage_class
7584 .as_ref()
7585 .map(|s| s.as_str().to_owned());
7586 crate::replication::ReplicationRule {
7587 id,
7588 priority,
7589 status_enabled,
7590 filter,
7591 destination_bucket,
7592 destination_storage_class,
7593 }
7594 })
7595 .collect();
7596 crate::replication::ReplicationConfig {
7597 role: dto.role.clone(),
7598 rules,
7599 }
7600}
7601
7602fn replication_to_dto(cfg: &crate::replication::ReplicationConfig) -> ReplicationConfiguration {
7603 let rules = cfg
7604 .rules
7605 .iter()
7606 .map(|r| {
7607 let status = if r.status_enabled {
7608 ReplicationRuleStatus::from_static(ReplicationRuleStatus::ENABLED)
7609 } else {
7610 ReplicationRuleStatus::from_static(ReplicationRuleStatus::DISABLED)
7611 };
7612 let destination = Destination {
7613 access_control_translation: None,
7614 account: None,
7615 bucket: r.destination_bucket.clone(),
7616 encryption_configuration: None,
7617 metrics: None,
7618 replication_time: None,
7619 storage_class: r
7620 .destination_storage_class
7621 .as_ref()
7622 .map(|s| StorageClass::from(s.clone())),
7623 };
7624 let filter = Some(replication_filter_to_dto(&r.filter));
7625 ReplicationRule {
7626 delete_marker_replication: None,
7627 destination,
7628 existing_object_replication: None,
7629 filter,
7630 id: Some(r.id.clone()),
7631 prefix: None,
7632 priority: Some(r.priority as i32),
7633 source_selection_criteria: None,
7634 status,
7635 }
7636 })
7637 .collect();
7638 ReplicationConfiguration {
7639 role: cfg.role.clone(),
7640 rules,
7641 }
7642}
7643
7644fn replication_filter_from_dto(
7645 f: Option<&ReplicationRuleFilter>,
7646 rule_level_prefix: Option<&str>,
7647) -> crate::replication::ReplicationFilter {
7648 let mut prefix: Option<String> = rule_level_prefix.map(str::to_owned);
7649 let mut tags: Vec<(String, String)> = Vec::new();
7650 if let Some(f) = f {
7651 if let Some(p) = f.prefix.as_ref()
7652 && prefix.is_none()
7653 {
7654 prefix = Some(p.clone());
7655 }
7656 if let Some(t) = f.tag.as_ref()
7657 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7658 {
7659 tags.push((k.clone(), v.clone()));
7660 }
7661 if let Some(and) = f.and.as_ref() {
7662 if let Some(p) = and.prefix.as_ref()
7663 && prefix.is_none()
7664 {
7665 prefix = Some(p.clone());
7666 }
7667 if let Some(ts) = and.tags.as_ref() {
7668 for t in ts {
7669 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7670 tags.push((k.clone(), v.clone()));
7671 }
7672 }
7673 }
7674 }
7675 }
7676 crate::replication::ReplicationFilter { prefix, tags }
7677}
7678
7679fn replication_filter_to_dto(f: &crate::replication::ReplicationFilter) -> ReplicationRuleFilter {
7680 if f.tags.is_empty() {
7681 ReplicationRuleFilter {
7682 and: None,
7683 prefix: f.prefix.clone(),
7684 tag: None,
7685 }
7686 } else if f.tags.len() == 1 && f.prefix.is_none() {
7687 let (k, v) = &f.tags[0];
7688 ReplicationRuleFilter {
7689 and: None,
7690 prefix: None,
7691 tag: Some(Tag {
7692 key: Some(k.clone()),
7693 value: Some(v.clone()),
7694 }),
7695 }
7696 } else {
7697 let tags: Vec<Tag> = f
7698 .tags
7699 .iter()
7700 .map(|(k, v)| Tag {
7701 key: Some(k.clone()),
7702 value: Some(v.clone()),
7703 })
7704 .collect();
7705 ReplicationRuleFilter {
7706 and: Some(ReplicationRuleAndOperator {
7707 prefix: f.prefix.clone(),
7708 tags: Some(tags),
7709 }),
7710 prefix: None,
7711 tag: None,
7712 }
7713 }
7714}
7715
7716// ---------------------------------------------------------------------------
7717// v0.6 #37: Convert between the s3s-typed `BucketLifecycleConfiguration`
7718// (the wire surface) and our internal `crate::lifecycle::LifecycleConfig`.
7719// The internal representation flattens AWS's "Filter | And" disjunction
7720// into a single `LifecycleFilter` struct of optional fields plus a tag
7721// vector. Fields S4's evaluator does not consume
7722// (`expired_object_delete_marker`, `noncurrent_version_transitions`,
7723// `transition_default_minimum_object_size`, the storage class on the
7724// noncurrent expiration) are dropped on PUT and re-rendered as their
7725// AWS-default shape on GET so the client always sees a well-formed
7726// configuration.
7727// ---------------------------------------------------------------------------
7728
7729fn dto_lifecycle_to_internal(
7730 dto: &BucketLifecycleConfiguration,
7731) -> crate::lifecycle::LifecycleConfig {
7732 crate::lifecycle::LifecycleConfig {
7733 rules: dto.rules.iter().map(dto_rule_to_internal).collect(),
7734 }
7735}
7736
7737fn dto_rule_to_internal(rule: &LifecycleRule) -> crate::lifecycle::LifecycleRule {
7738 let status = crate::lifecycle::LifecycleStatus::from_aws_str(rule.status.as_str());
7739 let filter = rule
7740 .filter
7741 .as_ref()
7742 .map(dto_filter_to_internal)
7743 .unwrap_or_default();
7744 let expiration_days = rule
7745 .expiration
7746 .as_ref()
7747 .and_then(|e| e.days)
7748 .and_then(|d| u32::try_from(d).ok());
7749 let expiration_date = rule
7750 .expiration
7751 .as_ref()
7752 .and_then(|e| e.date.as_ref())
7753 .and_then(timestamp_to_chrono_utc);
7754 let transitions: Vec<crate::lifecycle::TransitionRule> = rule
7755 .transitions
7756 .as_ref()
7757 .map(|ts| {
7758 ts.iter()
7759 .filter_map(|t| {
7760 let days = u32::try_from(t.days?).ok()?;
7761 let storage_class = t.storage_class.as_ref()?.as_str().to_owned();
7762 Some(crate::lifecycle::TransitionRule {
7763 days,
7764 storage_class,
7765 })
7766 })
7767 .collect()
7768 })
7769 .unwrap_or_default();
7770 let noncurrent_version_expiration_days = rule
7771 .noncurrent_version_expiration
7772 .as_ref()
7773 .and_then(|n| n.noncurrent_days)
7774 .and_then(|d| u32::try_from(d).ok());
7775 let abort_incomplete_multipart_upload_days = rule
7776 .abort_incomplete_multipart_upload
7777 .as_ref()
7778 .and_then(|a| a.days_after_initiation)
7779 .and_then(|d| u32::try_from(d).ok());
7780 crate::lifecycle::LifecycleRule {
7781 id: rule.id.clone().unwrap_or_default(),
7782 status,
7783 filter,
7784 expiration_days,
7785 expiration_date,
7786 transitions,
7787 noncurrent_version_expiration_days,
7788 abort_incomplete_multipart_upload_days,
7789 }
7790}
7791
7792fn dto_filter_to_internal(filter: &LifecycleRuleFilter) -> crate::lifecycle::LifecycleFilter {
7793 let mut prefix = filter.prefix.clone();
7794 let mut tags: Vec<(String, String)> = Vec::new();
7795 let mut size_gt: Option<u64> = filter
7796 .object_size_greater_than
7797 .and_then(|n| u64::try_from(n).ok());
7798 let mut size_lt: Option<u64> = filter
7799 .object_size_less_than
7800 .and_then(|n| u64::try_from(n).ok());
7801 if let Some(t) = &filter.tag
7802 && let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref())
7803 {
7804 tags.push((k.clone(), v.clone()));
7805 }
7806 if let Some(and) = &filter.and {
7807 if prefix.is_none() {
7808 prefix = and.prefix.clone();
7809 }
7810 if size_gt.is_none() {
7811 size_gt = and
7812 .object_size_greater_than
7813 .and_then(|n| u64::try_from(n).ok());
7814 }
7815 if size_lt.is_none() {
7816 size_lt = and
7817 .object_size_less_than
7818 .and_then(|n| u64::try_from(n).ok());
7819 }
7820 if let Some(ts) = &and.tags {
7821 for t in ts {
7822 if let (Some(k), Some(v)) = (t.key.as_ref(), t.value.as_ref()) {
7823 tags.push((k.clone(), v.clone()));
7824 }
7825 }
7826 }
7827 }
7828 crate::lifecycle::LifecycleFilter {
7829 prefix,
7830 tags,
7831 object_size_greater_than: size_gt,
7832 object_size_less_than: size_lt,
7833 }
7834}
7835
7836fn internal_rule_to_dto(rule: &crate::lifecycle::LifecycleRule) -> LifecycleRule {
7837 let expiration = if rule.expiration_days.is_some() || rule.expiration_date.is_some() {
7838 Some(LifecycleExpiration {
7839 date: rule.expiration_date.map(chrono_utc_to_timestamp),
7840 days: rule.expiration_days.map(|d| d as i32),
7841 expired_object_delete_marker: None,
7842 })
7843 } else {
7844 None
7845 };
7846 let transitions: Option<TransitionList> = if rule.transitions.is_empty() {
7847 None
7848 } else {
7849 Some(
7850 rule.transitions
7851 .iter()
7852 .map(|t| Transition {
7853 date: None,
7854 days: Some(t.days as i32),
7855 storage_class: Some(TransitionStorageClass::from(t.storage_class.clone())),
7856 })
7857 .collect(),
7858 )
7859 };
7860 let noncurrent_version_expiration =
7861 rule.noncurrent_version_expiration_days
7862 .map(|d| NoncurrentVersionExpiration {
7863 newer_noncurrent_versions: None,
7864 noncurrent_days: Some(d as i32),
7865 });
7866 let abort_incomplete_multipart_upload =
7867 rule.abort_incomplete_multipart_upload_days
7868 .map(|d| AbortIncompleteMultipartUpload {
7869 days_after_initiation: Some(d as i32),
7870 });
7871 let filter = if rule.filter.tags.is_empty()
7872 && rule.filter.object_size_greater_than.is_none()
7873 && rule.filter.object_size_less_than.is_none()
7874 {
7875 rule.filter.prefix.as_ref().map(|p| LifecycleRuleFilter {
7876 and: None,
7877 object_size_greater_than: None,
7878 object_size_less_than: None,
7879 prefix: Some(p.clone()),
7880 tag: None,
7881 })
7882 } else if rule.filter.tags.len() == 1
7883 && rule.filter.prefix.is_none()
7884 && rule.filter.object_size_greater_than.is_none()
7885 && rule.filter.object_size_less_than.is_none()
7886 {
7887 let (k, v) = rule.filter.tags[0].clone();
7888 Some(LifecycleRuleFilter {
7889 and: None,
7890 object_size_greater_than: None,
7891 object_size_less_than: None,
7892 prefix: None,
7893 tag: Some(Tag {
7894 key: Some(k),
7895 value: Some(v),
7896 }),
7897 })
7898 } else {
7899 let tags = if rule.filter.tags.is_empty() {
7900 None
7901 } else {
7902 Some(
7903 rule.filter
7904 .tags
7905 .iter()
7906 .map(|(k, v)| Tag {
7907 key: Some(k.clone()),
7908 value: Some(v.clone()),
7909 })
7910 .collect(),
7911 )
7912 };
7913 Some(LifecycleRuleFilter {
7914 and: Some(LifecycleRuleAndOperator {
7915 object_size_greater_than: rule
7916 .filter
7917 .object_size_greater_than
7918 .and_then(|n| i64::try_from(n).ok()),
7919 object_size_less_than: rule
7920 .filter
7921 .object_size_less_than
7922 .and_then(|n| i64::try_from(n).ok()),
7923 prefix: rule.filter.prefix.clone(),
7924 tags,
7925 }),
7926 object_size_greater_than: None,
7927 object_size_less_than: None,
7928 prefix: None,
7929 tag: None,
7930 })
7931 };
7932 LifecycleRule {
7933 abort_incomplete_multipart_upload,
7934 expiration,
7935 filter,
7936 id: if rule.id.is_empty() {
7937 None
7938 } else {
7939 Some(rule.id.clone())
7940 },
7941 noncurrent_version_expiration,
7942 noncurrent_version_transitions: None,
7943 prefix: None,
7944 status: ExpirationStatus::from(rule.status.as_aws_str().to_owned()),
7945 transitions,
7946 }
7947}
7948
7949// (timestamp <-> chrono helpers `timestamp_to_chrono_utc` /
7950// `chrono_utc_to_timestamp` are defined earlier in this file for the
7951// tagging/notifications work; the lifecycle DTO converters reuse them.)
7952
7953// ---------------------------------------------------------------------------
7954// v0.5 #33: SigV4a (asymmetric ECDSA-P256) integration hook.
7955//
7956// Kept as a self-contained block at the bottom of the file so it doesn't
7957// touch the existing `S4Service` struct, `new()`, or any of the per-op
7958// handlers above. The hook is wired in by the binary at server-build time
7959// as a hyper middleware layer (see `main.rs`), NOT inside `S4Service`.
7960//
7961// Lifecycle:
7962// 1. `SigV4aGate::new(store)` is constructed once at boot from the
7963// operator-supplied credential directory.
7964// 2. For each incoming request, `SigV4aGate::pre_route(&req,
7965// &requested_region, &canonical_request_bytes)` is invoked BEFORE
7966// the request hits the S3 framework. If the request claims SigV4a
7967// and verifies, control returns to the framework. Otherwise a 403
7968// `SignatureDoesNotMatch` is produced.
7969// 3. Plain SigV4 (HMAC-SHA256) requests pass through untouched.
7970// ---------------------------------------------------------------------------
7971
/// Gate that fronts the S3 service path with SigV4a verification (v0.5 #33).
///
/// Wraps a [`crate::sigv4a::SigV4aCredentialStore`] and exposes a single
/// `pre_route` entry point. `pre_route` returns `Ok(())` both when the
/// request is plain SigV4 (pass through) and when it is SigV4a and
/// verifies; otherwise it returns an `Err(...)` carrying a 403-equivalent
/// diagnostic. Cheap to clone (the inner store is `Arc`-backed).
///
/// v0.8.4 #76 (audit H-6): the gate also enforces an `x-amz-date`
/// freshness window (default 15 min, AWS-spec) and a strict credential
/// scope shape (`<key>/<YYYYMMDD>/s3/aws4_request`). This closes the
/// captured-request replay vector: previously a stolen valid SigV4a
/// signature could be replayed indefinitely (including DELETE).
#[derive(Debug, Clone)]
pub struct SigV4aGate {
    /// Shared credential store supplied at construction time.
    store: crate::sigv4a::SharedSigV4aCredentialStore,
    /// v0.8.4 #76: how far the request's `x-amz-date` may drift from the
    /// server's clock before the request is rejected with 403
    /// `RequestTimeTooSkewed`. [`SigV4aGate::new`] sets the AWS S3 default
    /// of 15 min; operators can override it with
    /// [`SigV4aGate::with_skew_tolerance`] (CLI flag
    /// `--sigv4a-skew-tolerance-seconds`).
    skew_tolerance: chrono::Duration,
}
7996
7997impl SigV4aGate {
7998 /// Default `x-amz-date` skew tolerance — 15 min, matching AWS S3.
7999 pub const DEFAULT_SKEW_TOLERANCE_SECS: i64 = 900;
8000
8001 #[must_use]
8002 pub fn new(store: crate::sigv4a::SharedSigV4aCredentialStore) -> Self {
8003 Self {
8004 store,
8005 skew_tolerance: chrono::Duration::seconds(Self::DEFAULT_SKEW_TOLERANCE_SECS),
8006 }
8007 }
8008
8009 /// v0.8.4 #76: override the `x-amz-date` skew tolerance (default
8010 /// 15 min). Operators can widen this for high-clock-drift
8011 /// environments or tighten it for compliance regimes that demand
8012 /// stricter freshness.
8013 #[must_use]
8014 pub fn with_skew_tolerance(mut self, skew: chrono::Duration) -> Self {
8015 self.skew_tolerance = skew;
8016 self
8017 }
8018
8019 /// Read the configured skew tolerance — exposed mostly for test +
8020 /// observability use.
8021 #[must_use]
8022 pub fn skew_tolerance(&self) -> chrono::Duration {
8023 self.skew_tolerance
8024 }
8025
8026 /// Inspect an incoming HTTP request. Behaviour:
8027 ///
8028 /// - Not SigV4a (no `X-Amz-Region-Set` and no SigV4a `Authorization`
8029 /// prefix) → returns `Ok(())`; the framework's existing SigV4
8030 /// path handles the request.
8031 /// - SigV4a + valid signature + region match + fresh x-amz-date
8032 /// → `Ok(())`.
8033 /// - SigV4a + unknown access-key-id → `Err` with `InvalidAccessKeyId`.
8034 /// - SigV4a + bad signature / region mismatch → `Err` with
8035 /// `SignatureDoesNotMatch`.
8036 /// - SigV4a + missing or skewed `x-amz-date` → `Err` with one of
8037 /// the v0.8.4 #76 freshness variants (`RequestTimeTooSkewed`
8038 /// et al.).
8039 ///
8040 /// `canonical_request_bytes` is the SigV4a string-to-sign (or
8041 /// canonical-request bytes; the caller decides) that the framework
8042 /// has already produced for this request. Keeping it as a parameter
8043 /// instead of rebuilding it inside the hook avoids duplicating the
8044 /// canonicalisation logic.
8045 pub fn pre_route<B>(
8046 &self,
8047 req: &http::Request<B>,
8048 requested_region: &str,
8049 canonical_request_bytes: &[u8],
8050 ) -> Result<(), SigV4aGateError> {
8051 self.pre_route_at(
8052 req,
8053 requested_region,
8054 canonical_request_bytes,
8055 chrono::Utc::now(),
8056 )
8057 }
8058
8059 /// Like [`SigV4aGate::pre_route`] but takes an explicit `now` for
8060 /// tests that need to pin the freshness clock. Production callers
8061 /// use `pre_route` (which calls `chrono::Utc::now()`).
8062 pub fn pre_route_at<B>(
8063 &self,
8064 req: &http::Request<B>,
8065 requested_region: &str,
8066 canonical_request_bytes: &[u8],
8067 now: chrono::DateTime<chrono::Utc>,
8068 ) -> Result<(), SigV4aGateError> {
8069 if !crate::sigv4a::detect(req) {
8070 return Ok(());
8071 }
8072 let auth_hdr = req
8073 .headers()
8074 .get(http::header::AUTHORIZATION)
8075 .and_then(|v| v.to_str().ok())
8076 .ok_or(SigV4aGateError::MissingAuthorization)?;
8077 let parsed = crate::sigv4a::parse_authorization_header(auth_hdr)
8078 .map_err(|_| SigV4aGateError::MalformedAuthorization)?;
8079 let region_set = req
8080 .headers()
8081 .get(crate::sigv4a::REGION_SET_HEADER)
8082 .and_then(|v| v.to_str().ok())
8083 .unwrap_or("*");
8084 let key = self
8085 .store
8086 .get(&parsed.access_key_id)
8087 .ok_or_else(|| SigV4aGateError::UnknownAccessKey(parsed.access_key_id.clone()))?;
8088 // v0.8.4 #76: snapshot the request headers into a
8089 // lowercase-keyed flat map so `verify_request` can do the
8090 // x-amz-date freshness checks without taking a generic
8091 // `HeaderMap` dep. Cheap because the headers list is tiny.
8092 //
8093 // v0.8.5 #84 (audit H-4): detect duplicate header names while
8094 // we flatten — `HashMap::insert` would silently overwrite the
8095 // first value with the second, mirroring the auth-confusion
8096 // vector the canonical-request builder also defends against.
8097 // Reject upfront so the rest of the gate (freshness check,
8098 // ECDSA verify) never sees a half-truncated header set. We
8099 // detect by checking `contains_key` *before* insertion rather
8100 // than by counting via `headers().get_all`, because the
8101 // upstream `HeaderMap` iteration yields each duplicate entry
8102 // as its own (name, value) pair — the second-seen entry is
8103 // exactly what `contains_key` traps.
8104 let mut header_map: std::collections::HashMap<String, String> =
8105 std::collections::HashMap::with_capacity(req.headers().len());
8106 for (name, value) in req.headers() {
8107 if let Ok(v) = value.to_str() {
8108 let lower = name.as_str().to_ascii_lowercase();
8109 if header_map.contains_key(&lower) {
8110 return Err(SigV4aGateError::Verify(
8111 crate::sigv4a::SigV4aError::DuplicateSignedHeader { header: lower },
8112 ));
8113 }
8114 header_map.insert(lower, v.to_string());
8115 }
8116 }
8117 crate::sigv4a::verify_request(
8118 &parsed,
8119 &header_map,
8120 canonical_request_bytes,
8121 key,
8122 region_set,
8123 requested_region,
8124 now,
8125 self.skew_tolerance,
8126 )
8127 .map_err(SigV4aGateError::Verify)?;
8128 Ok(())
8129 }
8130}
8131
8132/// Failure modes from [`SigV4aGate::pre_route`]. All variants map to
8133/// HTTP 403 with one of the two AWS-standard error codes
8134/// (`InvalidAccessKeyId` / `SignatureDoesNotMatch` / `RequestTimeTooSkewed`)
8135/// — see [`SigV4aGateError::s3_error_code`].
8136#[derive(Debug, thiserror::Error)]
8137pub enum SigV4aGateError {
8138 #[error("missing Authorization header")]
8139 MissingAuthorization,
8140 #[error("malformed SigV4a Authorization header")]
8141 MalformedAuthorization,
8142 #[error("unknown SigV4a access-key-id: {0}")]
8143 UnknownAccessKey(String),
8144 #[error("SigV4a verification failed: {0}")]
8145 Verify(#[source] crate::sigv4a::SigV4aError),
8146}
8147
8148impl SigV4aGateError {
8149 /// AWS S3 error code that should accompany the response.
8150 ///
8151 /// v0.8.4 #76 (audit H-6): the freshness check surfaces
8152 /// `RequestTimeTooSkewed` (matches AWS spec); date / scope shape
8153 /// failures surface as `InvalidRequest` (400); other failures stay
8154 /// `SignatureDoesNotMatch` / `InvalidAccessKeyId` (403) so the wire
8155 /// surface stays AWS-compatible.
8156 #[must_use]
8157 pub fn s3_error_code(&self) -> &'static str {
8158 match self {
8159 Self::UnknownAccessKey(_) => "InvalidAccessKeyId",
8160 Self::Verify(crate::sigv4a::SigV4aError::RequestTimeTooSkewed { .. }) => {
8161 "RequestTimeTooSkewed"
8162 }
8163 Self::Verify(
8164 crate::sigv4a::SigV4aError::MissingXAmzDate
8165 | crate::sigv4a::SigV4aError::InvalidDateFormat
8166 | crate::sigv4a::SigV4aError::DateScopeMismatch
8167 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
8168 | crate::sigv4a::SigV4aError::InvalidTerminator
8169 | crate::sigv4a::SigV4aError::WrongService { .. }
8170 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
8171 ) => "InvalidRequest",
8172 _ => "SignatureDoesNotMatch",
8173 }
8174 }
8175
8176 /// HTTP status code to accompany the response. v0.8.4 #76: format
8177 /// errors that are clearly client mistakes (missing / malformed
8178 /// `x-amz-date`, malformed credential scope, wrong service) are
8179 /// surfaced as 400 InvalidRequest; the rest stay 403.
8180 #[must_use]
8181 pub fn http_status(&self) -> http::StatusCode {
8182 match self {
8183 Self::Verify(
8184 crate::sigv4a::SigV4aError::MissingXAmzDate
8185 | crate::sigv4a::SigV4aError::InvalidDateFormat
8186 | crate::sigv4a::SigV4aError::DateScopeMismatch
8187 | crate::sigv4a::SigV4aError::XAmzDateNotSigned
8188 | crate::sigv4a::SigV4aError::InvalidTerminator
8189 | crate::sigv4a::SigV4aError::WrongService { .. }
8190 | crate::sigv4a::SigV4aError::InvalidCredentialScope,
8191 ) => http::StatusCode::BAD_REQUEST,
8192 _ => http::StatusCode::FORBIDDEN,
8193 }
8194 }
8195}
8196
#[cfg(test)]
mod tests {
    use super::*;

    /// A manifest written into object metadata must read back with every
    /// field intact.
    #[test]
    fn manifest_roundtrip_via_metadata() {
        let written = ChunkManifest {
            codec: CodecKind::CpuZstd,
            original_size: 1234,
            compressed_size: 567,
            crc32c: 0xdead_beef,
        };
        let mut metadata: Option<Metadata> = None;
        write_manifest(&mut metadata, &written);

        let read_back = extract_manifest(&metadata).expect("manifest must round-trip");
        assert_eq!(read_back.codec, written.codec);
        assert_eq!(read_back.original_size, written.original_size);
        assert_eq!(read_back.compressed_size, written.compressed_size);
        assert_eq!(read_back.crc32c, written.crc32c);
    }

    /// No metadata at all means no manifest.
    #[test]
    fn missing_metadata_yields_none() {
        let absent: Option<Metadata> = None;
        assert!(extract_manifest(&absent).is_none());
    }

    /// Metadata carrying only the codec key is not a manifest.
    #[test]
    fn partial_metadata_yields_none() {
        let mut codec_only = Metadata::new();
        codec_only.insert(META_CODEC.into(), "cpu-zstd".into());
        let wrapped = Some(codec_only);
        assert!(extract_manifest(&wrapped).is_none());
    }

    /// A closed `bytes=N-M` range parses into `Range::Int` with both ends.
    #[test]
    fn parse_copy_source_range_basic() {
        let parsed = parse_copy_source_range("bytes=10-20").unwrap();
        if let s3s::dto::Range::Int { first, last } = parsed {
            assert_eq!(first, 10);
            assert_eq!(last, Some(20));
        } else {
            panic!("expected Int range");
        }
    }

    /// An end offset below the start offset is rejected.
    #[test]
    fn parse_copy_source_range_rejects_inverted() {
        let reason = parse_copy_source_range("bytes=20-10").unwrap_err();
        assert!(reason.contains("last < first"));
    }

    /// The `bytes=` prefix is mandatory.
    #[test]
    fn parse_copy_source_range_rejects_missing_prefix() {
        let reason = parse_copy_source_range("10-20").unwrap_err();
        assert!(reason.contains("must start with 'bytes='"));
    }

    /// The S3 upload_part_copy spec only permits the closed N-M form for
    /// this header, so open-ended and suffix ranges must both fail.
    #[test]
    fn parse_copy_source_range_rejects_open_ended() {
        for spec in ["bytes=10-", "bytes=-10"] {
            assert!(parse_copy_source_range(spec).is_err());
        }
    }

    // v0.7 #49: `safe_object_uri` has to turn every legal S3 key
    // (spaces, slashes, control chars, raw UTF-8 included) into a
    // parseable `http::Uri`. The earlier call sites used
    // `format!(...).parse().unwrap()` and panicked on such keys.

    /// Plain ASCII bucket/key pairs map straight onto the path.
    #[test]
    fn safe_object_uri_basic_ascii() {
        let built = safe_object_uri("bucket", "key").expect("ascii must be safe");
        assert_eq!(built.path(), "/bucket/key");
    }

    /// Spaces are percent-encoded (RFC 3986 path-segment encoding gives
    /// `%20`) and the bucket prefix is left alone.
    #[test]
    fn safe_object_uri_encodes_spaces() {
        let built = safe_object_uri("bucket", "key with spaces").expect("must encode spaces");
        assert!(
            built.path().contains("%20"),
            "expected percent-encoded space, got {}",
            built.path()
        );
        assert!(built.path().starts_with("/bucket/"));
    }

    /// '/' is a legal logical separator inside S3 keys; escaping it
    /// would alter the hierarchy the synthetic URI conveys, so it must
    /// pass through untouched.
    #[test]
    fn safe_object_uri_preserves_slashes() {
        let built =
            safe_object_uri("bucket", "key/with/slashes").expect("slashes must round-trip");
        assert_eq!(built.path(), "/bucket/key/with/slashes");
    }

    /// A newline is a control character in a URI. Both `Ok` (encoded as
    /// %0A) and `Err` (parser refuses) are acceptable — a panic is not.
    #[test]
    fn safe_object_uri_handles_newline_without_panic() {
        let _ = safe_object_uri("bucket", "key\n");
    }

    /// Same no-panic contract for an embedded NUL byte.
    #[test]
    fn safe_object_uri_handles_null_byte_without_panic() {
        let _ = safe_object_uri("bucket", "key\0bad");
    }

    /// RTL override, BOM and plain Japanese keys must not panic either.
    #[test]
    fn safe_object_uri_handles_unicode_without_panic() {
        for key in ["rtl\u{202E}override", "\u{FEFF}bom-key", "日本語キー"] {
            let _ = safe_object_uri("bucket", key);
        }
    }

    /// Sweep every byte value 0x00..=0xFF as a one-byte key. Bytes
    /// 0x80..=0xFF are not valid UTF-8 on their own, so each byte goes
    /// through `String::from_utf8_lossy` to hand the helper a real
    /// `&str`. No iteration may panic.
    #[test]
    fn safe_object_uri_no_panic_for_every_byte() {
        (0u8..=u8::MAX).for_each(|byte| {
            let key = String::from_utf8_lossy(&[byte]).into_owned();
            let _ = safe_object_uri("bucket", &key);
        });
    }

    /// v0.8.1 #58: smoke test for the DEK-handling shape shared by the
    /// SSE-KMS branches of `put_object` and `complete_multipart_upload`.
    /// It replays the call pattern (generate_dek → length check → copy
    /// into a stack `[u8; 32]` → reborrow as `&[u8; 32]` for
    /// `SseSource`) without building a full `S4Service`.
    ///
    /// What it protects against is a regression in which the
    /// `Zeroizing` wrapper gets dropped before the stack copy lands
    /// (e.g. a refactor to
    /// `let dek = kms.generate_dek(...).await?.0; drop(dek); ...`)
    /// or in which `&**dek` is rewritten into something that no longer
    /// compiles.
    #[tokio::test]
    async fn kms_dek_lifetime_within_function_scope() {
        use crate::kms::{KmsBackend, LocalKms};
        use std::collections::HashMap;
        use std::path::PathBuf;
        use zeroize::Zeroizing;

        let keks = HashMap::from([("scope".to_string(), [33u8; 32])]);
        let kms = LocalKms::from_keks(PathBuf::from("/tmp/kms-scope-test"), keks);

        // Same shape as the put_object KMS branch.
        let (dek, wrapped) = kms.generate_dek("scope").await.unwrap();
        assert_eq!(dek.len(), 32);
        let mut dek_arr: Zeroizing<[u8; 32]> = Zeroizing::new([0u8; 32]);
        dek_arr.copy_from_slice(&dek);

        // This is the reborrow made where `SseSource` is constructed:
        // `&Zeroizing<[u8; 32]>` auto-derefs to `&[u8; 32]`.
        let dek_ref: &[u8; 32] = &dek_arr;
        // The reborrow must see the very same bytes.
        assert_eq!(dek_ref, &*dek_arr);
        // The wrapped key id is carried through untouched.
        assert_eq!(wrapped.key_id, "scope");

        // When the scope ends, `dek` (Zeroizing<Vec<u8>>) and `dek_arr`
        // (Zeroizing<[u8; 32]>) are both dropped and their backing
        // memory wiped. The wipe cannot be asserted here (reading freed
        // memory would be UB), so the test only pins that this call
        // shape compiles and runs; the wipe itself is covered by the
        // `zeroize` crate's own tests.
    }

    /// v0.8.5 #86 (audit M-2): before starting a destination PUT the
    /// replication dispatcher must `acquire_owned()` a permit from
    /// `replication_semaphore`, so that a saturated semaphore applies
    /// back-pressure rather than letting the in-flight queue grow
    /// without bound. Only the field is exercised here (initial permit
    /// count, override via `with_replication_max_concurrent`, permit
    /// release on `Drop`); the full `spawn_replication_if_matched`
    /// integration is left to the replication tests in
    /// `tests/feature_e2e.rs`, which attach a `ReplicationManager`.
    #[tokio::test]
    async fn replication_semaphore_caps_concurrent_dispatchers() {
        // A bare-bones `S4Service`: only constructor, setter and
        // accessor are touched, never a handler.
        let codecs = Arc::new(
            CodecRegistry::new(CodecKind::Passthrough)
                .with(Arc::new(s4_codec::passthrough::Passthrough)),
        );
        let always_passthrough = Arc::new(s4_codec::dispatcher::AlwaysDispatcher(
            CodecKind::Passthrough,
        ));
        let service = S4Service::new(NoopBackend, codecs, always_passthrough);

        // Out of the box the cap equals the documented constant.
        assert_eq!(
            service.replication_semaphore().available_permits(),
            S4Service::<NoopBackend>::DEFAULT_REPLICATION_MAX_CONCURRENT,
            "fresh S4Service must expose DEFAULT_REPLICATION_MAX_CONCURRENT permits"
        );

        // The builder swaps in a new `Semaphore` with the requested cap.
        let service = service.with_replication_max_concurrent(2);
        assert_eq!(
            service.replication_semaphore().available_permits(),
            2,
            "with_replication_max_concurrent(2) must expose exactly 2 permits"
        );

        // Taking permits lowers `available_permits()` and dropping them
        // raises it again — the back-pressure contract that
        // `spawn_replication_if_matched` depends on.
        let semaphore = Arc::clone(service.replication_semaphore());
        let first_permit = semaphore.clone().acquire_owned().await.expect("permit 1");
        let second_permit = semaphore.clone().acquire_owned().await.expect("permit 2");
        assert_eq!(
            semaphore.available_permits(),
            0,
            "two acquired permits must zero `available_permits()`"
        );
        // With the cap exhausted a further `try_acquire_owned` has to
        // fail right away; no extra spawn may slip past.
        assert!(
            semaphore.clone().try_acquire_owned().is_err(),
            "third acquire must back-pressure: cap was 2"
        );
        drop(first_permit);
        drop(second_permit);
        assert_eq!(
            semaphore.available_permits(),
            2,
            "dropping permits must restore cap"
        );

        // A cap of 0 would deadlock every dispatcher, so the setter
        // raises it to 1 rather than honouring it (the CLI doc warns
        // callers about this).
        let service = service.with_replication_max_concurrent(0);
        assert_eq!(
            service.replication_semaphore().available_permits(),
            1,
            "cap=0 must be clamped to 1 to avoid total deadlock"
        );
    }

    /// v0.8.5 #86 (audit M-1): the access-log flusher has to hand back a
    /// `JoinHandle<()>` that the caller can `abort()` at shutdown
    /// without leaving a dangling task behind. Before #86 the call site
    /// let the handle fall out of scope (silently detaching the task);
    /// the fix keeps it in a process-lived `Vec` so the
    /// graceful-shutdown branch in `main.rs` can wait for a clean exit.
    /// Exercising `JoinHandle.abort()` directly means a later refactor
    /// that stops returning the handle (or returns something
    /// non-abortable) trips this guard.
    #[tokio::test]
    async fn flusher_handle_can_be_aborted_cleanly() {
        // A throwaway `AccessLog` rooted in a temp dir, so that the
        // flusher's `create_dir_all` succeeds. Contents are never
        // inspected; removal at the end is best-effort.
        let nanos_since_epoch = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map(|d| d.as_nanos())
            .unwrap_or(0);
        let tmp = std::env::temp_dir().join(format!(
            "s4-86-flusher-{}-{}",
            std::process::id(),
            nanos_since_epoch
        ));
        let log = crate::access_log::AccessLog::new(crate::access_log::AccessLogDest {
            dir: tmp.clone(),
        });
        let flusher = log.spawn_flusher(None);
        assert!(
            !flusher.is_finished(),
            "freshly-spawned flusher must not yet be finished"
        );
        flusher.abort();
        // Awaiting an aborted handle gives `Err(JoinError)` with
        // `is_cancelled()` set.
        let outcome = flusher.await;
        assert!(
            outcome.is_err(),
            "aborted flusher must surface JoinError, got Ok"
        );
        let join_error = outcome.unwrap_err();
        assert!(
            join_error.is_cancelled(),
            "JoinError must report .is_cancelled() = true after abort()"
        );
        let _ = std::fs::remove_dir_all(&tmp);
    }

    /// Stand-in backend for the v0.8.5 #86 unit tests above. The
    /// `S4Service` constructor requires `B: S3`, yet those tests touch
    /// only builder / accessor shape and never call a handler, so every
    /// `S3` method is left at the trait's default `NotImplemented`
    /// (supplied automatically by `s3s`).
    struct NoopBackend;

    #[async_trait::async_trait]
    impl S3 for NoopBackend {}

    /// v0.8.5 #81 (audit H-7): the panic-catch wrapper at the dispatcher
    /// spawn site has to intercept a panicking inner future, log at
    /// ERROR and bump the per-kind counter, rather than let the panic
    /// escape as a `JoinError` that no operator dashboard scrapes. The
    /// wrapper is exercised on its own — driving
    /// `spawn_replication_if_matched` end-to-end would need a full
    /// `S4Service` plus backend — because the wrapper shape is the
    /// load-bearing part: whatever inner future is swapped in still
    /// goes through the same `AssertUnwindSafe(...).catch_unwind()`
    /// closure pinned here.
    #[tokio::test]
    async fn dispatcher_panic_caught_and_metric_bumped() {
        use futures::FutureExt as _;

        let handle = crate::metrics::test_metrics_handle();
        let kind = "replication";

        // Reproduce the production wrapper shape verbatim, so that this
        // test does not keep passing on a hand-rolled copy if production
        // ever moves away from `AssertUnwindSafe.catch_unwind`.
        let panicking = async {
            panic!("simulated dispatcher panic");
        };
        let caught = std::panic::AssertUnwindSafe(panicking).catch_unwind().await;
        assert!(
            caught.is_err(),
            "catch_unwind must surface the panic instead of swallowing it"
        );
        // Increment through the helper the wrapper itself calls, so the
        // rendered output reflects the production code path and not a
        // parallel bookkeeping copy.
        crate::metrics::record_dispatcher_panic(kind);

        let rendered = handle.render();
        assert!(
            rendered.contains("s4_dispatcher_panics_total"),
            "expected s4_dispatcher_panics_total in metrics output, got: {rendered}"
        );
        assert!(
            rendered.contains("kind=\"replication\""),
            "expected kind=\"replication\" label in metrics output, got: {rendered}"
        );
    }

    /// v0.9 #106-audit-R2 P2-INT-2: with no `x-amz-trailer` header there
    /// is no claim to check, so the shared trailer-verify helper
    /// short-circuits to `Ok`.
    #[test]
    fn verify_client_trailer_checksums_passes_when_no_header() {
        let digests = crate::streaming_checksum::ComputedDigests::default();
        verify_client_trailer_checksums(None, None, &digests).expect("no claim → Ok");
    }

    /// An announcement listing only non-checksum trailers (e.g. the
    /// `x-amz-trailer-signature` that SDKs add for SigV4 streaming) is
    /// likewise a no-op: the filter drops them before anything else runs.
    #[test]
    fn verify_client_trailer_checksums_ignores_non_checksum_trailers() {
        let digests = crate::streaming_checksum::ComputedDigests::default();
        verify_client_trailer_checksums(Some("x-amz-trailer-signature"), None, &digests)
            .expect("non-checksum trailers must not fail");
    }

    /// Fail-closed: a checksum trailer was announced but there is no
    /// trailing-headers handle, so the result is `BadDigest`. This is
    /// the central regression fence for the buffered-path silent skip
    /// that the P2-INT-2 fix closes.
    #[test]
    fn verify_client_trailer_checksums_no_handle_fails_closed() {
        let digests = crate::streaming_checksum::ComputedDigests::default();
        let err = verify_client_trailer_checksums(Some("x-amz-checksum-crc32c"), None, &digests)
            .expect_err("announced trailer with no handle must fail closed");
        assert_eq!(err.code().as_str(), "BadDigest");
        assert!(
            err.message()
                .unwrap_or_default()
                .contains("trailing-headers handle"),
            "error message must hint at the missing handle, got {err:?}"
        );
    }

    /// Trailer names are matched case-insensitively — AWS SDKs may use
    /// any casing per RFC 9110 §5.1. The `x-amz-checksum-` prefix must
    /// still be recognised, after which the missing handle produces the
    /// bad-digest rejection.
    #[test]
    fn verify_client_trailer_checksums_case_insensitive_filter() {
        let digests = crate::streaming_checksum::ComputedDigests::default();
        let rejection =
            verify_client_trailer_checksums(Some("X-Amz-Checksum-Crc32c"), None, &digests)
                .expect_err("upper-case trailer name must still be detected");
        assert_eq!(rejection.code().as_str(), "BadDigest");
    }

    /// Mixed announcement: one checksum trailer next to an unrelated
    /// one. The filter keeps the checksum entry, and with no handle
    /// available the fail-closed branch is taken.
    #[test]
    fn verify_client_trailer_checksums_mixed_announce_still_validates() {
        let digests = crate::streaming_checksum::ComputedDigests::default();
        let announced = Some("x-amz-checksum-sha256, x-amz-trailer-signature");
        let rejection = verify_client_trailer_checksums(announced, None, &digests)
            .expect_err("mixed announce with checksum entry must still fail closed");
        assert_eq!(rejection.code().as_str(), "BadDigest");
    }
}