elastik-core 7.2.0

Elastik V6 Engine: six verbs, one HTTP disk.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
use axum::{
    http::{header, HeaderMap, HeaderName, HeaderValue, StatusCode},
    response::{IntoResponse, Response},
};
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};
use std::collections::{BTreeMap, HashSet};

use crate::world::Stage;
use crate::{precondition_failed, storage_error, to_header_map, world, Core};

/// Header-name matcher shared by two layers of the four-layer persist
/// policy: Layer 3 (user allow, `ELASTIK_PERSIST_HEADERS`) and
/// Layer 1.5 (user deny, `ELASTIK_DENY_HEADERS`). `Core` carries one
/// instance per layer, each built once at startup. Whether a hit means
/// "allow" or "deny" is decided by the call site
/// (`should_persist_for_storage`), not by this type.
///
/// Entries are stored lowercased. An entry that ends in `*` is a
/// prefix rule (`x-my-*` matches `x-my-anything`); every other entry
/// has to equal the header name in full.
///
/// A list with no entries is a perfectly valid state: it matches
/// nothing, so no custom headers beyond the built-in default allow
/// set are persisted. Test `Core` construction uses
/// `HeaderAllowlist::empty()`.
#[derive(Default, Clone)]
pub(crate) struct HeaderAllowlist {
    /// Names that must match in full.
    exact: HashSet<String>,
    /// Stems of `*`-suffixed entries, stored without the trailing `*`.
    prefixes: Vec<String>,
}

impl HeaderAllowlist {
    /// A matcher with no rules (default-deny for custom headers).
    /// Serves test fixtures and `Core` constructors that never look
    /// at the environment; production startup goes through
    /// `config::header_allowlist_from_env()`, which yields the same
    /// empty matcher when the env var is unset.
    #[allow(dead_code)]
    pub(crate) fn empty() -> Self {
        Self {
            exact: HashSet::new(),
            prefixes: Vec::new(),
        }
    }

    /// Builds a matcher from a comma-separated list. Each entry is
    /// trimmed and lowercased; blank entries and a bare `*` are
    /// dropped. An entry with a trailing `*` becomes a prefix rule,
    /// anything else an exact-match rule.
    pub(crate) fn parse(raw: &str) -> Self {
        let mut list = Self {
            exact: HashSet::new(),
            prefixes: Vec::new(),
        };
        let entries = raw
            .split(',')
            .map(|piece| piece.trim().to_ascii_lowercase())
            .filter(|piece| !piece.is_empty());
        for item in entries {
            match item.strip_suffix('*') {
                // A lone `*` would match every header; ignore it.
                Some("") => {}
                Some(stem) => list.prefixes.push(stem.to_owned()),
                None => {
                    list.exact.insert(item);
                }
            }
        }
        list
    }

    /// Returns `true` when `name_lower` equals an exact entry or
    /// starts with one of the prefix stems. The name is compared as
    /// given, so it must already be lowercase.
    pub(crate) fn matches(&self, name_lower: &str) -> bool {
        if self.exact.contains(name_lower) {
            return true;
        }
        self.prefixes
            .iter()
            .any(|stem| name_lower.starts_with(stem.as_str()))
    }

    /// Returns `true` when the matcher holds neither exact nor prefix
    /// rules.
    #[allow(dead_code)]
    pub(crate) fn is_empty(&self) -> bool {
        self.prefixes.is_empty() && self.exact.is_empty()
    }
}

/// Layer 2 of the persist policy: the built-in default allow set.
/// These are standard representation headers -- metadata that
/// "travels with the bytes" -- which most users expect to round-trip
/// with no configuration at all.
///
/// The list is closed and hardcoded. Add to it only after reviewing
/// that a header "describes the body, not the request or transport."
/// An operator who wants one of these dropped for a deployment lists
/// it in `ELASTIK_DENY_HEADERS` (Layer 1.5).
const DEFAULT_PERSIST_HEADERS: &[&str] = &[
    // How the body is encoded, displayed, or labeled.
    "content-disposition",
    "content-encoding",
    "content-language",
    "content-md5",
    // `last-modified` is deliberately absent. The HMAC-chained `ETag`
    // is Elastik's canonical version identifier; storing
    // `Last-Modified` would invite clients to send
    // `If-Modified-Since` and sidestep the audit-chained
    // `If-None-Match` flow. Revisit that contract before re-adding it.
    //
    // Caching directives attached to the body.
    "cache-control",
    "expires",
    // The CORS response family, shipped in full: a partial set would
    // surprise an operator serving bytes to a browser.
    "access-control-allow-origin",
    "access-control-allow-methods",
    "access-control-allow-headers",
    "access-control-allow-credentials",
    "access-control-expose-headers",
    "access-control-max-age",
    // Browser security policies for HTML / JS / image bodies.
    "content-security-policy",
    "content-security-policy-report-only",
    "x-frame-options",
    "permissions-policy",
    "cross-origin-resource-policy",
    "cross-origin-opener-policy",
    "cross-origin-embedder-policy",
    // Response-policy hints that accompany the body but are not part
    // of the CSP family.
    "referrer-policy",
    "x-robots-tag",
];

/// Membership test against the Layer 2 default allow set. Same caller
/// contract as `is_never_persisted_header`: `name_lower` must already
/// be ASCII-lowercased, because every table entry is lowercase and
/// the comparison is exact.
fn is_default_persisted_header(name_lower: &str) -> bool {
    DEFAULT_PERSIST_HEADERS
        .iter()
        .any(|known| *known == name_lower)
}

/// Write-time persist decision for a single header name, applied by
/// `request_meta_headers`. Four layers are consulted in this order:
///
///   L1   hard deny (hardcoded): security / transport / tracing /
///        cloud / IP-leak / pseudo-header pollutants. Nothing
///        overrides it.
///   L1.5 user deny (`ELASTIK_DENY_HEADERS`): lets an operator
///        subtract from the L2 defaults (e.g. "don't round-trip
///        `cache-control` in my deployment"). Uses the same exact +
///        `*`-prefix matcher as L3 and takes precedence over both L2
///        and L3.
///   L2   default allow (hardcoded): standard representation headers
///        that travel with the body. Kept unless L1 or L1.5 rejected
///        the name first.
///   L3   user allow (`ELASTIK_PERSIST_HEADERS`): operator-supplied
///        additions such as `x-author` or `x-my-*`.
///
/// A name that reaches the end without an L2 or L3 hit is dropped:
/// custom headers are default-deny, standard representation headers
/// are default-allow, and L1.5 / L3 are the operator's two knobs.
///
/// `name_lower` must already be ASCII-lowercased.
pub(crate) fn should_persist_for_storage(
    name_lower: &str,
    user_allow: &HeaderAllowlist,
    user_deny: &HeaderAllowlist,
) -> bool {
    // L1, then L1.5: either one vetoes the header outright.
    let vetoed = is_never_persisted_header(name_lower) || user_deny.matches(name_lower);
    // L2, then L3: either one is enough to keep it.
    !vetoed && (is_default_persisted_header(name_lower) || user_allow.matches(name_lower))
}

/// Appends stored metadata headers to an outgoing header list.
///
/// `headers` is `Stage.headers` as loaded from SQLite, which
/// `request_meta_headers` already filtered when the world was
/// written. Only the L1 hard deny is re-checked on the way out:
/// L1.5 / L2 / L3 are intentionally not re-applied, so changing
/// `ELASTIK_PERSIST_HEADERS` or `ELASTIK_DENY_HEADERS` after the fact
/// does not alter what already-stored worlds return (re-PUT a world
/// to scrub its headers). Keeping L1 active means a write-time policy
/// bug or a corrupted row still cannot replay credentials or tracing
/// context.
///
/// Pairs whose name or value is not a valid HTTP header are skipped.
pub(crate) fn apply_meta_headers(
    headers: &[(String, String)],
    out: &mut Vec<(HeaderName, HeaderValue)>,
) {
    let replayable = headers.iter().filter_map(|(stored_name, stored_value)| {
        // Stored names come from the database, not from axum, so they
        // are lowercased here before the L1 check.
        if is_never_persisted_header(&stored_name.to_ascii_lowercase()) {
            return None;
        }
        let name = HeaderName::from_bytes(stored_name.as_bytes()).ok()?;
        let value = HeaderValue::from_str(stored_value).ok()?;
        Some((name, value))
    });
    out.extend(replayable);
}

/// Byte set that `world_url` percent-encodes when turning a world
/// name into a URL path: the `CONTROLS` base set from
/// `percent_encoding`, plus space and the punctuation listed below.
/// `/` is not in the set, so it is left unencoded.
const URL_PATH_ENCODE: &AsciiSet = &CONTROLS
    .add(b' ')
    .add(b'"')
    .add(b'#')
    .add(b'%')
    .add(b'<')
    .add(b'>')
    .add(b'?')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'`')
    .add(b'{')
    .add(b'|')
    .add(b'}');

/// Returns the URL path for `world_name`: a leading `/` followed by
/// the name percent-encoded with `URL_PATH_ENCODE`.
pub(crate) fn world_url(world_name: &str) -> String {
    let encoded = utf8_percent_encode(world_name, URL_PATH_ENCODE);
    let mut path = String::from("/");
    path.push_str(&encoded.to_string());
    path
}

/// Appends the two `Link` headers for a world to `out`: a
/// `rel="monitor"` link to `/listen<world url>` followed by a
/// `rel="collection"` link to `/proc/worlds`. The monitor link is
/// left out if it does not form a valid header value.
pub(crate) fn apply_world_links(world_name: &str, out: &mut Vec<(HeaderName, HeaderValue)>) {
    let target = world_url(world_name);
    let monitor = HeaderValue::from_str(&format!("</listen{target}>; rel=\"monitor\""));
    let collection = HeaderValue::from_static("</proc/worlds>; rel=\"collection\"");

    // `Option` iterates zero or one times, so an invalid monitor
    // value simply contributes nothing.
    out.extend(monitor.ok().map(|value| (header::LINK, value)));
    out.push((header::LINK, collection));
}

/// Builds the unquoted ETag for an HMAC value: `hmac-` followed by
/// `hmac` exactly as given.
pub(crate) fn hmac_etag(hmac: &str) -> String {
    const PREFIX: &str = "hmac-";
    let mut tag = String::with_capacity(PREFIX.len() + hmac.len());
    tag.push_str(PREFIX);
    tag.push_str(hmac);
    tag
}

/// Builds the unquoted ETag for raw body bytes: `sha256-` followed by
/// the output of `world::sha256_hex` for `body`.
pub(crate) fn body_etag(body: &[u8]) -> String {
    let digest = world::sha256_hex(body);
    format!("sha256-{digest}")
}

/// Wraps `etag` in double quotes and converts it to a header value.
/// If the quoted text is not a valid header value, the literal
/// `"invalid"` (quotes included) is returned instead.
pub(crate) fn etag_header(etag: &str) -> HeaderValue {
    let quoted = format!("\"{etag}\"");
    match HeaderValue::from_str(&quoted) {
        Ok(value) => value,
        Err(_) => HeaderValue::from_static("\"invalid\""),
    }
}

/// Evaluates the `If-Match` / `If-None-Match` preconditions of a
/// write against the world's current ETag.
///
/// Returns `Ok(())` when the write may proceed. Returns `Err` with a
/// ready-made response when a precondition fails (built by
/// `precondition_failed`) or when the current world cannot be read
/// (built by `storage_error`). The world is only read when at least
/// one of the two headers is present.
///
/// * `If-Match` uses strong comparison and requires the world to
///   exist. It passes when any listed tag (or `*`) matches.
/// * `If-None-Match` uses weak comparison. It fails when the world
///   exists and any listed tag (or `*`) matches.
///
/// Every field line of each header is considered, not just the first.
/// A value that is not readable as text can never equal the current
/// ETag, so it counts as "no match": an unreadable `If-Match` fails
/// the precondition instead of being silently skipped, which would
/// turn a conditional write into an unconditional one.
// The `Err` payload is a whole `Response`, which trips
// `result_large_err`; callers hand it straight back to the client.
#[allow(clippy::result_large_err)]
pub(crate) fn check_write_preconditions(
    core: &Core,
    world_name: &str,
    req_headers: &HeaderMap,
) -> Result<(), Response> {
    let has_if_match = req_headers.contains_key(header::IF_MATCH);
    let has_if_none_match = req_headers.contains_key(header::IF_NONE_MATCH);
    if !has_if_match && !has_if_none_match {
        return Ok(());
    }

    let current = core
        .read_world_with_etag(world_name)
        .map_err(|e| storage_error("precondition read", e))?;
    let current_tag = current.as_ref().map(|(_, etag)| etag.clone());

    if has_if_match {
        let Some(tag) = &current_tag else {
            return Err(precondition_failed("If-Match requires an existing world"));
        };
        let matched = req_headers
            .get_all(header::IF_MATCH)
            .iter()
            .filter_map(|v| v.to_str().ok())
            .any(|h| etag_list_strong_matches(h, tag));
        if !matched {
            return Err(precondition_failed("If-Match did not match current ETag"));
        }
    }

    // With no existing world there is nothing for `If-None-Match` to
    // match, so the write is allowed.
    if let Some(tag) = &current_tag {
        let matched = req_headers
            .get_all(header::IF_NONE_MATCH)
            .iter()
            .filter_map(|v| v.to_str().ok())
            .any(|h| etag_list_weak_matches(h, tag));
        if matched {
            return Err(precondition_failed("If-None-Match matched current ETag"));
        }
    }

    Ok(())
}

/// Returns `true` when the request's `If-None-Match` header weakly
/// matches `current` (the unquoted current ETag). A missing header,
/// or one whose value is not readable as text, yields `false`.
pub(crate) fn read_not_modified(req_headers: &HeaderMap, current: &str) -> bool {
    let candidates = req_headers
        .get(header::IF_NONE_MATCH)
        .and_then(|value| value.to_str().ok());
    match candidates {
        Some(list) => etag_list_weak_matches(list, current),
        None => false,
    }
}

/// Strong ETag comparison over a comma-separated header value.
///
/// `current` is the unquoted ETag. Returns `true` when any list
/// member, after trimming, is `*` or is exactly `current` wrapped in
/// double quotes. Weak (`W/`-prefixed) members never match.
pub(crate) fn etag_list_strong_matches(header_value: &str, current: &str) -> bool {
    let wanted = format!("\"{current}\"");
    for member in header_value.split(',') {
        let member = member.trim();
        if member == "*" || member == wanted {
            return true;
        }
    }
    false
}

/// Weak ETag comparison over a comma-separated header value.
///
/// `current` is the unquoted ETag. Returns `true` when any list
/// member, after trimming, is `*`, or equals `current` in double
/// quotes either directly or behind a `W/` weakness marker.
pub(crate) fn etag_list_weak_matches(header_value: &str, current: &str) -> bool {
    let wanted = format!("\"{current}\"");
    header_value.split(',').any(|member| {
        let member = member.trim();
        // `wanted` always begins with `"`, so a member that starts
        // with `W/` can only match once the marker is removed, and a
        // member without the marker is compared unchanged.
        let opaque = member.strip_prefix("W/").unwrap_or(member);
        member == "*" || opaque == wanted
    })
}

/// Resolves the byte range to serve, taking `If-Range` into account.
///
/// When a readable `If-Range` header is present and does not strongly
/// match `current_etag`, the `Range` header is disregarded and
/// `Ok(None)` is returned. In every other case the result is whatever
/// `parse_range` yields for a representation of `len` bytes.
pub(crate) fn effective_range(
    req_headers: &HeaderMap,
    len: usize,
    current_etag: &str,
) -> Result<Option<(usize, usize)>, ()> {
    let validator = req_headers
        .get(header::IF_RANGE)
        .and_then(|value| value.to_str().ok());
    match validator {
        Some(tag) if !if_range_strong_matches(tag, current_etag) => Ok(None),
        _ => parse_range(req_headers, len),
    }
}

/// Parses a single-range `Range: bytes=...` header against a
/// representation of `len` bytes.
///
/// Returns:
/// * `Ok(None)` -- no readable `Range` header, or a multi-range
///   request (contains `,`); the caller serves the whole body.
/// * `Ok(Some((start, end)))` -- inclusive byte positions, with `end`
///   clamped to `len - 1`.
/// * `Err(())` -- the unit is not `bytes`, the syntax is malformed,
///   or the range cannot be satisfied (empty representation, start
///   at or past the end, zero-length suffix, end before start).
///
/// Positions must be plain decimal digits. An all-digit value too
/// large for `usize` saturates instead of failing, so an oversized
/// last position or suffix length is clamped to the representation
/// just like any other value past the end.
pub(crate) fn parse_range(
    req_headers: &HeaderMap,
    len: usize,
) -> Result<Option<(usize, usize)>, ()> {
    /// Parses one byte position or suffix length (`1*DIGIT`).
    /// `str::parse` alone would also accept a leading `+`.
    fn parse_pos(digits: &str) -> Result<usize, ()> {
        if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
            return Err(());
        }
        // The input is digits only, so the sole failure mode left is
        // overflow; saturate and let the caller clamp to `len`.
        Ok(digits.parse().unwrap_or(usize::MAX))
    }

    let Some(raw) = req_headers.get(header::RANGE).and_then(|v| v.to_str().ok()) else {
        return Ok(None);
    };
    let Some(spec) = raw.trim().strip_prefix("bytes=") else {
        return Err(());
    };
    if spec.contains(',') {
        return Ok(None);
    }
    let Some((left, right)) = spec.split_once('-') else {
        return Err(());
    };
    // No range is satisfiable on an empty representation; bailing out
    // here also keeps the `len - 1` arithmetic below from underflowing.
    if len == 0 {
        return Err(());
    }

    // Suffix form `-N`: the final N bytes (the whole body if N > len).
    if left.is_empty() {
        let suffix = parse_pos(right)?;
        if suffix == 0 {
            return Err(());
        }
        let take = suffix.min(len);
        return Ok(Some((len - take, len - 1)));
    }

    // `start-` or `start-end`.
    let start = parse_pos(left)?;
    if start >= len {
        return Err(());
    }
    let end = if right.is_empty() {
        len - 1
    } else {
        parse_pos(right)?
    };
    if end < start {
        return Err(());
    }
    Ok(Some((start, end.min(len - 1))))
}

/// Returns `true` when the trimmed `If-Range` value is exactly
/// `current` wrapped in double quotes. Anything else -- a weak tag, a
/// date, an unquoted tag -- is a mismatch.
pub(crate) fn if_range_strong_matches(header_value: &str, current: &str) -> bool {
    let unquoted = header_value
        .trim()
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'));
    unquoted == Some(current)
}

/// Returns the request's trimmed `Content-Type` value, or
/// `application/octet-stream` when the header is absent, not readable
/// as text, or blank after trimming.
pub(crate) fn request_content_type(headers: &HeaderMap) -> String {
    let declared = headers
        .get(header::CONTENT_TYPE)
        .and_then(|value| value.to_str().ok())
        .map(str::trim);
    match declared {
        Some(content_type) if !content_type.is_empty() => content_type.to_owned(),
        _ => "application/octet-stream".to_owned(),
    }
}

/// Collects the request headers that should be stored alongside the
/// body, as `(lowercase name, value)` pairs sorted by name.
///
/// A header is kept when `should_persist_for_storage` accepts its
/// name under the given `user_allow` / `user_deny` lists. Values that
/// are not readable as text are skipped.
///
/// When a name appears on several field lines, the readable values
/// are combined in the order they are iterated, separated by `", "`
/// (the list form HTTP defines as equivalent to repeated field
/// lines), so no line is lost. Each name still yields exactly one
/// pair.
pub(crate) fn request_meta_headers(
    headers: &HeaderMap,
    user_allow: &HeaderAllowlist,
    user_deny: &HeaderAllowlist,
) -> Vec<(String, String)> {
    let mut out: BTreeMap<String, String> = BTreeMap::new();
    for (k, v) in headers {
        // `HeaderName::as_str()` is already lowercase; normalizing
        // again keeps the policy check independent of that detail.
        let name = k.as_str().to_ascii_lowercase();
        if !should_persist_for_storage(&name, user_allow, user_deny) {
            continue;
        }
        let Ok(val) = v.to_str() else {
            continue;
        };
        out.entry(name)
            .and_modify(|joined| {
                joined.push_str(", ");
                joined.push_str(val);
            })
            .or_insert_with(|| val.to_owned());
    }
    out.into_iter().collect()
}

/// Layer 1 of the persist policy: the hardcoded deny list that no
/// other layer can override.
///
/// **Caller contract: `name_lower` must already be ASCII-lowercased.**
/// Header names are case-insensitive (RFC 7230), and axum's
/// `HeaderName::as_str()` hands back the canonical lowercase form, so
/// names taken from an axum `HeaderMap` need no extra work. Names
/// from anywhere else (`Stage.headers` loaded from SQLite, env-var
/// allowlists, test fixtures) must go through `.to_ascii_lowercase()`
/// first. Everything below compares against lowercase literals, so a
/// mixed-case name produces a false negative.
pub(crate) fn is_never_persisted_header(name_lower: &str) -> bool {
    // Whole header families, denied by prefix.
    const DENIED_PREFIXES: [&str; 7] = [
        "sec-",
        "access-control-request-",
        "want-",
        // HTTP/2 and HTTP/3 pseudo-headers (`:method`, `:path`,
        // `:scheme`, `:authority`, `:status`) are wire-level
        // metadata, never real application headers. Should axum ever
        // expose one as an ordinary header (server bug, future spec
        // change), it still must not reach stored representation.
        // Defense in depth.
        ":",
        // Zipkin multi-header trace propagation (`x-b3-traceid`,
        // `x-b3-spanid`, `x-b3-sampled`, ...) describes one call, not
        // the stored data. If APM auto-injection leaked into a write,
        // later readers would receive the writer's trace ID and
        // confuse every downstream tracing/correlation system.
        "x-b3-",
        // Injected at runtime by AWS ALB / CloudFront / API Gateway:
        // `x-amzn-trace-id` (X-Ray), `x-amzn-requestid`,
        // `x-amzn-mtls-clientcert`, and so on.
        "x-amzn-",
        // Injected at runtime by Cloudflare: `cf-ray`,
        // `cf-connecting-ip`, `cf-visitor`, `cf-ipcountry`,
        // `cf-warp-tag-id`, etc. They describe the request's trip
        // through Cloudflare's edge, not the stored representation.
        "cf-",
    ];
    if DENIED_PREFIXES
        .iter()
        .any(|family| name_lower.starts_with(*family))
    {
        return true;
    }

    // Individual names, denied by exact match.
    matches!(
        name_lower,
        // Credentials and ambient identity must never be served back
        // as stored data.
        "authorization"
            | "proxy-authorization"
            | "cookie"
            | "set-cookie"
            // Hop-by-hop and transport headers belong to this one
            // request, not to the stored representation.
            | "host"
            | "connection"
            | "keep-alive"
            | "proxy-authenticate"
            | "proxy-connection"
            | "te"
            | "trailer"
            | "transfer-encoding"
            | "upgrade"
            | "http2-settings"
            // Request controls: consumed at write/read time, then gone.
            | "accept"
            | "accept-charset"
            | "accept-encoding"
            | "accept-language"
            | "expect"
            | "from"
            | "max-forwards"
            | "origin"
            | "prefer"
            | "range"
            | "referer"
            | "referrer"
            | "dnt"
            | "user-agent"
            | "if-match"
            | "if-none-match"
            | "if-range"
            | "if-modified-since"
            | "if-unmodified-since"
            // Browser client hints: request-only, not response metadata.
            | "device-memory"
            | "downlink"
            | "dpr"
            | "ect"
            | "rtt"
            | "save-data"
            | "width"
            | "viewport-width"
            // Browser/client negotiation state, consumed per request.
            | "accept-ch"
            | "alt-used"
            | "attribution-reporting-eligible"
            | "available-dictionary"
            | "dictionary-id"
            | "early-data"
            | "idempotency-key"
            | "service-worker"
            | "service-worker-navigation-preload"
            | "upgrade-insecure-requests"
            // Server/transport advertisements describe this response
            // or stream, not the stored bytes.
            | "alt-svc"
            | "server-timing"
            | "retry-after"
            | "x-powered-by"
            | "preference-applied"
            | "priority"
            | "critical-ch"
            | "clear-site-data"
            // Core-owned response headers, derived from stored
            // bytes/audit. Content-Type is persisted separately as
            // Stage.content_type.
            | "content-type"
            | "content-length"
            | "etag"
            | "accept-ranges"
            | "content-range"
            | "link"
            | "location"
            | "allow"
            | "date"
            | "server"
            | "www-authenticate"
            | "age"
            | "vary"
            | "x-request-id"
            | "x-elapsed-us"
            | "x-elapsed-ms"
            | "x-content-type-options"
            // The proxy trail says how the request arrived, not what
            // was written.
            | "forwarded"
            | "via"
            | "x-forwarded-for"
            | "x-forwarded-host"
            | "x-forwarded-proto"
            | "x-real-ip"
            // Further client-IP forwarding headers set by load
            // balancers and CDNs: `true-client-ip` (Akamai,
            // Cloudflare Enterprise) and the legacy `client-ip` used
            // by older proxies. Same data class as `x-forwarded-for`.
            | "true-client-ip"
            | "client-ip"
            // Distributed tracing context: W3C Trace Context
            // (`traceparent` / `tracestate`), W3C Baggage, and
            // Zipkin's single-header `b3` form. These describe the
            // request's RPC link, not the stored representation, and
            // are auto-injected by modern APM / OpenTelemetry agents.
            // Persisting them would make the next reader replay the
            // writer's trace ID and corrupt downstream tracing.
            | "traceparent"
            | "tracestate"
            | "baggage"
            | "b3"
            // Transport version marker. `http2-settings` (listed with
            // the transport headers above) is the HTTP/1.1->HTTP/2
            // upgrade negotiation; `http3-settings` is its QUIC
            // analog. Either one in stored data would mean the
            // listener let upgrade traffic through. Defensive.
            | "http3-settings"
            // HTTP/1.0 cache control. `Pragma: no-cache` is a
            // per-request directive, not representation metadata; a
            // fossil from RFC 1945. Denying it closes the round-trip
            // edge case where an old client tags a write.
            | "pragma"
    )
}

/// Builds the `304 Not Modified` response for a world.
///
/// The response has an empty body and carries, in order: the quoted
/// `ETag`, the stored `Content-Type` (or `application/octet-stream`
/// when the stored value is not a valid header value),
/// `Accept-Ranges: bytes`, the world's `Link` headers, and the stored
/// metadata headers that pass `apply_meta_headers`.
pub(crate) fn not_modified(world_name: &str, etag: &str, stage: &Stage) -> Response {
    let validator = etag_header(etag);
    let content_type = match HeaderValue::from_str(&stage.content_type) {
        Ok(value) => value,
        Err(_) => HeaderValue::from_static("application/octet-stream"),
    };

    let mut headers = vec![
        (header::ETAG, validator),
        (header::CONTENT_TYPE, content_type),
        (header::ACCEPT_RANGES, HeaderValue::from_static("bytes")),
    ];
    apply_world_links(world_name, &mut headers);
    apply_meta_headers(&stage.headers, &mut headers);

    let reply = (StatusCode::NOT_MODIFIED, to_header_map(headers), "");
    reply.into_response()
}

/// Builds the `416 Range Not Satisfiable` response for a
/// representation of `len` bytes: a plain-text body,
/// `Accept-Ranges: bytes`, and `Content-Range: bytes */<len>`.
pub(crate) fn range_not_satisfiable(len: usize) -> Response {
    let unsatisfied = format!("bytes */{len}");
    let content_type = HeaderValue::from_static("text/plain; charset=utf-8");
    let accept_ranges = HeaderValue::from_static("bytes");
    // `unsatisfied` is a fixed ASCII prefix plus decimal digits, so
    // the conversion cannot fail.
    let content_range = HeaderValue::from_str(&unsatisfied).unwrap();

    let headers = vec![
        (header::CONTENT_TYPE, content_type),
        (header::ACCEPT_RANGES, accept_ranges),
        (header::CONTENT_RANGE, content_range),
    ];
    let reply = (
        StatusCode::RANGE_NOT_SATISFIABLE,
        to_header_map(headers),
        "range not satisfiable\n",
    );
    reply.into_response()
}