mockforge-kafka 0.3.143

Kafka protocol support for MockForge
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
//! Non-flexible Fetch codec for v4–v11.
//!
//! Modern clients auto-negotiate to Fetch v12 (flexible), which
//! `fetch_codec` already handles. Older clients stuck on librdkafka 1.8.x —
//! notably `edenhill/kcat:1.7.1` — send Fetch v11, which is non-flexible.
//!
//! v4 is the floor:
//!   - v3 added `max_bytes` at the top of the request.
//!   - v4 added `isolation_level`; every response from v4 onward carries
//!     `last_stable_offset` + `aborted_transactions` per partition, which
//!     clients parse unconditionally at v4+.
//!
//! Supporting v0–v3 would mean a second per-partition response shape
//! (without `last_stable_offset` / `aborted_transactions`) to satisfy a
//! client class that essentially no longer exists in the wild, so we skip
//! it. v4 is the minimum version any client released in the last ~7 years
//! sends.
//!
//! Request body shape differs across the v4–v11 range:
//!   v4:      replica_id, max_wait_ms, min_bytes, max_bytes, isolation_level,
//!            topics[name, partitions[partition_index, fetch_offset,
//!                                    partition_max_bytes]]
//!   v5–v6:   +per-partition log_start_offset (between fetch_offset and
//!            partition_max_bytes)
//!   v7–v8:   +session_id, session_epoch at top;
//!            +forgotten_topics_data array at end (before rack_id in v11)
//!   v9–v10:  +per-partition current_leader_epoch (between partition_index
//!            and fetch_offset)
//!   v11:     +rack_id (non-compact STRING at end)
//!
//! Response body shape:
//!   v4:      throttle_time_ms at top; per-partition has partition_index,
//!            error_code, high_watermark, last_stable_offset,
//!            aborted_transactions array, records.
//!   v5+:     per-partition adds log_start_offset (between
//!            last_stable_offset and aborted_transactions).
//!   v7+:     top gains error_code + session_id (after throttle_time_ms).
//!   v11:     per-partition adds preferred_read_replica (between
//!            aborted_transactions and records).
//!
//! Response header is v0 for every non-flexible version: just
//! correlation_id, no tag buffer.

use crate::fetch_codec::{
    FetchPartitionRequest, FetchRequestV12, FetchTopicRequest, FetchTopicResponse,
};

// =========================================================================
// Non-flexible wire primitives
// =========================================================================

/// Split the first `n` bytes off the front of `buf` and advance the cursor
/// past them.
///
/// On success the returned slice is exactly `n` bytes long and `buf` points
/// at the remainder. If fewer than `n` bytes are available, `buf` is left
/// untouched and a "short read" message is returned.
fn take<'a>(buf: &mut &'a [u8], n: usize) -> Result<&'a [u8], String> {
    // Copy the inner slice reference out so the result keeps lifetime `'a`
    // rather than the shorter borrow of the cursor itself.
    let remaining: &'a [u8] = *buf;
    let available = remaining.len();
    if n > available {
        return Err(format!("short read: wanted {n}, have {available}"));
    }
    let (front, rest) = remaining.split_at(n);
    *buf = rest;
    Ok(front)
}

/// Consume one byte from the cursor and reinterpret it as a signed INT8.
///
/// Fails with the `take` short-read message when the cursor is empty.
fn read_i8(buf: &mut &[u8]) -> Result<i8, String> {
    let raw = take(buf, 1)?;
    Ok(i8::from_be_bytes([raw[0]]))
}

/// Consume two bytes from the cursor and decode them as a big-endian INT16.
///
/// Fails with the `take` short-read message when fewer than two bytes remain.
fn read_i16(buf: &mut &[u8]) -> Result<i16, String> {
    take(buf, 2).map(|raw| i16::from_be_bytes([raw[0], raw[1]]))
}

/// Consume four bytes from the cursor and decode them as a big-endian INT32.
///
/// Fails with the `take` short-read message when fewer than four bytes remain.
fn read_i32(buf: &mut &[u8]) -> Result<i32, String> {
    let mut word = [0u8; 4];
    // `take` hands back exactly four bytes, so the lengths always match.
    word.copy_from_slice(take(buf, 4)?);
    Ok(i32::from_be_bytes(word))
}

/// Consume eight bytes from the cursor and decode them as a big-endian INT64.
///
/// Fails with the `take` short-read message when fewer than eight bytes
/// remain.
fn read_i64(buf: &mut &[u8]) -> Result<i64, String> {
    let raw = take(buf, 8)?;
    let mut word = [0u8; 8];
    // `raw` is exactly eight bytes long, so the lengths always match.
    word.copy_from_slice(raw);
    Ok(i64::from_be_bytes(word))
}

/// Read a non-nullable, non-compact STRING: a big-endian INT16 byte length
/// followed by that many bytes of UTF-8.
///
/// Fails when the length prefix is negative (the encoding of a null string),
/// when the payload is shorter than the prefix claims, or when the payload
/// is not valid UTF-8.
fn read_string(buf: &mut &[u8]) -> Result<String, String> {
    match read_i16(buf)? {
        declared if declared < 0 => Err("expected non-null STRING, got null".into()),
        declared => {
            let payload = take(buf, declared as usize)?;
            String::from_utf8(payload.to_vec()).map_err(|e| format!("invalid utf8: {e}"))
        }
    }
}

/// Append `s` to `out` as a non-compact STRING: a big-endian INT16 byte
/// length followed by the UTF-8 bytes.
///
/// The length prefix is a signed 16-bit integer, so at most `i16::MAX`
/// (32 767) payload bytes can be described. Casting a larger length with
/// `as i16` would wrap — typically to a negative value, which a reader
/// interprets as a null string — while the full payload still followed,
/// desynchronising the rest of the frame. Oversized input is therefore
/// truncated to the longest prefix that fits and ends on a UTF-8 character
/// boundary, so the prefix and payload always agree.
fn push_string(out: &mut Vec<u8>, s: &str) {
    const MAX_LEN: usize = i16::MAX as usize;

    let mut end = s.len().min(MAX_LEN);
    // Never cut a multi-byte character in half. Index 0 is always a
    // boundary, so this loop terminates.
    while !s.is_char_boundary(end) {
        end -= 1;
    }

    // `end <= i16::MAX`, so this cast is lossless.
    out.extend_from_slice(&(end as i16).to_be_bytes());
    out.extend_from_slice(&s.as_bytes()[..end]);
}

// Version gates: each predicate answers "does non-flexible Fetch version `v`
// carry this field?" by comparing `v` with the version that introduced it.

/// Requests carry a per-partition `log_start_offset` from v5 onward.
fn has_log_start_offset_in_request(v: i16) -> bool {
    const SINCE: i16 = 5;
    v >= SINCE
}

/// Requests carry `session_id` / `session_epoch` and the trailing
/// `forgotten_topics_data` array from v7 onward.
fn has_session_fields(v: i16) -> bool {
    const SINCE: i16 = 7;
    v >= SINCE
}

/// Requests carry a per-partition `current_leader_epoch` from v9 onward.
fn has_current_leader_epoch(v: i16) -> bool {
    const SINCE: i16 = 9;
    v >= SINCE
}

/// Requests end with a `rack_id` string from v11 onward.
fn has_rack_id(v: i16) -> bool {
    const SINCE: i16 = 11;
    v >= SINCE
}

/// Responses carry a per-partition `log_start_offset` from v5 onward.
fn has_log_start_offset_in_response(v: i16) -> bool {
    const SINCE: i16 = 5;
    v >= SINCE
}

/// Responses carry a top-level `error_code` and `session_id` from v7 onward.
fn has_response_session_fields(v: i16) -> bool {
    const SINCE: i16 = 7;
    v >= SINCE
}

/// Responses carry a per-partition `preferred_read_replica` from v11 onward.
fn has_preferred_read_replica(v: i16) -> bool {
    const SINCE: i16 = 11;
    v >= SINCE
}

// =========================================================================
// Fetch v4–v11 request parser
// =========================================================================

/// Parse a non-flexible Fetch request body for any version in `4..=11`.
///
/// Returns the same `FetchRequestV12` shape the broker already consumes;
/// fields that don't exist in older versions (e.g. `session_id` pre-v7) are
/// populated with the spec-defined default (0 for session_id). Wire fields
/// that `FetchRequestV12` has no slot for (`replica_id`, `isolation_level`,
/// `session_epoch`, `current_leader_epoch`, `log_start_offset`,
/// `forgotten_topics_data`, `rack_id`) are decoded only to keep the cursor
/// aligned and are then dropped. Bytes after the last expected field are
/// ignored.
///
/// # Errors
///
/// Returns a message when `api_version` is outside `4..=11`, when the body
/// ends before a field is complete, when the topic or partition count is
/// negative, or when a STRING is null or not valid UTF-8.
pub fn parse_fetch_v4_v11(api_version: i16, body: &[u8]) -> Result<FetchRequestV12, String> {
    // Smallest possible wire size of one topic entry: STRING length prefix
    // (2) + partition array length (4).
    const MIN_TOPIC_BYTES: usize = 6;
    // Smallest possible wire size of one partition entry (the v4 layout):
    // partition_index (4) + fetch_offset (8) + partition_max_bytes (4).
    const MIN_PARTITION_BYTES: usize = 16;

    // Array lengths come straight off the wire, so a corrupt or hostile
    // frame can declare up to i32::MAX entries in a handful of bytes.
    // Reserve only as many slots as the remaining bytes could possibly
    // encode; an over-declared count then fails with an ordinary short read
    // instead of attempting a multi-gigabyte allocation first.
    fn bounded_capacity(declared: i32, remaining: usize, min_entry_bytes: usize) -> usize {
        (declared.max(0) as usize).min(remaining / min_entry_bytes)
    }

    if !(4..=11).contains(&api_version) {
        return Err(format!("parse_fetch_v4_v11 called with unsupported version {api_version}"));
    }
    let mut cur = body;

    let _replica_id = read_i32(&mut cur)?;
    let max_wait_ms = read_i32(&mut cur)?;
    let min_bytes = read_i32(&mut cur)?;
    // max_bytes added in v3, always present at v4+.
    let max_bytes = read_i32(&mut cur)?;
    // isolation_level added in v4.
    let _isolation_level = read_i8(&mut cur)?;

    // session fields introduced in v7.
    let session_id = if has_session_fields(api_version) {
        let id = read_i32(&mut cur)?;
        let _session_epoch = read_i32(&mut cur)?;
        id
    } else {
        0
    };

    let topics_count = read_i32(&mut cur)?;
    if topics_count < 0 {
        return Err(format!("fetch topics count is negative: {topics_count}"));
    }
    let mut topics =
        Vec::with_capacity(bounded_capacity(topics_count, cur.len(), MIN_TOPIC_BYTES));

    for _ in 0..topics_count {
        let topic = read_string(&mut cur)?;
        let parts_count = read_i32(&mut cur)?;
        if parts_count < 0 {
            return Err(format!("fetch partitions count for {topic} is negative"));
        }
        let mut partitions =
            Vec::with_capacity(bounded_capacity(parts_count, cur.len(), MIN_PARTITION_BYTES));
        for _ in 0..parts_count {
            let partition_index = read_i32(&mut cur)?;
            // current_leader_epoch added in v9, between index and offset.
            if has_current_leader_epoch(api_version) {
                let _current_leader_epoch = read_i32(&mut cur)?;
            }
            let fetch_offset = read_i64(&mut cur)?;
            // log_start_offset added in v5, between offset and max bytes.
            if has_log_start_offset_in_request(api_version) {
                let _log_start_offset = read_i64(&mut cur)?;
            }
            let partition_max_bytes = read_i32(&mut cur)?;
            partitions.push(FetchPartitionRequest {
                partition_index,
                fetch_offset,
                partition_max_bytes,
            });
        }
        topics.push(FetchTopicRequest { topic, partitions });
    }

    // forgotten_topics_data added in v7. Parse and discard. A zero or
    // negative length makes the range empty, so nothing is read for it.
    if has_session_fields(api_version) {
        let forgotten_count = read_i32(&mut cur)?;
        for _ in 0..forgotten_count {
            let _forgotten_topic = read_string(&mut cur)?;
            let forgotten_parts = read_i32(&mut cur)?;
            for _ in 0..forgotten_parts {
                let _partition = read_i32(&mut cur)?;
            }
        }
    }

    // rack_id added in v11.
    if has_rack_id(api_version) {
        let _rack_id = read_string(&mut cur)?;
    }

    Ok(FetchRequestV12 {
        max_wait_ms,
        min_bytes,
        max_bytes,
        session_id,
        topics,
    })
}

// =========================================================================
// Fetch v4–v11 response serializer
// =========================================================================

/// Build the complete wire frame for a non-flexible Fetch response.
///
/// The frame is response header v0 (`correlation_id` only) followed by a
/// body whose layout is selected by `api_version`. The version is expected
/// to lie in `4..=11`; this is enforced only by a debug assertion.
///
/// Several values are fixed by this serializer: `throttle_time_ms` and the
/// top-level `error_code` are 0, `last_stable_offset` repeats the
/// partition's `high_watermark`, `aborted_transactions` is an empty array,
/// and `preferred_read_replica` is -1.
pub fn serialize_fetch_v4_v11_response(
    correlation_id: i32,
    api_version: i16,
    session_id: i32,
    topics: &[FetchTopicResponse],
) -> Vec<u8> {
    debug_assert!(
        (4..=11).contains(&api_version),
        "serialize_fetch_v4_v11_response called with api_version {api_version}"
    );

    // The version gates depend only on `api_version`; resolve them once.
    let with_session = has_response_session_fields(api_version);
    let with_log_start = has_log_start_offset_in_response(api_version);
    let with_preferred_replica = has_preferred_read_replica(api_version);

    // Constants this serializer always reports.
    let throttle_time_ms: i32 = 0;
    let top_level_error: i16 = 0;
    let aborted_txn_count: i32 = 0;
    let no_preferred_replica: i32 = -1;

    let mut frame = Vec::new();

    // Response header v0, then throttle_time_ms (present in every v4+ body).
    frame.extend_from_slice(&correlation_id.to_be_bytes());
    frame.extend_from_slice(&throttle_time_ms.to_be_bytes());

    // v7+ inserts error_code + session_id ahead of the topic array.
    if with_session {
        frame.extend_from_slice(&top_level_error.to_be_bytes());
        frame.extend_from_slice(&session_id.to_be_bytes());
    }

    // Topic array with an int32 element count.
    let topic_count = topics.len() as i32;
    frame.extend_from_slice(&topic_count.to_be_bytes());

    for entry in topics.iter() {
        push_string(&mut frame, &entry.topic);

        let partition_count = entry.partitions.len() as i32;
        frame.extend_from_slice(&partition_count.to_be_bytes());

        for part in entry.partitions.iter() {
            frame.extend_from_slice(&part.partition_index.to_be_bytes());
            frame.extend_from_slice(&part.error_code.to_be_bytes());

            // No transactional state is tracked, so last_stable_offset is
            // advertised as the same value as high_watermark.
            let watermark = part.high_watermark.to_be_bytes();
            frame.extend_from_slice(&watermark); // high_watermark
            frame.extend_from_slice(&watermark); // last_stable_offset

            if with_log_start {
                frame.extend_from_slice(&part.log_start_offset.to_be_bytes());
            }

            // aborted_transactions: always an empty (length 0) array.
            frame.extend_from_slice(&aborted_txn_count.to_be_bytes());

            if with_preferred_replica {
                frame.extend_from_slice(&no_preferred_replica.to_be_bytes());
            }

            // records: non-null bytes (int32 length + payload). An empty
            // fetch sends length 0, which clients accept without triggering
            // a "MessageSetSize = -1" back-off.
            let records_len = part.records.len() as i32;
            frame.extend_from_slice(&records_len.to_be_bytes());
            frame.extend_from_slice(&part.records);
        }
    }

    frame
}

// =========================================================================
// Tests
// =========================================================================

// Unit tests for the non-flexible Fetch codec: request parsing at the two
// ends of the supported range (v4 and v11), version rejection, and exact
// byte layout of serialized responses.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::fetch_codec::{serialize_record_batch_v2, FetchPartitionResponse};
    use crate::partitions::KafkaMessage;

    /// Build a minimal v11 request body: one topic, one partition,
    /// all optional fields present.
    ///
    /// Fields are appended in wire order; the forgotten-topics array is
    /// empty and `rack_id` is the empty string.
    fn build_v11_request(topic: &str, partition: i32, fetch_offset: i64) -> Vec<u8> {
        let mut body = Vec::new();
        body.extend_from_slice(&(-1i32).to_be_bytes()); // replica_id
        body.extend_from_slice(&500i32.to_be_bytes()); // max_wait_ms
        body.extend_from_slice(&1i32.to_be_bytes()); // min_bytes
        body.extend_from_slice(&1_048_576i32.to_be_bytes()); // max_bytes
        body.push(0); // isolation_level
        body.extend_from_slice(&0i32.to_be_bytes()); // session_id
        body.extend_from_slice(&(-1i32).to_be_bytes()); // session_epoch

        body.extend_from_slice(&1i32.to_be_bytes()); // topics count = 1
        push_string(&mut body, topic);
        body.extend_from_slice(&1i32.to_be_bytes()); // partitions count = 1
        body.extend_from_slice(&partition.to_be_bytes()); // partition_index
        body.extend_from_slice(&(-1i32).to_be_bytes()); // current_leader_epoch (v9+)
        body.extend_from_slice(&fetch_offset.to_be_bytes());
        body.extend_from_slice(&(-1i64).to_be_bytes()); // log_start_offset (v5+)
        body.extend_from_slice(&65_536i32.to_be_bytes()); // partition_max_bytes

        body.extend_from_slice(&0i32.to_be_bytes()); // forgotten_topics_data count = 0
        push_string(&mut body, ""); // rack_id empty
        body
    }

    /// A fully-populated v11 body parses, and the fields that survive into
    /// `FetchRequestV12` carry the values that were encoded.
    #[test]
    fn parses_v11_request() {
        let body = build_v11_request("orders", 3, 42);
        let parsed = parse_fetch_v4_v11(11, &body).unwrap();
        assert_eq!(parsed.max_wait_ms, 500);
        assert_eq!(parsed.max_bytes, 1_048_576);
        assert_eq!(parsed.session_id, 0);
        assert_eq!(parsed.topics.len(), 1);
        assert_eq!(parsed.topics[0].topic, "orders");
        assert_eq!(parsed.topics[0].partitions[0].partition_index, 3);
        assert_eq!(parsed.topics[0].partitions[0].fetch_offset, 42);
        assert_eq!(parsed.topics[0].partitions[0].partition_max_bytes, 65_536);
    }

    /// The smallest supported layout (v4) parses with none of the fields
    /// introduced in later versions on the wire.
    #[test]
    fn parses_v4_request_minimal() {
        // v4: no session fields, no current_leader_epoch, no log_start_offset,
        // no forgotten_topics, no rack_id.
        let mut body = Vec::new();
        body.extend_from_slice(&(-1i32).to_be_bytes()); // replica_id
        body.extend_from_slice(&100i32.to_be_bytes()); // max_wait_ms
        body.extend_from_slice(&1i32.to_be_bytes()); // min_bytes
        body.extend_from_slice(&524_288i32.to_be_bytes()); // max_bytes
        body.push(0); // isolation_level

        body.extend_from_slice(&1i32.to_be_bytes()); // topics count
        push_string(&mut body, "t");
        body.extend_from_slice(&1i32.to_be_bytes()); // partitions count
        body.extend_from_slice(&0i32.to_be_bytes()); // partition_index
        body.extend_from_slice(&7i64.to_be_bytes()); // fetch_offset
        body.extend_from_slice(&32_768i32.to_be_bytes()); // partition_max_bytes

        let parsed = parse_fetch_v4_v11(4, &body).unwrap();
        assert_eq!(parsed.max_wait_ms, 100);
        assert_eq!(parsed.topics[0].partitions[0].fetch_offset, 7);
        assert_eq!(parsed.topics[0].partitions[0].partition_max_bytes, 32_768);
    }

    /// Versions just outside `4..=11` are rejected; the version check runs
    /// before any bytes are read, so an empty body is enough to exercise it.
    #[test]
    fn rejects_unsupported_versions() {
        assert!(parse_fetch_v4_v11(3, &[]).is_err());
        assert!(parse_fetch_v4_v11(12, &[]).is_err());
    }

    /// Build a `KafkaMessage` with the given offset and value, a fixed
    /// timestamp of 1_000, no key and no headers.
    fn msg(offset: i64, value: &[u8]) -> KafkaMessage {
        KafkaMessage {
            offset,
            timestamp: 1_000,
            key: None,
            value: value.to_vec(),
            headers: vec![],
        }
    }

    /// Build a single-topic ("t"), single-partition response fixture
    /// (partition 0, no error, high_watermark 5, log_start_offset 0)
    /// carrying `records` as the partition's records payload.
    fn one_response(records: Vec<u8>) -> Vec<FetchTopicResponse> {
        vec![FetchTopicResponse {
            topic: "t".to_string(),
            partitions: vec![FetchPartitionResponse {
                partition_index: 0,
                error_code: 0,
                high_watermark: 5,
                log_start_offset: 0,
                records,
            }],
        }]
    }

    /// Pins the exact byte layout of a v4 response, offset by offset.
    #[test]
    fn response_v4_shape() {
        // v4 per-partition: partition_index(4) + error_code(2) +
        //   high_watermark(8) + last_stable_offset(8) +
        //   aborted_transactions(4=0 entries) + records(len+bytes)
        let data = serialize_fetch_v4_v11_response(7, 4, 0, &one_response(vec![]));

        // correlation_id
        assert_eq!(&data[0..4], &7i32.to_be_bytes());
        // throttle_time_ms
        assert_eq!(&data[4..8], &0i32.to_be_bytes());
        // topics count (v4 has no session fields)
        assert_eq!(&data[8..12], &1i32.to_be_bytes());
        // topic name "t"
        assert_eq!(&data[12..14], &1i16.to_be_bytes());
        assert_eq!(&data[14..15], b"t");
        // partitions count
        assert_eq!(&data[15..19], &1i32.to_be_bytes());
        // partition layout
        assert_eq!(&data[19..23], &0i32.to_be_bytes()); // partition_index
        assert_eq!(&data[23..25], &0i16.to_be_bytes()); // error_code
        assert_eq!(&data[25..33], &5i64.to_be_bytes()); // high_watermark
        assert_eq!(&data[33..41], &5i64.to_be_bytes()); // last_stable_offset
        assert_eq!(&data[41..45], &0i32.to_be_bytes()); // aborted_transactions count
        assert_eq!(&data[45..49], &0i32.to_be_bytes()); // records length = 0
        // Total length check: nothing is emitted after the records field.
        assert_eq!(data.len(), 49);
    }

    /// Pins the exact byte layout of a v11 response, which is 18 bytes
    /// longer than the v4 frame for the same fixture (67 vs 49).
    #[test]
    fn response_v11_adds_session_and_preferred_replica() {
        // v11 extras vs v4:
        //   top: +error_code(2) + session_id(4)
        //   partition: +log_start_offset(8) + preferred_read_replica(4)
        let data = serialize_fetch_v4_v11_response(7, 11, 42, &one_response(vec![]));

        // header(4) + throttle(4) + error(2) + session(4) = 14 before topics
        assert_eq!(&data[8..10], &0i16.to_be_bytes()); // top-level error_code
        assert_eq!(&data[10..14], &42i32.to_be_bytes()); // session_id
        assert_eq!(&data[14..18], &1i32.to_be_bytes()); // topics count
        // topic name "t"
        assert_eq!(&data[18..20], &1i16.to_be_bytes());
        assert_eq!(&data[20..21], b"t");
        // partitions count
        assert_eq!(&data[21..25], &1i32.to_be_bytes());
        // partition layout
        assert_eq!(&data[25..29], &0i32.to_be_bytes()); // partition_index
        assert_eq!(&data[29..31], &0i16.to_be_bytes()); // error_code
        assert_eq!(&data[31..39], &5i64.to_be_bytes()); // high_watermark
        assert_eq!(&data[39..47], &5i64.to_be_bytes()); // last_stable_offset
        assert_eq!(&data[47..55], &0i64.to_be_bytes()); // log_start_offset
        assert_eq!(&data[55..59], &0i32.to_be_bytes()); // aborted_transactions
        assert_eq!(&data[59..63], &(-1i32).to_be_bytes()); // preferred_read_replica
        assert_eq!(&data[63..67], &0i32.to_be_bytes()); // records length = 0
        // Total length check: nothing is emitted after the records field.
        assert_eq!(data.len(), 67);
    }

    /// A non-empty records payload is written verbatim at the end of the
    /// frame, preceded by its int32 length.
    #[test]
    fn response_v11_embeds_records_blob() {
        // The records field carries a complete RecordBatch v2 blob.
        let stored = [msg(10, b"hi")];
        let refs: Vec<&KafkaMessage> = stored.iter().collect();
        let batch = serialize_record_batch_v2(&refs);
        let batch_len = batch.len();
        let topics = one_response(batch.clone());

        let data = serialize_fetch_v4_v11_response(1, 11, 0, &topics);
        // Records sit at the very end; the preceding i32 is their length.
        let len_offset = data.len() - batch_len - 4;
        let got_len = i32::from_be_bytes([
            data[len_offset],
            data[len_offset + 1],
            data[len_offset + 2],
            data[len_offset + 3],
        ]);
        assert_eq!(got_len as usize, batch_len);
        assert_eq!(&data[len_offset + 4..], batch.as_slice());
    }
}