bearing 0.1.0-alpha.2

A Rust port of Apache Lucene
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
// SPDX-License-Identifier: Apache-2.0
//! Segment infos reader and writer for the segments_N commit point file.

use std::collections::HashMap;
use std::io;

use log::debug;

use crate::codecs::codec_util;
use crate::index::SegmentCommitInfo;
use crate::index::index_file_names;
use crate::store::checksum_input::ChecksumIndexInput;
use crate::store::memory::MemoryIndexOutput;
use crate::store::{DataInput, DataOutput, Directory, SegmentFile};
use crate::util::string_helper;

/// Codec name for the segments_N file header.
///
/// Written by [`write`] through `codec_util::write_index_header` and passed to
/// `codec_util::check_header` by [`read`].
const CODEC_NAME: &str = "segments";

/// Format version: VERSION_86 = 10 (Lucene 8.6+).
///
/// [`write`] always emits this version; [`read`] passes it to `check_header` as both
/// version bounds (presumably min and max — confirm against `codec_util`).
const VERSION_CURRENT: i32 = 10;

/// Lucene version 10.3.2 — written into segments_N as the index version.
///
/// The major component is also written as the "index created version major", and the
/// full triple is written again as the minimum segment version when segments exist.
const LUCENE_VERSION_MAJOR: i32 = 10;
/// Minor component of the Lucene version written into segments_N.
const LUCENE_VERSION_MINOR: i32 = 3;
/// Bugfix component of the Lucene version written into segments_N.
const LUCENE_VERSION_BUGFIX: i32 = 2;

/// The codec name written for each segment entry.
/// For the Lucene103 codec target, this is "Lucene103".
///
/// [`write`] emits this same name for every segment entry.
const SEGMENT_CODEC_NAME: &str = "Lucene103";

/// Writes a `segments_N` file.
///
/// This is the commit point file that lists all segments in the index.
/// The generation number determines the filename suffix (e.g., generation 1 → `segments_1`).
///
/// The file is assembled in memory; nothing is written to a directory here.
///
/// Returns a [`SegmentFile`] for the segments_N file.
///
/// # Arguments
/// * `segments` — the segment commit infos to include
/// * `generation` — the commit generation (≥ 1)
/// * `version` — the segment infos version (monotonically increasing, typically matches generation)
/// * `counter` — the segment name counter (number of segments ever created)
/// * `user_data` — optional commit user data (empty for MVP)
///
/// # Errors
/// Returns an error if `generation` is negative, if the number of segments does not fit
/// in the 32-bit count field, or if any write to the in-memory output fails.
pub fn write(
    segments: &[&SegmentCommitInfo],
    generation: i64,
    version: i64,
    counter: i64,
    user_data: &HashMap<String, String>,
) -> io::Result<SegmentFile> {
    // A plain `as u64` cast would wrap a negative generation into a huge value and
    // produce a nonsensical file name, so reject it explicitly.
    let generation_u64 = u64::try_from(generation)
        .map_err(|_| io::Error::other(format!("invalid generation: {generation}")))?;

    // The segment count is stored as a 32-bit int; refuse to truncate it silently.
    let num_segments = i32::try_from(segments.len())
        .map_err(|_| io::Error::other(format!("too many segments: {}", segments.len())))?;

    let gen_suffix = index_file_names::radix36(generation_u64);
    let id = string_helper::random_id();

    let mut out = MemoryIndexOutput::new(format!("segments_{gen_suffix}"));

    // Index header: codec="segments", version=10, id=random, suffix=generation in base-36
    codec_util::write_index_header(&mut out, CODEC_NAME, VERSION_CURRENT, &id, &gen_suffix)?;

    // Lucene version (Version.LATEST = 10.3.2)
    out.write_vint(LUCENE_VERSION_MAJOR)?;
    out.write_vint(LUCENE_VERSION_MINOR)?;
    out.write_vint(LUCENE_VERSION_BUGFIX)?;

    // Index created version major
    out.write_vint(LUCENE_VERSION_MAJOR)?;

    // Segment infos version (BE long)
    out.write_be_long(version)?;

    // Counter (VLong)
    out.write_vlong(counter)?;

    // Number of segments (BE int)
    out.write_be_int(num_segments)?;

    debug!(
        "segment_infos: writing segments_{gen_suffix}, version={version}, \
         counter={counter}, num_segments={num_segments}"
    );

    // Min segment version (only if segments > 0)
    if !segments.is_empty() {
        // All segments are created by this writer, so min version = LUCENE_VERSION
        out.write_vint(LUCENE_VERSION_MAJOR)?;
        out.write_vint(LUCENE_VERSION_MINOR)?;
        out.write_vint(LUCENE_VERSION_BUGFIX)?;
    }

    // Per-segment entries; field order here must mirror what `read` expects.
    for sci in segments {
        let si = &sci.info;

        // Segment name
        out.write_string(&si.name)?;

        // Segment ID (16 bytes)
        out.write_bytes(&si.id)?;

        // Codec name
        out.write_string(SEGMENT_CODEC_NAME)?;

        // Del gen (BE long)
        out.write_be_long(sci.del_gen)?;

        // Del count (BE int)
        out.write_be_int(sci.del_count)?;

        // Field infos gen (BE long)
        out.write_be_long(sci.field_infos_gen)?;

        // Doc values gen (BE long)
        out.write_be_long(sci.doc_values_gen)?;

        // Soft del count (BE int)
        out.write_be_int(sci.soft_del_count)?;

        // SCI ID: a presence marker byte, followed by the ID bytes only when present
        match &sci.id {
            Some(sci_id) => {
                out.write_byte(1)?;
                out.write_bytes(sci_id)?;
            }
            None => {
                out.write_byte(0)?;
            }
        }

        // Field infos files (empty for fresh segment)
        out.write_set_of_strings(&[])?;

        // Doc values updates files (empty for fresh segment)
        out.write_be_int(0)?;

        debug!(
            "segment_infos: segment={} maxDoc={} compound={} delGen={} delCount={}",
            si.name, si.max_doc, si.is_compound_file, sci.del_gen, sci.del_count
        );
    }

    // User data
    out.write_map_of_strings(user_data)?;

    // Footer
    codec_util::write_footer(&mut out)?;

    Ok(out.into_inner())
}

/// A raw segment entry parsed from a `segments_N` file.
///
/// Contains only what's stored in the `segments_N` file itself — no data from
/// `.si` or `.fnm` files. The caller is responsible for reading those via the
/// appropriate codec format readers.
///
/// Fields are listed in the order [`read`] decodes them from the file.
#[derive(Debug)]
pub struct SegmentEntry {
    /// Segment name (e.g., "_0").
    pub name: String,
    /// Segment ID (16 bytes).
    pub id: [u8; codec_util::ID_LENGTH],
    /// Codec name (e.g., "Lucene103"), exactly as stored in the file.
    pub codec: String,
    /// Delete generation (stored as a big-endian long).
    pub del_gen: i64,
    /// Number of deleted documents (stored as a big-endian int).
    pub del_count: i32,
    /// Field infos generation (stored as a big-endian long).
    pub field_infos_gen: i64,
    /// Doc values generation (stored as a big-endian long).
    pub doc_values_gen: i64,
    /// Number of soft-deleted documents (stored as a big-endian int).
    pub soft_del_count: i32,
    /// Segment commit info ID (optional).
    ///
    /// `Some` when the file stores presence marker `1` followed by the ID bytes,
    /// `None` when the marker is `0`.
    pub sci_id: Option<[u8; codec_util::ID_LENGTH]>,
}

/// Result of reading a `segments_N` file.
///
/// Contains only the data stored in the `segments_N` file. Per-segment metadata
/// (`.si`, `.fnm`) must be read separately by the caller using the codec name
/// from each [`SegmentEntry`].
///
/// Derives `Debug` like [`SegmentEntry`] so a parsed commit can be logged or
/// used in assertion messages.
#[derive(Debug)]
pub struct SegmentInfosRead {
    /// The raw segment entries in this commit, in file order.
    pub segments: Box<[SegmentEntry]>,
    /// The commit generation (from the filename suffix).
    pub generation: i64,
    /// The segment infos version (monotonically increasing).
    pub version: i64,
    /// The segment name counter.
    pub counter: i64,
    /// User data written with this commit.
    pub user_data: HashMap<String, String>,
}

/// Parses the generation number from a `segments_N` filename.
///
/// Ported from `SegmentInfos.generationFromSegmentsFileName`.
///
/// - `"segments"` → generation 0
/// - `"segments_1"` → generation 1
/// - `"segments_a"` → generation 10 (base-36)
/// - `"segments_10"` → generation 36
///
/// # Errors
/// Returns an error if `file_name` is neither `"segments"` nor prefixed with
/// `"segments_"`, or if the suffix is not an unsigned base-36 number that fits
/// in an `i64` (an empty suffix or a leading `+`/`-` is rejected).
pub fn generation_from_segments_file_name(file_name: &str) -> io::Result<i64> {
    if file_name == "segments" {
        return Ok(0);
    }
    let suffix = file_name.strip_prefix("segments_").ok_or_else(|| {
        io::Error::other(format!("fileName \"{file_name}\" is not a segments file"))
    })?;
    // `from_str_radix` tolerates a leading sign, which would let "segments_-1" produce a
    // negative generation and "segments_+1" alias "segments_1". Generations are unsigned.
    if matches!(suffix.as_bytes().first(), Some(b'+' | b'-')) {
        return Err(io::Error::other(format!(
            "invalid generation in {file_name}: sign prefix is not allowed"
        )));
    }
    i64::from_str_radix(suffix, 36)
        .map_err(|e| io::Error::other(format!("invalid generation in {file_name}: {e}")))
}

/// Finds the newest commit among `files` and returns its `segments_N` file name.
///
/// Ported from `SegmentInfos.getLastCommitSegmentsFileName`.
/// Only names starting with `segments_` are considered. Each suffix is decoded as a
/// base-36 generation and the numerically largest one wins; the returned name is
/// rebuilt from that generation.
///
/// # Errors
/// Fails on the first `segments_` name whose generation cannot be parsed, or when
/// no usable `segments_N` file is present.
pub fn get_last_commit_segments_file_name(files: &[String]) -> io::Result<String> {
    // -1 doubles as the "nothing found yet" marker, so the fold starts from it.
    let newest = files
        .iter()
        .filter(|name| name.starts_with("segments_"))
        .try_fold(-1_i64, |best, name| {
            generation_from_segments_file_name(name).map(|found| best.max(found))
        })?;

    match newest {
        -1 => Err(io::Error::other("no segments_N file found in directory")),
        latest => Ok(format!(
            "segments_{}",
            index_file_names::radix36(latest as u64)
        )),
    }
}

/// Reads a `segments_N` file from `directory`.
///
/// Returns only the data stored in the `segments_N` file. Does NOT read
/// per-segment `.si` or `.fnm` files — the caller should use the codec name
/// from each [`SegmentEntry`] to dispatch to the appropriate format readers.
///
/// # Errors
/// Returns an error if the file name is not a valid segments file name, the file
/// cannot be opened, the header or footer check fails, the suffix stored in the
/// header does not match the generation from the file name, or any count or
/// marker in the body is invalid.
pub fn read(directory: &dyn Directory, segment_file_name: &str) -> io::Result<SegmentInfosRead> {
    // The segment count is read before the footer is checked, so a corrupt file can
    // claim an absurd count. Cap the up-front allocation; the vector still grows
    // as needed while entries are actually decoded.
    const MAX_PREALLOCATED_SEGMENTS: usize = 1024;

    let generation = generation_from_segments_file_name(segment_file_name)?;
    let expected_suffix = index_file_names::radix36(generation as u64);

    let input = directory.open_input(segment_file_name)?;
    let mut input = ChecksumIndexInput::new(input);

    // The segments_N file has a random ID we don't know ahead of time,
    // so use check_header (not check_index_header) then read the ID and suffix manually.
    codec_util::check_header(&mut input, CODEC_NAME, VERSION_CURRENT, VERSION_CURRENT)?;

    // Read segment infos ID (16 bytes) — we discover it here; the value itself is not used.
    let mut header_id = [0u8; codec_util::ID_LENGTH];
    input.read_bytes(&mut header_id)?;

    // Read and validate suffix (should match generation in base-36)
    let suffix_len = input.read_byte()? as usize;
    let mut suffix_bytes = vec![0u8; suffix_len];
    input.read_bytes(&mut suffix_bytes)?;
    let suffix = String::from_utf8(suffix_bytes).map_err(|e| io::Error::other(e.to_string()))?;
    if suffix != expected_suffix {
        return Err(io::Error::other(format!(
            "segments suffix mismatch: expected {expected_suffix:?}, got {suffix:?}"
        )));
    }

    // Lucene version (VInts)
    let _major = input.read_vint()?;
    let _minor = input.read_vint()?;
    let _bugfix = input.read_vint()?;

    // Index created version major
    let _index_created_version = input.read_vint()?;

    // Segment infos version (BE long)
    let version = input.read_be_long()?;

    // Counter (VLong)
    let counter = input.read_vlong()?;

    // Number of segments (BE int)
    let num_segments = input.read_be_int()?;
    if num_segments < 0 {
        return Err(io::Error::other(format!(
            "invalid segment count: {num_segments}"
        )));
    }

    // Min segment version (only present if segments > 0)
    if num_segments > 0 {
        let _min_major = input.read_vint()?;
        let _min_minor = input.read_vint()?;
        let _min_bugfix = input.read_vint()?;
    }

    // Per-segment entries
    let mut segments = Vec::with_capacity((num_segments as usize).min(MAX_PREALLOCATED_SEGMENTS));
    for _ in 0..num_segments {
        let seg_name = input.read_string()?;

        let mut seg_id = [0u8; codec_util::ID_LENGTH];
        input.read_bytes(&mut seg_id)?;

        let codec_name = input.read_string()?;

        let del_gen = input.read_be_long()?;
        let del_count = input.read_be_int()?;
        let field_infos_gen = input.read_be_long()?;
        let doc_values_gen = input.read_be_long()?;
        let soft_del_count = input.read_be_int()?;

        // Presence marker: 1 = ID bytes follow, 0 = no ID, anything else is corrupt.
        let sci_id = match input.read_byte()? {
            1 => {
                let mut id = [0u8; codec_util::ID_LENGTH];
                input.read_bytes(&mut id)?;
                Some(id)
            }
            0 => None,
            marker => {
                return Err(io::Error::other(format!("invalid SCI ID marker: {marker}")));
            }
        };

        // Field infos files (set of strings) — decoded to advance the input, then discarded.
        let _field_infos_files = input.read_set_of_strings()?;

        // Doc values updates files — decoded to advance the input, then discarded.
        let num_dv_fields = input.read_be_int()?;
        // A negative count would otherwise be skipped silently as an empty range.
        if num_dv_fields < 0 {
            return Err(io::Error::other(format!(
                "invalid doc values updates field count: {num_dv_fields}"
            )));
        }
        for _ in 0..num_dv_fields {
            let _field_number = input.read_be_int()?;
            let _files = input.read_set_of_strings()?;
        }

        segments.push(SegmentEntry {
            name: seg_name,
            id: seg_id,
            codec: codec_name,
            del_gen,
            del_count,
            field_infos_gen,
            doc_values_gen,
            soft_del_count,
            sci_id,
        });
    }

    // User data
    let user_data = input.read_map_of_strings()?;

    // Footer
    codec_util::check_footer(&mut input)?;

    debug!(
        "segment_infos: read {segment_file_name}, version={version}, \
         counter={counter}, num_segments={num_segments}"
    );

    Ok(SegmentInfosRead {
        segments: segments.into_boxed_slice(),
        generation,
        version,
        counter,
        user_data,
    })
}

// Unit tests for the segments_N writer, reader, and file-name helpers.
//
// The write tests decode the produced bytes by hand with `TestDataReader`, so the
// order of reads in each test mirrors the order of writes in `write`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codecs::codec_util::{CODEC_MAGIC, FOOTER_LENGTH, FOOTER_MAGIC};
    use crate::document::{DocValuesType, IndexOptions};
    use crate::index::{FieldInfo, FieldInfos, PointDimensionConfig, SegmentInfo};
    use crate::test_util::TestDataReader;

    /// Builds a `SegmentCommitInfo` for tests with a single field named "contents".
    ///
    /// `name`, `max_doc` and `segment_id` are forwarded to `SegmentInfo::new`;
    /// `sci_id` is forwarded to `SegmentCommitInfo::new`. The literal `true` is
    /// presumably the compound-file flag — confirm against `SegmentInfo::new`.
    fn make_test_segment_commit_info(
        name: &str,
        max_doc: i32,
        segment_id: [u8; 16],
        sci_id: Option<[u8; 16]>,
    ) -> SegmentCommitInfo {
        let si = SegmentInfo::new(
            name.to_string(),
            max_doc,
            true,
            segment_id,
            HashMap::new(),
            HashMap::new(),
        );
        let fis = FieldInfos::new(vec![FieldInfo::new(
            "contents".to_string(),
            0,
            false,
            false,
            IndexOptions::DocsAndFreqsAndPositions,
            DocValuesType::None,
            PointDimensionConfig::default(),
        )]);
        SegmentCommitInfo::new(si, fis, sci_id)
    }

    // Ported from org.apache.lucene.index.TestSegmentInfos

    // An empty commit: header, version block, zero segments, empty user data, footer.
    #[test]
    fn test_write_empty_segments() {
        let user_data = HashMap::new();
        let file = write(&[], 1, 1, 0, &user_data).unwrap();

        assert_eq!(file.name, "segments_1");

        // Verify header magic
        let mut r = TestDataReader::new(&file.data, 0);
        assert_eq!(r.read_be_int(), CODEC_MAGIC);

        // Verify footer at end
        r.pos = file.data.len() - FOOTER_LENGTH;
        assert_eq!(r.read_be_int(), FOOTER_MAGIC);
        assert_eq!(r.read_be_int(), 0); // algorithm

        // Verify data section
        r.pos = codec_util::index_header_length(CODEC_NAME, "1");

        // Version.LATEST = 10.3.2
        assert_eq!(r.read_vint(), 10);
        assert_eq!(r.read_vint(), 3);
        assert_eq!(r.read_vint(), 2);

        // indexCreatedVersionMajor
        assert_eq!(r.read_vint(), 10);

        // version (BE long)
        assert_eq!(r.read_be_long(), 1);

        // counter (VLong)
        assert_eq!(r.read_vlong(), 0);

        // num segments (BE int)
        assert_eq!(r.read_be_int(), 0);

        // No min segment version (size == 0)

        // userData (empty map: VInt 0)
        assert_eq!(r.read_vint(), 0);

        // Next should be footer
        assert_eq!(r.pos, file.data.len() - FOOTER_LENGTH);
    }

    // A commit with one segment that carries an SCI ID: checks every field of the entry.
    #[test]
    fn test_write_single_segment() {
        let seg_id = [0xABu8; 16];
        let sci_id = [0xCDu8; 16];
        let sci = make_test_segment_commit_info("_0", 3, seg_id, Some(sci_id));

        let user_data = HashMap::new();
        let file = write(&[&sci], 1, 1, 1, &user_data).unwrap();

        assert_eq!(file.name, "segments_1");

        let mut r =
            TestDataReader::new(&file.data, codec_util::index_header_length(CODEC_NAME, "1"));

        // Version.LATEST
        assert_eq!(r.read_vint(), 10);
        assert_eq!(r.read_vint(), 3);
        assert_eq!(r.read_vint(), 2);

        // indexCreatedVersionMajor
        assert_eq!(r.read_vint(), 10);

        // version (BE long)
        assert_eq!(r.read_be_long(), 1);

        // counter (VLong)
        assert_eq!(r.read_vlong(), 1);

        // num segments (BE int)
        assert_eq!(r.read_be_int(), 1);

        // min segment version (10.3.2)
        assert_eq!(r.read_vint(), 10);
        assert_eq!(r.read_vint(), 3);
        assert_eq!(r.read_vint(), 2);

        // Segment entry
        let name = r.read_string();
        assert_eq!(name, "_0");

        // Segment ID (16 bytes)
        assert_eq!(&file.data[r.pos..r.pos + 16], &[0xABu8; 16]);
        r.pos += 16;

        // Codec name
        let codec = r.read_string();
        assert_eq!(codec, "Lucene103");

        // del_gen (BE long)
        assert_eq!(r.read_be_long(), -1);

        // del_count (BE int)
        assert_eq!(r.read_be_int(), 0);

        // field_infos_gen (BE long)
        assert_eq!(r.read_be_long(), -1);

        // doc_values_gen (BE long)
        assert_eq!(r.read_be_long(), -1);

        // soft_del_count (BE int)
        assert_eq!(r.read_be_int(), 0);

        // SCI ID present (byte 1)
        assert_eq!(file.data[r.pos], 1);
        r.pos += 1;

        // SCI ID (16 bytes)
        assert_eq!(&file.data[r.pos..r.pos + 16], &[0xCDu8; 16]);
        r.pos += 16;

        // fieldInfosFiles (empty set: VInt 0)
        assert_eq!(r.read_vint(), 0);

        // docValuesUpdatesFiles (empty: BE int 0)
        assert_eq!(r.read_be_int(), 0);

        // userData (empty map: VInt 0)
        assert_eq!(r.read_vint(), 0);

        // Next should be footer
        assert_eq!(r.pos, file.data.len() - FOOTER_LENGTH);
    }

    // A segment without an SCI ID: only the marker byte 0 is written, no ID bytes.
    #[test]
    fn test_write_segment_no_sci_id() {
        let seg_id = [0x11u8; 16];
        let sci = make_test_segment_commit_info("_0", 1, seg_id, None);

        let user_data = HashMap::new();
        let file = write(&[&sci], 1, 1, 1, &user_data).unwrap();

        let mut r =
            TestDataReader::new(&file.data, codec_util::index_header_length(CODEC_NAME, "1"));

        // Skip: version(3 VInts) + indexCreatedMajor(VInt) + version(BE long) + counter(VLong) + numSegs(BE int)
        for _ in 0..4 {
            r.read_vint();
        }
        r.read_be_long();
        r.read_vlong();
        r.read_be_int();

        // Skip min segment version
        for _ in 0..3 {
            r.read_vint();
        }

        // Skip: name + segment_id + codec
        r.read_string();
        r.pos += 16;
        r.read_string();

        // Skip: del_gen + del_count + field_infos_gen + doc_values_gen + soft_del_count
        r.read_be_long();
        r.read_be_int();
        r.read_be_long();
        r.read_be_long();
        r.read_be_int();

        // SCI ID absent (byte 0, no ID bytes follow)
        assert_eq!(file.data[r.pos], 0);
        r.pos += 1;

        // fieldInfosFiles (empty)
        assert_eq!(r.read_vint(), 0);

        // docValuesUpdatesFiles (empty)
        assert_eq!(r.read_be_int(), 0);
    }

    // --- Read round-trip tests ---

    // Writing an empty commit and reading it back yields no segments and no user data.
    #[test]
    fn test_read_roundtrip_empty() {
        let user_data = HashMap::new();
        let file = write(&[], 1, 1, 0, &user_data).unwrap();

        // Put the segments_N file into a directory
        let mut dir = crate::store::MemoryDirectory::new();
        dir.write_file(&file.name, &file.data).unwrap();

        let result = read(&dir, &file.name).unwrap();
        assert_is_empty!(&result.segments);
        assert_eq!(result.version, 1);
        assert_eq!(result.counter, 0);
        assert_is_empty!(&result.user_data);
    }

    // Writing one segment and reading it back preserves name, IDs, codec, version, counter.
    #[test]
    fn test_read_roundtrip_single_segment() {
        let seg_id = [0xABu8; 16];
        let sci_id = [0xCDu8; 16];
        let sci = make_test_segment_commit_info("_0", 3, seg_id, Some(sci_id));

        let user_data = HashMap::new();
        let file = write(&[&sci], 1, 1, 1, &user_data).unwrap();

        let mut dir = crate::store::MemoryDirectory::new();
        dir.write_file(&file.name, &file.data).unwrap();

        let result = read(&dir, &file.name).unwrap();

        assert_len_eq_x!(&result.segments, 1);
        assert_eq!(result.segments[0].name, "_0");
        assert_eq!(result.segments[0].id, seg_id);
        assert_eq!(result.segments[0].codec, "Lucene103");
        assert_eq!(result.segments[0].sci_id, Some(sci_id));
        assert_eq!(result.version, 1);
        assert_eq!(result.counter, 1);
    }

    // --- Write-side tests ---

    // The file name suffix is the generation rendered in base-36.
    #[test]
    fn test_write_generation_suffix() {
        let user_data = HashMap::new();
        let file = write(&[], 36, 36, 0, &user_data).unwrap();
        // 36 in base-36 = "10"
        assert_eq!(file.name, "segments_10");
    }

    #[test]
    fn test_byte_order_correctness() {
        // Verify that BE fields are truly big-endian and VInt fields are variable-length
        let seg_id = [0x00u8; 16];
        let sci = make_test_segment_commit_info("_0", 5, seg_id, Some([0x00; 16]));

        let user_data = HashMap::new();
        let file = write(&[&sci], 1, 0x0102030405060708, 1, &user_data).unwrap();

        let mut r =
            TestDataReader::new(&file.data, codec_util::index_header_length(CODEC_NAME, "1"));

        // Skip Version.LATEST + indexCreatedVersionMajor (4 VInts)
        for _ in 0..4 {
            r.read_vint();
        }

        // Version should be BE: 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08
        assert_eq!(file.data[r.pos], 0x01);
        assert_eq!(file.data[r.pos + 1], 0x02);
        assert_eq!(file.data[r.pos + 2], 0x03);
        assert_eq!(file.data[r.pos + 3], 0x04);
        assert_eq!(file.data[r.pos + 4], 0x05);
        assert_eq!(file.data[r.pos + 5], 0x06);
        assert_eq!(file.data[r.pos + 6], 0x07);
        assert_eq!(file.data[r.pos + 7], 0x08);
        let ver = r.read_be_long();
        assert_eq!(ver, 0x0102030405060708);
    }

    // --- generation_from_segments_file_name tests ---

    // The bare name "segments" maps to generation 0.
    #[test]
    fn test_generation_bare_segments() {
        assert_eq!(generation_from_segments_file_name("segments").unwrap(), 0);
    }

    // Single decimal digits parse to themselves.
    #[test]
    fn test_generation_single_digit() {
        assert_eq!(generation_from_segments_file_name("segments_1").unwrap(), 1);
        assert_eq!(generation_from_segments_file_name("segments_9").unwrap(), 9);
    }

    // Letters are base-36 digits: a = 10 … z = 35.
    #[test]
    fn test_generation_base36_letters() {
        assert_eq!(
            generation_from_segments_file_name("segments_a").unwrap(),
            10
        );
        assert_eq!(
            generation_from_segments_file_name("segments_z").unwrap(),
            35
        );
    }

    // Multi-character suffixes are positional base-36 numbers.
    #[test]
    fn test_generation_base36_multi_char() {
        assert_eq!(
            generation_from_segments_file_name("segments_10").unwrap(),
            36
        );
        assert_eq!(
            generation_from_segments_file_name("segments_1a").unwrap(),
            46
        );
    }

    // Names without the "segments_" prefix are rejected.
    #[test]
    fn test_generation_invalid_filename() {
        assert!(generation_from_segments_file_name("_0.cfs").is_err());
        assert!(generation_from_segments_file_name("not_segments").is_err());
    }

    // --- get_last_commit_segments_file_name tests ---

    // With a single segments file, that file is the last commit.
    #[test]
    fn test_last_commit_single_file() {
        let files = vec!["segments_1".to_string()];
        assert_eq!(
            get_last_commit_segments_file_name(&files).unwrap(),
            "segments_1"
        );
    }

    #[test]
    fn test_last_commit_numeric_max_not_lexicographic() {
        // segments_10 = gen 36, segments_z = gen 35
        // Lexicographic would pick segments_z; numeric picks segments_10
        let files = vec!["segments_z".to_string(), "segments_10".to_string()];
        assert_eq!(
            get_last_commit_segments_file_name(&files).unwrap(),
            "segments_10"
        );
    }

    // Files that do not start with "segments_" are skipped.
    #[test]
    fn test_last_commit_ignores_non_segments() {
        let files = vec![
            "_0.cfs".to_string(),
            "_0.si".to_string(),
            "segments_3".to_string(),
            "write.lock".to_string(),
        ];
        assert_eq!(
            get_last_commit_segments_file_name(&files).unwrap(),
            "segments_3"
        );
    }

    // A listing without any segments_N file is an error.
    #[test]
    fn test_last_commit_no_segments_files() {
        let files = vec!["_0.cfs".to_string(), "write.lock".to_string()];
        assert!(get_last_commit_segments_file_name(&files).is_err());
    }

    // An empty listing is an error.
    #[test]
    fn test_last_commit_empty() {
        let files: Vec<String> = vec![];
        assert!(get_last_commit_segments_file_name(&files).is_err());
    }

    // The highest generation wins regardless of listing order.
    #[test]
    fn test_last_commit_multiple_generations() {
        let files = vec![
            "segments_1".to_string(),
            "segments_5".to_string(),
            "segments_3".to_string(),
        ];
        assert_eq!(
            get_last_commit_segments_file_name(&files).unwrap(),
            "segments_5"
        );
    }
}