kindling-mobi 0.14.5

Kindle MOBI/AZW3 builder for dictionaries, books, and comics. Drop-in kindlegen replacement.
//! KF8 (Kindle Format 8) dual-format support for book MOBIs.
//!
//! Produces KF8 text records, FDST, skeleton/fragment/NCX INDX records,
//! CNCX records, and the DATP record for the KF8 section of a dual
//! KF7+KF8 MOBI file.
//!
//! The assembly follows Calibre's writer8/skeleton.py:
//! - Each spine item is split into a SKELETON (XHTML shell with an empty
//!   aid-marked body) and one or more FRAGMENT chunks (the body contents).
//! - The text flow is laid out as
//!     [skel_0][frag_0_0][frag_0_1]...[skel_1][frag_1_0]...
//! - The skeleton INDX lists each skel's absolute start offset and length.
//! - The fragment INDX lists each chunk's insert position (the absolute
//!   byte offset in the combined text where the fragment content should
//!   be spliced before `</body>`), a CNCX selector offset, file number,
//!   global sequence number, and the (start_pos, length) geometry of the
//!   fragment chunk.
//! - The CNCX records hold the XPath selector strings
//!   (e.g. `P-//*[@aid='0']`) that Kindle uses to locate the target
//!   element inside the skeleton.
//!
//! All INDX records follow Calibre's byte layout (see
//! writer8/index.py): a 192-byte header, a TAGX section at offset 192 in
//! the primary record, followed by the entry geometry and an IDXT
//! footer; data records carry the per-entry control-byte-prefixed
//! tag value stream and an IDXT footer pointing at each entry's
//! starting offset.
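//!
//! As an illustration (hypothetical two-page comic; byte counts made up),
//! the combined text flow would look like:
//!
//! ```text
//! offset  piece     contents
//! 0       skel_0    <html>…<body aid='0'></body></html>   (310 bytes)
//! 310     frag_0_0  <div aid='1'><img …/></div>           (insert_pos 296)
//! 365     skel_1    <html>…<body aid='…'></body></html>   (310 bytes)
//! 675     frag_1_0  <div aid='…'><img …/></div>           (insert_pos 661)
//! ```
//!
//! where each fragment's insert_pos is the absolute offset of the
//! `</body>` inside its own skeleton (skeleton start + body_inner_offset).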

use regex::Regex;

use crate::cncx::CncxBuilder;
use crate::palmdoc;
use crate::vwi::{encode_vwi, encode_vwi_inv};

const RECORD_SIZE: usize = 4096;
const INDX_HEADER_LENGTH: usize = 192;

/// Represents the complete KF8 section (all records after the BOUNDARY).
pub struct Kf8Section {
    /// KF8 text records (compressed, with trailing bytes)
    pub text_records: Vec<Vec<u8>>,
    /// Uncompressed text length (HTML + CSS flows combined)
    pub text_length: usize,
    /// FDST record
    pub fdst: Vec<u8>,
    /// Fragment INDX records (primary + data)
    pub fragment_indx: Vec<Vec<u8>>,
    /// CNCX records that back the fragment INDX tag 2 (aid selector)
    /// offsets. These are PalmDB records and must be written to the
    /// output file by the caller. In Calibre they immediately follow
    /// the fragment index records (between the fragment INDX data
    /// record and the skeleton INDX primary record).
    #[allow(dead_code)]
    pub cncx_records: Vec<Vec<u8>>,
    /// Skeleton INDX records (primary + data)
    pub skeleton_indx: Vec<Vec<u8>>,
    /// NCX INDX records (primary + data)
    pub ncx_indx: Vec<Vec<u8>>,
    /// CNCX records for the NCX INDX (label strings for TOC entries).
    /// Must be written immediately after NCX INDX in the record table.
    pub ncx_cncx_records: Vec<Vec<u8>>,
    /// DATP record
    pub datp: Vec<u8>,
    /// Number of flows (typically 2: HTML + CSS)
    pub flow_count: usize,
    /// CSS content that was appended as a separate flow
    #[allow(dead_code)]
    pub css_content: Vec<u8>,
    /// Uncompressed KF8 HTML bytes (the HTML flow only, without CSS).
    /// Exposed so the MOBI writer can run `html_check::validate_text_blob`
    /// on the assembled blob before compression/record splitting. This is
    /// a snapshot; the record bytes are derived from HTML+CSS combined.
    pub html_bytes: Vec<u8>,
}

/// Information about a skeleton (one per spine HTML file).
#[derive(Debug, Clone)]
struct SkeletonEntry {
    /// Label like "SKEL0000000000"
    label: String,
    /// Absolute byte offset in the combined KF8 text where this
    /// skeleton starts
    start_pos: usize,
    /// Byte length of this skeleton's XHTML shell (NOT including
    /// spliced fragment content)
    length: usize,
    /// Number of fragment chunks belonging to this skeleton
    chunk_count: usize,
}

/// Information about a fragment chunk.
#[derive(Debug, Clone)]
struct FragmentEntry {
    /// Insert position (absolute byte offset in combined text where
    /// the fragment bytes should be spliced into the skeleton to
    /// reconstruct the rendered page)
    insert_pos: usize,
    /// Base-32 CNCX selector string like `"P-//*[@aid='0']"`
    selector: String,
    /// Which skeleton (spine file) this fragment belongs to
    file_number: usize,
    /// Globally unique monotonic sequence number across all fragments
    /// in the book
    sequence_number: usize,
    /// Position of this fragment inside its skeleton's fragment
    /// payload block (i.e., offset relative to the first fragment of
    /// that skel). For a 1-fragment-per-skel comic page this is 0.
    start_pos: usize,
    /// Byte length of this fragment chunk
    length: usize,
}

/// Build the complete KF8 section for a book MOBI.
///
/// `kindlegen_parity` routes the spine-item HTML transform through a
/// byte-for-byte-with-kindlegen mode (strips DOCTYPE/meta/styles, uses
/// `image/jpg` mime, skips aid on img tags, resets the aid counter to
/// `spine_idx * 1_000_000` per page). Off by default — kindling's normal
/// output is "better than kindlegen" (pretty-printed, aided img tags,
/// IANA-correct `image/jpeg` mime).
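///
/// A minimal call sketch (inputs are hypothetical; the real caller is the
/// MOBI writer, which also owns the final record-table ordering):
///
/// ```ignore
/// let section = build_kf8_section(
///     &html_parts,           // one XHTML string per spine item
///     "body { margin: 0 }",  // CSS flow appended after the HTML flow
///     &href_to_recindex,     // image href -> 1-based image record index
///     &spine_items,          // (id, href) pairs from the spine
///     false,                 // no_compress: false => PalmDOC compression
///     false,                 // kindlegen_parity off
///     "My Book",             // used as the NCX TOC label
/// );
/// assert_eq!(&section.fdst[0..4], b"FDST");
/// ```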
pub fn build_kf8_section(
    html_parts: &[String],
    css_content: &str,
    href_to_recindex: &std::collections::HashMap<String, usize>,
    spine_items: &[(String, String)],
    no_compress: bool,
    kindlegen_parity: bool,
    title: &str,
) -> Kf8Section {
    // Step 1: Build the split skeletons + fragments from each HTML part.
    let (kf8_html, skeleton_entries, fragment_entries) =
        build_kf8_html(html_parts, href_to_recindex, spine_items, kindlegen_parity);

    // Step 2: Append CSS as a separate flow.
    let css_bytes = css_content.as_bytes();
    let html_length = kf8_html.len();
    let total_text_length = html_length + css_bytes.len();

    let html_bytes_snapshot = kf8_html.clone();

    let mut combined_text = kf8_html;
    combined_text.extend_from_slice(css_bytes);

    // Step 3: Compress text into records.
    let num_skeletons = skeleton_entries.len();
    let (text_records, text_length) = if no_compress {
        split_text_uncompressed_kf8(&combined_text, num_skeletons)
    } else {
        compress_text_kf8(&combined_text, num_skeletons)
    };

    // Step 4: FDST record.
    let fdst = build_fdst(html_length, total_text_length);

    // Step 5: Skeleton INDX records.
    let skeleton_indx = build_skeleton_indx(&skeleton_entries);

    // Step 6: Fragment INDX records. Also builds the CNCX containing
    // the selector strings referenced by tag 2 of each fragment entry.
    let (fragment_indx, cncx_records) = build_fragment_indx_with_cncx(&fragment_entries);

    // Step 7: NCX INDX with proper 5-tag structure + CNCX label record.
    // Title is used as the TOC label in the NCX CNCX. Must be non-empty.
    let (ncx_indx, ncx_cncx_records) =
        build_ncx_indx(title, &skeleton_entries, html_length);

    // Step 8: DATP record (stub).
    let datp = build_datp();

    let flow_count = 2; // Always 2 per KCC/libmobi

    Kf8Section {
        text_records,
        text_length,
        fdst,
        fragment_indx,
        cncx_records,
        skeleton_indx,
        ncx_indx,
        ncx_cncx_records,
        datp,
        flow_count,
        css_content: css_bytes.to_vec(),
        html_bytes: html_bytes_snapshot,
    }
}

/// Build KF8 HTML with aid attributes, kindle:embed image URLs, and the
/// skeleton/fragment split.
///
/// Returns (combined_text_bytes, skeleton_entries, fragment_entries).
/// The combined text layout is `[skel_0][frag_0_0]...[skel_1][frag_1_0]...`.
fn build_kf8_html(
    html_parts: &[String],
    href_to_recindex: &std::collections::HashMap<String, usize>,
    spine_items: &[(String, String)],
    kindlegen_parity: bool,
) -> (Vec<u8>, Vec<SkeletonEntry>, Vec<FragmentEntry>) {
    let path_to_recindex = build_image_path_lookup(href_to_recindex, spine_items);

    let mut skeleton_entries: Vec<SkeletonEntry> = Vec::new();
    let mut fragment_entries: Vec<FragmentEntry> = Vec::new();
    let mut combined: Vec<u8> = Vec::new();
    let mut global_seq: usize = 0;

    // AID counter: in parity mode kindlegen resets the intra-counter to
    // 0 at each spine item and prefixes every AID with `spine_idx *
    // AID_PAGE_STRIDE`, so pages never collide unless a page has a
    // million+ tags. In default mode we count sequentially across the
    // whole spine (simpler, self-consistent, also never collides).
    const AID_PAGE_STRIDE: u32 = 1_000_000;
    let mut global_aid_counter: u32 = 0;

    for (skel_idx, raw_part) in html_parts.iter().enumerate() {
        let mut aid_counter: u32 = if kindlegen_parity {
            (skel_idx as u32) * AID_PAGE_STRIDE
        } else {
            global_aid_counter
        };
        // 1. Normalize this spine item into Calibre-style KF8 output:
        //    add aid attributes to aid-able tags and rewrite image src
        //    URLs to `kindle:embed:....`.
        let processed = process_kf8_part(raw_part, &mut aid_counter, &path_to_recindex, kindlegen_parity);
        if !kindlegen_parity {
            global_aid_counter = aid_counter;
        }

        // 2. Split into (skeleton, body_inner). The body tag is left
        //    on the skeleton with its aid attribute intact, but the
        //    body's inner content moves into the fragment.
        let split = split_skeleton_and_body(&processed);
        let skel_bytes = split.skeleton.as_bytes();
        let body_inner = split.body_inner.as_bytes();

        // 3. Emit the skeleton into the combined text.
        let skel_start = combined.len();
        combined.extend_from_slice(skel_bytes);
        let skel_len = skel_bytes.len();

        // 4. Absolute insert position: where the body's inner bytes
        //    should be spliced back when reconstructing the page. That
        //    is `skel_start + split.body_inner_offset` (the byte offset
        //    of `</body>` inside the skeleton, i.e. exactly where the
        //    fragment content belongs in the final rendered document).
        let insert_pos = skel_start + split.body_inner_offset;

        // 5. Emit a single fragment that contains the body's inner
        //    content (everything between `<body ...>` and `</body>`).
        //    More complex pages could split the inner content further
        //    — for simple comic pages there is always one chunk.
        combined.extend_from_slice(body_inner);
        let frag_len = body_inner.len();

        // start_pos is stored relative to the first fragment of this
        // skel's fragment block (which begins at `skel_start + skel_len`).
        // With one fragment per skel, this is always 0.
        let relative_start = 0usize;

        fragment_entries.push(FragmentEntry {
            insert_pos,
            selector: format!("P-//*[@aid='{}']", split.body_aid),
            file_number: skel_idx,
            sequence_number: global_seq,
            start_pos: relative_start,
            length: frag_len,
        });
        global_seq += 1;

        skeleton_entries.push(SkeletonEntry {
            label: format!("SKEL{:010}", skel_idx),
            start_pos: skel_start,
            length: skel_len,
            chunk_count: 1,
        });
    }

    (combined, skeleton_entries, fragment_entries)
}

/// Output of `split_skeleton_and_body`.
struct SkelSplit {
    /// The XHTML shell. Body tag is present with its aid attribute and
    /// closing `</body></html>` suffix, but the body is empty.
    skeleton: String,
    /// The body's inner content (everything that was between `<body ...>`
    /// and `</body>` in the original processed HTML).
    body_inner: String,
    /// Byte offset inside `skeleton` where `body_inner` should be
    /// spliced back to reconstruct the original processed HTML.
    /// Points at the byte just after `<body ...>` (which is also the
    /// byte of `</body>` in the emptied shell).
    body_inner_offset: usize,
    /// The aid attribute value of the body tag, used to build the
    /// fragment's CNCX selector string.
    body_aid: String,
}

/// Split a processed HTML document into (skeleton_with_empty_body, body_inner).
///
/// The input must already have aid attributes applied and image src
/// rewrites done (i.e. be the output of `process_kf8_part`).
///
/// The split logic scans for the first `<body ...>` opening tag and the
/// LAST `</body>` closing tag, lifts everything between them into the
/// body_inner, and leaves the skeleton with an empty body.
fn split_skeleton_and_body(html: &str) -> SkelSplit {
    // Locate `<body` (start of body open tag) and its closing `>`.
    let bytes = html.as_bytes();
    let (open_start, open_end, body_aid) = match find_body_open(bytes) {
        Some(v) => v,
        None => {
            // No body tag: treat the whole thing as a skeleton with no
            // fragment content. This keeps dictionaries / fallback
            // callers safe even though they don't exercise this path.
            return SkelSplit {
                skeleton: html.to_string(),
                body_inner: String::new(),
                body_inner_offset: html.len(),
                body_aid: "0".to_string(),
            };
        }
    };

    // Locate the LAST `</body>` so nested body tags (shouldn't exist,
    // but just in case) don't trip us up.
    let close_start = match find_last_close_body(bytes) {
        Some(v) => v,
        None => {
            return SkelSplit {
                skeleton: html.to_string(),
                body_inner: String::new(),
                body_inner_offset: html.len(),
                body_aid,
            };
        }
    };

    if close_start <= open_end {
        // Degenerate case: empty body. The skeleton is the whole doc,
        // and there is no fragment content.
        return SkelSplit {
            skeleton: html.to_string(),
            body_inner: String::new(),
            body_inner_offset: open_end,
            body_aid,
        };
    }

    let head = &html[..open_end]; // up to and including `<body ...>`
    let inner = &html[open_end..close_start];
    let tail = &html[close_start..]; // starts with `</body>`

    let mut skeleton = String::with_capacity(head.len() + tail.len());
    skeleton.push_str(head);
    skeleton.push_str(tail);

    // `open_start` (the index of `<` in `<body`) isn't needed for the
    // split geometry; only the end of the open tag matters.
    let _ = open_start;

    SkelSplit {
        skeleton,
        body_inner: inner.to_string(),
        body_inner_offset: head.len(),
        body_aid,
    }
}

/// Find the first `<body ...>` open tag. Returns (tag_start, after_tag_end, aid).
///
/// `tag_start` is the byte index of the `<`, `after_tag_end` is the byte
/// index just past the matching `>`, and `aid` is the value of the
/// `aid="..."` attribute on the tag (defaulting to "0" if absent).
fn find_body_open(bytes: &[u8]) -> Option<(usize, usize, String)> {
    let haystack = bytes;
    let needle = b"<body";
    let mut i = 0;
    while i + needle.len() <= haystack.len() {
        if &haystack[i..i + needle.len()] == needle {
            // Verify next byte is whitespace or '>' (so we don't match
            // `<bodytext` or similar).
            let after = haystack[i + needle.len()];
            if after == b' '
                || after == b'\t'
                || after == b'\n'
                || after == b'\r'
                || after == b'>'
                || after == b'/'
            {
                // Find the closing '>' of this tag.
                let mut j = i + needle.len();
                while j < haystack.len() && haystack[j] != b'>' {
                    j += 1;
                }
                if j >= haystack.len() {
                    return None;
                }
                let tag_str = std::str::from_utf8(&haystack[i..=j]).ok()?;
                let aid = extract_aid_value(tag_str).unwrap_or_else(|| "0".to_string());
                return Some((i, j + 1, aid));
            }
        }
        i += 1;
    }
    None
}

/// Find the LAST `</body>` close tag. Returns byte index of the `<`.
fn find_last_close_body(bytes: &[u8]) -> Option<usize> {
    let needle = b"</body>";
    if bytes.len() < needle.len() {
        return None;
    }
    let mut i = bytes.len() - needle.len();
    loop {
        if &bytes[i..i + needle.len()] == needle {
            return Some(i);
        }
        if i == 0 {
            return None;
        }
        i -= 1;
    }
}

/// Extract the aid attribute value from a tag string like `<body aid="3">`.
fn extract_aid_value(tag_str: &str) -> Option<String> {
    // Handle both single and double-quoted attribute values, as well as
    // surrounding whitespace.
    let re = Regex::new(r#"\baid\s*=\s*["']([^"']*)["']"#).unwrap();
    re.captures(tag_str)
        .and_then(|c| c.get(1))
        .map(|m| m.as_str().to_string())
}

/// Process a single HTML part for KF8: add aid attributes and rewrite
/// image sources. Calibre's aid-able tag set + the standard
/// `kindle:embed:XXXX?mime=image/<ext>` src rewrite.
///
/// `kindlegen_parity` swaps the IANA-correct `image/jpeg` mime for the
/// non-standard `image/jpg` kindlegen emits, and drops `img` from the
/// aid-able tag set so img tags are left unaided (matching kindlegen).
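///
/// For example (hypothetical input), with `p1.jpg` mapped to image record
/// index 1, `<img src="p1.jpg"/>` comes out as
/// `<img src="kindle:embed:0001?mime=image/jpeg" aid="..."/>` in default
/// mode (the aid value depends on the running counter), and with
/// `mime=image/jpg` and no aid added to the img in parity mode.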
fn process_kf8_part(
    html: &str,
    aid_counter: &mut u32,
    path_to_recindex: &std::collections::HashMap<String, usize>,
    kindlegen_parity: bool,
) -> String {
    let mut result = html.to_string();

    // Parity mode keeps kindlegen's non-standard `image/jpg`; default mode
    // uses the IANA-registered `image/jpeg` (see the doc comment above).
    let mime_suffix = if kindlegen_parity { "jpg" } else { "jpeg" };

    // Rewrite image src to kindle:embed format.
    let src_re = Regex::new(r#"(?i)\bsrc\s*=\s*"([^"]*)""#).unwrap();
    result = src_re
        .replace_all(&result, |caps: &regex::Captures| {
            let src_path = caps.get(1).unwrap().as_str();
            if let Some(&recindex) = path_to_recindex.get(src_path) {
                format!(
                    "src=\"kindle:embed:{}?mime=image/{}\"",
                    encode_base32_4char(recindex),
                    mime_suffix
                )
            } else if let Some(fname) = src_path.rsplit('/').next() {
                if let Some(&recindex) = path_to_recindex.get(fname) {
                    format!(
                        "src=\"kindle:embed:{}?mime=image/{}\"",
                        encode_base32_4char(recindex),
                        mime_suffix
                    )
                } else {
                    caps.get(0).unwrap().as_str().to_string()
                }
            } else {
                caps.get(0).unwrap().as_str().to_string()
            }
        })
        .to_string();

    // Add aid attributes to block / inline tags in Calibre's aid-able
    // set. We append the aid attribute just before the closing `>` of
    // each tag, to mirror Calibre's "pop then re-add last" behavior.
    //
    // Parity mode excludes `img` from the aid set because kindlegen's
    // KF8 comic output leaves img tags unaided. Default mode INCLUDES
    // `img` (kindling's normal behavior — gives Kindle readers one more
    // selectable/interactive anchor per page).
    let tag_re = if kindlegen_parity {
        Regex::new(
            r"(?i)<(p|div|h[1-6]|li|ul|ol|table|tr|td|th|section|article|aside|nav|header|footer|figure|figcaption|blockquote|span|a|em|strong|b|i|body)(\s[^>]*?)?(/?)>",
        )
        .unwrap()
    } else {
        Regex::new(
            r"(?i)<(p|div|h[1-6]|li|ul|ol|table|tr|td|th|section|article|aside|nav|header|footer|figure|figcaption|blockquote|img|span|a|em|strong|b|i|body)(\s[^>]*?)?(/?)>",
        )
        .unwrap()
    };
    result = tag_re
        .replace_all(&result, |caps: &regex::Captures| {
            let tag = caps.get(1).unwrap().as_str();
            let attrs = caps.get(2).map(|m| m.as_str()).unwrap_or("");
            let self_close = caps.get(3).map(|m| m.as_str()).unwrap_or("");
            // Skip if this tag already has an aid (e.g. double-processing).
            if attrs.contains(" aid=") || attrs.contains("\taid=") {
                return caps.get(0).unwrap().as_str().to_string();
            }
            let aid = encode_aid_base32(*aid_counter);
            *aid_counter += 1;
            format!("<{}{} aid=\"{}\"{}>", tag, attrs, aid, self_close)
        })
        .to_string();

    result
}

/// Encode an aid value as a base-32 string using Calibre's alphabet
/// (0-9, A-V). No zero padding: bare value, "0" for zero.
fn encode_aid_base32(value: u32) -> String {
    if value == 0 {
        return "0".to_string();
    }
    const CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUV";
    let mut out = Vec::new();
    let mut v = value;
    while v > 0 {
        out.push(CHARS[(v & 0x1F) as usize]);
        v >>= 5;
    }
    out.reverse();
    String::from_utf8(out).unwrap()
}

/// Encode a 1-based record index as 4 base-32 digits (0-9, A-V),
/// zero-padded. Used by `kindle:embed:XXXX?mime=...` image URLs.
fn encode_base32_4char(recindex: usize) -> String {
    const CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUV";
    let mut result = [b'0'; 4];
    let mut v = recindex;
    for i in (0..4).rev() {
        result[i] = CHARS[v % 32];
        v /= 32;
    }
    String::from_utf8(result.to_vec()).unwrap()
}

/// Build the path-to-recindex lookup map.
fn build_image_path_lookup(
    href_to_recindex: &std::collections::HashMap<String, usize>,
    spine_items: &[(String, String)],
) -> std::collections::HashMap<String, usize> {
    let mut path_to_recindex: std::collections::HashMap<String, usize> =
        std::collections::HashMap::new();

    for (href, &recindex) in href_to_recindex {
        path_to_recindex.insert(href.clone(), recindex);
        if let Some(fname) = href.rsplit('/').next() {
            path_to_recindex.entry(fname.to_string()).or_insert(recindex);
        }
    }

    // If any spine item lives in a subdirectory, image hrefs may be
    // referenced relative to it (e.g. `../images/p1.jpg`), so register a
    // `../<href>` alias for every image as well.
    if spine_items.iter().any(|(_, spine_href)| spine_href.contains('/')) {
        for (href, &recindex) in href_to_recindex {
            let relative = format!("../{}", href);
            path_to_recindex.entry(relative).or_insert(recindex);
        }
    }

    path_to_recindex
}

/// Append KF8 TBS (Trailing Byte Sequence) bytes to a text record.
///
/// TBS tells the Kindle firmware which NCX entries overlap with a given
/// text record. The format (from the end of the record):
///   - Last byte = size | 0x80 (total TBS size including this byte)
///   - Preceding bytes = type/data encoded values
///
/// For the simple case (1 NCX entry covering the whole book):
///   - First text record (index 0): `[0x82, <value>, 0x83]`
///     type byte 0x82 = type 2 ("NCX boundary starts here"),
///     value byte = inverted VWI of (2 * num_skeletons - 1), e.g. 0x81
///     for a one-page book, 0x85 for three pages,
///     size byte 0x83 = 3 | 0x80 (3 bytes total)
///   - Subsequent records: `[0x81]`
///     size byte 0x81 = 1 | 0x80 (1 byte = empty TBS, no new NCX entries)
fn append_kf8_tbs(record: &mut Vec<u8>, record_index: usize, num_skeletons: usize) {
    if record_index == 0 {
        // First text record: type 2 = NCX entry starts here.
        // Value encodes fragment/skeleton info. KCC uses (2*num_skeletons - 1)
        // for a comic with N pages: 3 pages → value 5, matching kindlegen.
        let value = if num_skeletons > 0 { (2 * num_skeletons - 1) as u32 } else { 0 };
        let value_bytes = encode_vwi_inv(value);
        let tbs_size = 1 + value_bytes.len() + 1; // type + value + size
        record.push(0x82); // type 2
        record.extend_from_slice(&value_bytes);
        record.push((tbs_size as u8) | 0x80); // size byte
    } else {
        // Subsequent records: empty TBS (no new NCX boundaries).
        record.push(0x81);
    }
}

/// Compress KF8 text into PalmDOC records with KF8 trailing bytes.
///
/// KF8 records use `extra_flags = 3` (bit 0 = multibyte, bit 1 = TBS).
/// Trailing bytes are appended in order: multibyte(0x00) then TBS.
/// The firmware strips them from the end backward: TBS first (bit 1),
/// then multibyte (bit 0).
fn compress_text_kf8(text_bytes: &[u8], num_skeletons: usize) -> (Vec<Vec<u8>>, usize) {
    let total_length = text_bytes.len();
    let chunk_size = RECORD_SIZE;

    let records: Vec<Vec<u8>> = text_bytes
        .chunks(chunk_size)
        .enumerate()
        .map(|(i, chunk)| {
            let mut compressed = palmdoc::compress(chunk);
            compressed.push(0x00); // multibyte (no overhang)
            append_kf8_tbs(&mut compressed, i, num_skeletons);
            compressed
        })
        .collect();

    (records, total_length)
}

/// Split KF8 text into uncompressed records (debug path).
fn split_text_uncompressed_kf8(text_bytes: &[u8], num_skeletons: usize) -> (Vec<Vec<u8>>, usize) {
    let total_length = text_bytes.len();
    let chunk_size = RECORD_SIZE;

    let records: Vec<Vec<u8>> = text_bytes
        .chunks(chunk_size)
        .enumerate()
        .map(|(i, chunk)| {
            let mut rec = chunk.to_vec();
            // Trailing bytes for extra_flags=3 (bit 0=multibyte,
            // bit 1=TBS). Same layout as compressed path.
            rec.push(0x00); // multibyte (no overhang)
            append_kf8_tbs(&mut rec, i, num_skeletons);
            rec
        })
        .collect();

    (records, total_length)
}

/// Build the FDST (Flow Descriptor Table) record.
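///
/// For example (hypothetical sizes), `html_length = 1000` and
/// `total_length = 1100` produce `"FDST"` followed by big-endian u32s:
/// `12`, `2` (flow count), flow 0 `(0, 1000)`, flow 1 `(1000, 1100)`.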
fn build_fdst(html_length: usize, total_length: usize) -> Vec<u8> {
    let flow_count: usize = 2; // Always 2 per KCC/libmobi

    let record_size = 12 + flow_count * 8;
    let mut fdst = Vec::with_capacity(record_size);
    fdst.extend_from_slice(b"FDST");
    fdst.extend_from_slice(&12u32.to_be_bytes());
    fdst.extend_from_slice(&(flow_count as u32).to_be_bytes());

    // Flow 0: HTML
    fdst.extend_from_slice(&0u32.to_be_bytes());
    fdst.extend_from_slice(&(html_length as u32).to_be_bytes());

    // Flow 1: CSS (zero-length stub when there is no CSS, since then
    // total_length == html_length)
    fdst.extend_from_slice(&(html_length as u32).to_be_bytes());
    fdst.extend_from_slice(&(total_length as u32).to_be_bytes());

    fdst
}

// --- INDX BUILDERS ---------------------------------------------------------
//
// These follow Calibre's writer8/index.py layout exactly. Each index has
// a list of `TagMeta` definitions giving (tag_number, values_per_entry,
// bitmask). For each entry, we compute a control byte that encodes
// which tags are present, then emit the entry as:
//
//     [ len_byte | label_bytes | control_byte | inverted_vwi_values... ]
//
// Entries are collected into a DATA record (with IDXT footer listing
// each entry's offset inside the record), and the HEADER record carries
// the TAGX table plus a geometry block with one entry per data record
// giving (last_label, entry_count) for binary-searching the right data
// record at read time.
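//
// As a worked example (made-up geometry; assuming the single-byte
// inverted-VWI form used above for values < 128), a skeleton entry with
// label "SKEL0000000000", chunk_count 1, start_pos 0 and length 100
// encodes as:
//
//   0x0E  "SKEL0000000000"  0x0A   0x81 0x81    0x80 0xE4 0x80 0xE4
//   len   label             ctrl   tag 1 (1,1)  tag 6 (0,100,0,100)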

#[derive(Clone, Copy, Debug)]
struct TagMeta {
    /// Tag number that will appear in raw MOBI dumps (not written, but
    /// kept for readability)
    #[allow(dead_code)]
    number: u8,
    /// Values per entry: how many inverted VWIs follow per "n"
    values_per_entry: u8,
    /// Bitmask for the control byte
    mask: u8,
}

const END_TAG: u8 = 1;

/// Calibre's `mask_to_bit_shifts` for 1-bit and 2-bit-wide masks.
fn mask_shifts(mask: u8) -> u32 {
    match mask {
        1 => 0,
        2 => 1,
        3 => 0,
        4 => 2,
        8 => 3,
        12 => 2,
        16 => 4,
        32 => 5,
        48 => 4,
        64 => 6,
        128 => 7,
        192 => 6,
        _ => 0,
    }
}

/// Build a TAGX section from a list of tag definitions.
///
/// Layout: `TAGX` + u32 length + u32 control_byte_count + per-tag 4 bytes
/// (number, vpe, mask, end_flag) + sentinel (0,0,0,1) end tag.
fn build_tagx(tag_defs: &[(u8, u8, u8)]) -> Vec<u8> {
    let mut body = Vec::new();
    for (num, vpe, mask) in tag_defs {
        body.push(*num);
        body.push(*vpe);
        body.push(*mask);
        body.push(0); // end_flag = 0
    }
    // Sentinel end tag
    body.push(0);
    body.push(0);
    body.push(0);
    body.push(END_TAG);

    let total_length = 12 + body.len();
    let control_byte_count: u32 = 1;

    let mut out = Vec::with_capacity(total_length);
    out.extend_from_slice(b"TAGX");
    out.extend_from_slice(&(total_length as u32).to_be_bytes());
    out.extend_from_slice(&control_byte_count.to_be_bytes());
    out.extend_from_slice(&body);
    out
}

/// Compute the control byte for an entry given the tag definitions and
/// the number of VWI values per tag for this entry.
fn control_byte_for(tag_defs: &[TagMeta], nvals_per_tag: &[usize]) -> u8 {
    let mut ans: u32 = 0;
    for (tag, &nvals) in tag_defs.iter().zip(nvals_per_tag.iter()) {
        let nentries = (nvals as u32) / (tag.values_per_entry as u32);
        let shifts = mask_shifts(tag.mask);
        ans |= (tag.mask as u32) & (nentries << shifts);
    }
    ans as u8
}

/// Encode an INDX data entry.
///
/// Layout: `[len(label_bytes)] [label_bytes] [control_byte] [vwi_inv values...]`
///
/// The label length is a single byte that mirrors Calibre's
/// `raw.insert(0, len(index_num))`. The control byte marks which tag
/// groups are present in `values`.
fn encode_indx_entry(
    label: &[u8],
    tag_defs: &[TagMeta],
    values_by_tag: &[Vec<u32>],
) -> Vec<u8> {
    assert_eq!(tag_defs.len(), values_by_tag.len());
    let nvals_per_tag: Vec<usize> = values_by_tag.iter().map(|v| v.len()).collect();
    let control = control_byte_for(tag_defs, &nvals_per_tag);

    let mut out = Vec::with_capacity(1 + label.len() + 1 + 8 * tag_defs.len());
    out.push(label.len() as u8);
    out.extend_from_slice(label);
    out.push(control);
    for vals in values_by_tag {
        for v in vals {
            out.extend_from_slice(&encode_vwi_inv(*v));
        }
    }
    out
}

/// Build a KF8 INDX data record (generation 1) wrapping the given
/// already-encoded entries. Follows Calibre's writer8/index.py layout:
///
///   header (192 bytes) + entry_bytes + align_to_4 + `IDXT` + u16 offsets
///
/// The header stores the IDXT offset at byte 20, entry count at byte 24,
/// and a marker `type=1` at byte 12 (Calibre's second-half of unknown1).
fn build_indx_data_record(entries: &[Vec<u8>]) -> Vec<u8> {
    let mut header = vec![0u8; INDX_HEADER_LENGTH];
    header[0..4].copy_from_slice(b"INDX");
    put32(&mut header, 4, INDX_HEADER_LENGTH as u32);
    // offset 8..12: zeroes (unknown1 first half)
    put32(&mut header, 12, 1); // "type=1" marker for data record
    // offset 16..20: zeroes (index type; stays 0 for data records)

    let mut entries_data = Vec::new();
    let mut offsets: Vec<u16> = Vec::with_capacity(entries.len());
    for e in entries {
        let off = INDX_HEADER_LENGTH + entries_data.len();
        offsets.push(off as u16);
        entries_data.extend_from_slice(e);
    }

    // Align to 4 bytes before IDXT (Calibre's align_block).
    while (INDX_HEADER_LENGTH + entries_data.len()) % 4 != 0 {
        entries_data.push(0);
    }

    let idxt_offset = INDX_HEADER_LENGTH + entries_data.len();
    put32(&mut header, 20, idxt_offset as u32);
    put32(&mut header, 24, entries.len() as u32);
    // offset 28..36: 8 bytes of 0xFF (Calibre writes \xff*8)
    for b in &mut header[28..36] {
        *b = 0xFF;
    }

    let mut idxt = Vec::with_capacity(4 + 2 * offsets.len());
    idxt.extend_from_slice(b"IDXT");
    for o in &offsets {
        idxt.extend_from_slice(&o.to_be_bytes());
    }
    while idxt.len() % 4 != 0 {
        idxt.push(0);
    }

    let mut record = header;
    record.extend_from_slice(&entries_data);
    record.extend_from_slice(&idxt);
    record
}

/// Build a KF8 INDX primary/header record wrapping the given TAGX and
/// geometry (one entry per data record giving (last_label, count)).
///
/// Follows Calibre IndexHeader:
///   offset  4: header_length (192)
///   offset  8: zeroes(8)
///   offset 16: type = 2
///   offset 20: idxt_offset (set later)
///   offset 24: num_of_records
///   offset 28: encoding = 65001
///   offset 32: NULL (0xFFFFFFFF)
///   offset 36: num_of_entries
///   offset 40..52: zeroes (ordt/ligt/num_ordt)
///   offset 52: num_of_cncx
///   offset 56..180: zeroes(124)
///   offset 180: tagx_offset = 192
///   offset 184: zeroes(8)
fn build_indx_primary(
    tagx: &[u8],
    num_data_records: usize,
    num_entries: usize,
    num_cncx: usize,
    geometry: &[(Vec<u8>, u32)], // (last_label_bytes, entry_count_in_record)
) -> Vec<u8> {
    let mut header = vec![0u8; INDX_HEADER_LENGTH];
    header[0..4].copy_from_slice(b"INDX");
    put32(&mut header, 4, INDX_HEADER_LENGTH as u32);
    // offset 8..16 zeroes
    put32(&mut header, 16, 2); // type = 2
    // idxt_offset at 20 — filled in later
    put32(&mut header, 24, num_data_records as u32);
    put32(&mut header, 28, 65001); // encoding = UTF-8
    put32(&mut header, 32, 0xFFFFFFFF); // NULL
    put32(&mut header, 36, num_entries as u32);
    // offset 40..52: zeroes
    put32(&mut header, 52, num_cncx as u32);
    // offset 56..180: zeroes
    put32(&mut header, 180, INDX_HEADER_LENGTH as u32); // tagx_offset
    // offset 184..192: zeroes

    // TAGX section starts at offset 192. Then per-record geometry
    // entries, each formatted as `[len][label][u16 count]`. Then IDXT
    // pointing to the start of each geometry entry.
    let mut tagx_block = tagx.to_vec();
    while tagx_block.len() % 4 != 0 {
        tagx_block.push(0);
    }

    let mut geom_block = Vec::new();
    let mut geom_offsets: Vec<u16> = Vec::with_capacity(geometry.len());
    let geom_base = INDX_HEADER_LENGTH + tagx_block.len();
    for (label, count) in geometry {
        geom_offsets.push((geom_base + geom_block.len()) as u16);
        geom_block.push(label.len() as u8);
        geom_block.extend_from_slice(label);
        geom_block.extend_from_slice(&(*count as u16).to_be_bytes());
    }
    while geom_block.len() % 4 != 0 {
        geom_block.push(0);
    }

    let idxt_offset = geom_base + geom_block.len();
    put32(&mut header, 20, idxt_offset as u32);

    let mut idxt = Vec::with_capacity(4 + 2 * geom_offsets.len());
    idxt.extend_from_slice(b"IDXT");
    for o in &geom_offsets {
        idxt.extend_from_slice(&o.to_be_bytes());
    }
    while idxt.len() % 4 != 0 {
        idxt.push(0);
    }

    let mut record = header;
    record.extend_from_slice(&tagx_block);
    record.extend_from_slice(&geom_block);
    record.extend_from_slice(&idxt);
    record
}

/// Build the skeleton INDX (primary + data).
///
/// Tag layout (Calibre SkelIndex):
///   tag 1 (chunk_count, vpe=1, mask=3) -- values = (n, n) [repeated twice]
///   tag 6 (geometry,    vpe=2, mask=12) -- values = (start_pos, length,
///                                                    start_pos, length)
fn build_skeleton_indx(skels: &[SkeletonEntry]) -> Vec<Vec<u8>> {
    if skels.is_empty() {
        return minimal_indx();
    }

    let tag_defs = [
        TagMeta { number: 1, values_per_entry: 1, mask: 3 },
        TagMeta { number: 6, values_per_entry: 2, mask: 12 },
    ];
    let tagx = build_tagx(&[(1, 1, 3), (6, 2, 12)]);

    let mut entries: Vec<Vec<u8>> = Vec::with_capacity(skels.len());
    for s in skels {
        let chunk_count_vals = vec![s.chunk_count as u32, s.chunk_count as u32];
        let geom_vals = vec![
            s.start_pos as u32,
            s.length as u32,
            s.start_pos as u32,
            s.length as u32,
        ];
        let entry = encode_indx_entry(
            s.label.as_bytes(),
            &tag_defs,
            &[chunk_count_vals, geom_vals],
        );
        entries.push(entry);
    }

    let data_record = build_indx_data_record(&entries);
    let last_label = skels.last().unwrap().label.as_bytes().to_vec();
    let primary = build_indx_primary(
        &tagx,
        1,
        skels.len(),
        0,
        &[(last_label, skels.len() as u32)],
    );

    vec![primary, data_record]
}

/// Build the fragment INDX (primary + data) and the CNCX records that
/// hold the aid selector strings referenced by tag 2 of each entry.
///
/// Tag layout (Calibre ChunkIndex):
///   tag 2 (cncx_offset,     vpe=1, mask=1)
///   tag 3 (file_number,     vpe=1, mask=2)
///   tag 4 (sequence_number, vpe=1, mask=4)
///   tag 6 (geometry,        vpe=2, mask=8) -- values = (start_pos, length)
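///
/// Each entry's label is the fragment's absolute insert position as a
/// 10-digit zero-padded decimal string (e.g. insert_pos 201 becomes
/// `"0000000201"`), which libmobi/Kindle parse back with strtoul.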
fn build_fragment_indx_with_cncx(
    frags: &[FragmentEntry],
) -> (Vec<Vec<u8>>, Vec<Vec<u8>>) {
    if frags.is_empty() {
        return (minimal_indx(), Vec::new());
    }

    let tag_defs = [
        TagMeta { number: 2, values_per_entry: 1, mask: 1 },
        TagMeta { number: 3, values_per_entry: 1, mask: 2 },
        TagMeta { number: 4, values_per_entry: 1, mask: 4 },
        TagMeta { number: 6, values_per_entry: 2, mask: 8 },
    ];
    let tagx = build_tagx(&[(2, 1, 1), (3, 1, 2), (4, 1, 4), (6, 2, 8)]);

    // Build the CNCX from the fragment selectors. Deduped automatically
    // by CncxBuilder.
    let mut cncx = CncxBuilder::new();
    let cncx_offsets: Vec<u32> = frags.iter().map(|f| cncx.add(&f.selector)).collect();

    let mut entries: Vec<Vec<u8>> = Vec::with_capacity(frags.len());
    for (f, cncx_off) in frags.iter().zip(cncx_offsets.iter()) {
        // Label is the decimal insert position, 10 chars, zero-padded.
        // libmobi/Kindle parse this with strtoul to recover the byte
        // offset inside the final reconstructed page.
        let label = format!("{:010}", f.insert_pos);
        let label_bytes = label.as_bytes();

        let values: [Vec<u32>; 4] = [
            vec![*cncx_off],
            vec![f.file_number as u32],
            vec![f.sequence_number as u32],
            vec![f.start_pos as u32, f.length as u32],
        ];
        let entry = encode_indx_entry(label_bytes, &tag_defs, &values);
        entries.push(entry);
    }

    let cncx_records = cncx.into_records();
    let num_cncx = cncx_records.len();

    let data_record = build_indx_data_record(&entries);
    let last_label = format!("{:010}", frags.last().unwrap().insert_pos)
        .into_bytes();
    let primary = build_indx_primary(
        &tagx,
        1,
        frags.len(),
        num_cncx,
        &[(last_label, frags.len() as u32)],
    );

    (vec![primary, data_record], cncx_records)
}

/// Build the NCX INDX (primary + data) with the 5-tag structure required
/// by Kindle firmware, plus the CNCX record holding the TOC label.
///
/// Tags: 1 (offset), 2 (length), 3 (CNCX label), 4 (depth), 6 (pos_fid).
/// Emits a single NCX entry covering the entire text, with the book title
/// (referenced via tag 3) as its label, matching kindlegen's behavior for
/// simple comics. Returns (indx_records, cncx_records).
fn build_ncx_indx(
    title: &str,
    skeleton_entries: &[SkeletonEntry],
    text_length: usize,
) -> (Vec<Vec<u8>>, Vec<Vec<u8>>) {
    // Build CNCX containing the book title as the TOC label.
    let mut ncx_cncx = CncxBuilder::new();
    let label_offset = ncx_cncx.add(title);

    // 5 tags matching kindlegen's NCX layout:
    //   tag 1: offset (1 val, mask 0x01)
    //   tag 2: length (1 val, mask 0x02)
    //   tag 3: CNCX label offset (1 val, mask 0x04)
    //   tag 4: depth (1 val, mask 0x08)
    //   tag 6: pos_fid (2 vals, mask 0x10)
    let tag_defs = [
        TagMeta { number: 1, values_per_entry: 1, mask: 0x01 },
        TagMeta { number: 2, values_per_entry: 1, mask: 0x02 },
        TagMeta { number: 3, values_per_entry: 1, mask: 0x04 },
        TagMeta { number: 4, values_per_entry: 1, mask: 0x08 },
        TagMeta { number: 6, values_per_entry: 2, mask: 0x10 },
    ];
    let tagx = build_tagx(&[(1, 1, 0x01), (2, 1, 0x02), (3, 1, 0x04), (4, 1, 0x08), (6, 2, 0x10)]);

    // Single NCX entry covering the entire book.
    let offset = if skeleton_entries.is_empty() { 0 } else { skeleton_entries[0].start_pos };
    let length = if text_length > offset { text_length - offset } else { 0 };

    // Label "0" matches kindlegen's minimal NCX label format.
    let label = b"0";
    let values: [Vec<u32>; 5] = [
        vec![offset as u32],        // tag 1: byte offset in text
        vec![length as u32],        // tag 2: length of region
        vec![label_offset],         // tag 3: CNCX offset for title
        vec![0],                    // tag 4: depth (flat)
        vec![0, 0],                 // tag 6: pos_fid (frag 0, offset 0)
    ];
    let entry = encode_indx_entry(label, &tag_defs, &values);

    let data_record = build_indx_data_record(&[entry]);
    let ncx_cncx_count = ncx_cncx.record_count();
    let primary = build_indx_primary(&tagx, 1, 1, ncx_cncx_count, &[(label.to_vec(), 1u32)]);

    (vec![primary, data_record], ncx_cncx.into_records())
}

/// Build a DATP record (stub - minimal valid record).
fn build_datp() -> Vec<u8> {
    // DATP contains precomputed TBS (Trailing Byte Sequence) index data.
    // An all-zeros DATP crashes the Kindle renderer. These bytes are from
    // a working kindlegen output for a simple comic (1 text record, 1 NCX
    // entry). The format encodes NCX-to-text-record mappings.
    // TODO: generate dynamically for multi-record text layouts.
    vec![
        0x44, 0x41, 0x54, 0x50, // "DATP"
        0x00, 0x00, 0x00, 0x0D, // header value
        0x01, 0x04, 0x00, 0x04,
        0x02, 0x00, 0x00, 0x06,
        0x19, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x01,
        0x6D, 0x02, 0x46, 0x02,
        0x66, 0x00, 0x00, 0x00,
    ]
}

/// Build a minimal INDX pair for empty tables.
fn minimal_indx() -> Vec<Vec<u8>> {
    let tagx = build_tagx(&[(1, 1, 1)]);
    let data = build_indx_data_record(&[]);
    let primary = build_indx_primary(&tagx, 1, 0, 0, &[]);
    vec![primary, data]
}

/// Write a big-endian u32 into a byte buffer.
fn put32(buf: &mut [u8], offset: usize, value: u32) {
    buf[offset..offset + 4].copy_from_slice(&value.to_be_bytes());
}

// Silence an unused import warning from `encode_vwi` — kept for
// symmetry with `encode_vwi_inv` so contributors discovering this
// module have both forms visible next to each other.
#[allow(dead_code)]
fn _vwi_import_keepalive(v: u32) -> Vec<u8> {
    encode_vwi(v)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    fn make_comic_page(aid_body: &str, img_src: &str) -> String {
        format!(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>Page</title></head><body aid=\"{}\"><div aid=\"1\"><img src=\"{}\"/></div></body></html>",
            aid_body, img_src
        )
    }

    #[test]
    fn base32_aid_encoder() {
        assert_eq!(encode_aid_base32(0), "0");
        assert_eq!(encode_aid_base32(1), "1");
        assert_eq!(encode_aid_base32(9), "9");
        assert_eq!(encode_aid_base32(10), "A");
        assert_eq!(encode_aid_base32(31), "V");
        assert_eq!(encode_aid_base32(32), "10");
        assert_eq!(encode_aid_base32(33), "11");
        assert_eq!(encode_aid_base32(63), "1V");
        assert_eq!(encode_aid_base32(1024), "100");
    }

    #[test]
    fn base32_4char_image_recindex() {
        assert_eq!(encode_base32_4char(0), "0000");
        assert_eq!(encode_base32_4char(1), "0001");
        assert_eq!(encode_base32_4char(32), "0010");
        assert_eq!(encode_base32_4char(1024), "0100");
    }

    #[test]
    fn find_body_open_and_close() {
        let html = make_comic_page("0", "img1.jpg");
        let (_, after_open, aid) = find_body_open(html.as_bytes()).expect("body open");
        assert_eq!(aid, "0");
        // `after_open` should point to just past `<body aid="0">`
        let head = &html[..after_open];
        assert!(head.ends_with("<body aid=\"0\">"));

        let close = find_last_close_body(html.as_bytes()).expect("body close");
        assert_eq!(&html[close..close + 7], "</body>");
    }

    #[test]
    fn split_skeleton_reconstructs_original() {
        let html = make_comic_page("0", "img1.jpg");
        let split = split_skeleton_and_body(&html);
        // Skeleton must have an empty body.
        assert!(split.skeleton.contains("<body aid=\"0\"></body>"));
        // body_inner_offset must point at the first byte after `<body aid="0">`.
        assert_eq!(&split.skeleton[split.body_inner_offset..split.body_inner_offset + 7], "</body>");
        // body_inner should be the original body contents.
        assert!(split.body_inner.starts_with("<div aid=\"1\">"));
        assert!(split.body_inner.contains("<img src=\"img1.jpg\"/>"));
        assert!(!split.body_inner.contains("<body"));
        // Reconstructing should give back the original HTML byte-for-byte.
        let mut rebuilt = String::new();
        rebuilt.push_str(&split.skeleton[..split.body_inner_offset]);
        rebuilt.push_str(&split.body_inner);
        rebuilt.push_str(&split.skeleton[split.body_inner_offset..]);
        assert_eq!(rebuilt, html);
    }

    #[test]
    fn global_sequence_number_monotonic_across_pages() {
        let parts: Vec<String> = (0..5)
            .map(|i| {
                format!(
                    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><title>P</title></head><body><div><img src=\"page{}.jpg\"/></div></body></html>",
                    i
                )
            })
            .collect();
        let href_to_recindex = std::collections::HashMap::new();
        let spine_items: Vec<(String, String)> = Vec::new();
        let (combined, skels, frags) =
            build_kf8_html(&parts, &href_to_recindex, &spine_items, false);
        assert_eq!(skels.len(), 5);
        assert_eq!(frags.len(), 5);
        // Global sequence numbers must be 0, 1, 2, 3, 4 in order.
        for (i, f) in frags.iter().enumerate() {
            assert_eq!(f.sequence_number, i, "seq #{i}");
            assert_eq!(f.file_number, i, "file_number #{i}");
        }
        // Each skeleton's start_pos should point at that skeleton inside
        // the combined buffer.
        for s in &skels {
            assert_eq!(
                &combined[s.start_pos..s.start_pos + s.length.min(6)],
                &b"<?xml "[..6]
            );
        }
        // Insert positions must fall inside the skeleton byte range.
        for (f, s) in frags.iter().zip(skels.iter()) {
            assert!(f.insert_pos >= s.start_pos);
            assert!(f.insert_pos <= s.start_pos + s.length);
        }
    }

    #[test]
    fn fragment_selector_uses_body_aid() {
        let parts = vec![make_comic_page("0", "img.jpg")];
        let href_to_recindex = std::collections::HashMap::new();
        let spine_items: Vec<(String, String)> = Vec::new();
        let (_, _, frags) = build_kf8_html(&parts, &href_to_recindex, &spine_items, false);
        assert_eq!(frags.len(), 1);
        assert_eq!(frags[0].selector, "P-//*[@aid='0']");
    }

    #[test]
    fn process_kf8_part_adds_aid_to_body() {
        let html = "<html><head></head><body><div><img src=\"a.jpg\"/></div></body></html>";
        let mut counter = 0u32;
        let lookup = std::collections::HashMap::new();
        let out = process_kf8_part(html, &mut counter, &lookup, false);
        assert!(out.contains("<body"));
        assert!(out.contains("aid=\""));
    }

    #[test]
    fn control_byte_matches_calibre_skeleton() {
        // Calibre SkelIndex: tag 1 (mask=3, vpe=1) nvals=2, tag 6
        // (mask=12, vpe=2) nvals=4 → expected control byte 0x0A.
        let tag_defs = [
            TagMeta { number: 1, values_per_entry: 1, mask: 3 },
            TagMeta { number: 6, values_per_entry: 2, mask: 12 },
        ];
        assert_eq!(control_byte_for(&tag_defs, &[2, 4]), 0x0A);
    }

    #[test]
    fn control_byte_matches_calibre_chunk() {
        // Calibre ChunkIndex: 1+1+1+2 values, masks 1/2/4/8, all with
        // nentries=1 → expected control byte 0x0F.
        let tag_defs = [
            TagMeta { number: 2, values_per_entry: 1, mask: 1 },
            TagMeta { number: 3, values_per_entry: 1, mask: 2 },
            TagMeta { number: 4, values_per_entry: 1, mask: 4 },
            TagMeta { number: 6, values_per_entry: 2, mask: 8 },
        ];
        assert_eq!(control_byte_for(&tag_defs, &[1, 1, 1, 2]), 0x0F);
    }

    #[test]
    fn skeleton_indx_records_have_correct_magic_and_length() {
        let skels = vec![SkeletonEntry {
            label: "SKEL0000000000".to_string(),
            start_pos: 0,
            length: 100,
            chunk_count: 1,
        }];
        let recs = build_skeleton_indx(&skels);
        assert_eq!(recs.len(), 2);
        assert_eq!(&recs[0][0..4], b"INDX");
        assert_eq!(&recs[1][0..4], b"INDX");
        // Primary: header_length=192 at offset 4
        assert_eq!(u32::from_be_bytes(recs[0][4..8].try_into().unwrap()), 192);
        // Primary: type = 2 at offset 16
        assert_eq!(u32::from_be_bytes(recs[0][16..20].try_into().unwrap()), 2);
        // Primary: encoding = 65001 at offset 28
        assert_eq!(u32::from_be_bytes(recs[0][28..32].try_into().unwrap()), 65001);
        // Primary: tagx_offset = 192 at offset 180
        assert_eq!(u32::from_be_bytes(recs[0][180..184].try_into().unwrap()), 192);
        // Data record: "type=1" marker at offset 12
        assert_eq!(u32::from_be_bytes(recs[1][12..16].try_into().unwrap()), 1);
    }

    #[test]
    fn fragment_indx_emits_cncx_records() {
        let frags = vec![FragmentEntry {
            insert_pos: 201,
            selector: "P-//*[@aid='0']".to_string(),
            file_number: 0,
            sequence_number: 0,
            start_pos: 0,
            length: 42,
        }];
        let (recs, cncx) = build_fragment_indx_with_cncx(&frags);
        assert_eq!(recs.len(), 2);
        assert_eq!(cncx.len(), 1, "should emit one CNCX record");
        // CNCX record should embed the selector string
        assert!(
            cncx[0].windows(b"P-//*[@aid='0']".len()).any(|w| w == b"P-//*[@aid='0']"),
            "selector not found in CNCX record"
        );
        // Primary INDX header should report num_of_cncx = 1 at offset 52.
        assert_eq!(u32::from_be_bytes(recs[0][52..56].try_into().unwrap()), 1);
    }

    #[test]
    fn kf8_trailer_has_tbs() {
        let text = b"<html><body>hello</body></html>";
        let (records, _) = compress_text_kf8(text, 1);
        assert_eq!(records.len(), 1);
        // Trailing bytes: a multibyte-overlap byte (0x00) followed by the TBS
        // entry, 4 trailing bytes in total; see the trailer-stripping sketch
        // after this test.
        let rec = &records[0];
        let len = rec.len();
        assert!(len >= 4, "record too short for trailing bytes");
        // TBS for 1 skeleton: [0x82, 0x81, 0x83] (type 2, value=2*1-1=1, size 3)
        assert_eq!(rec[len - 1], 0x83, "TBS size byte should be 0x83");
        assert_eq!(rec[len - 2], 0x81, "TBS value byte should be 0x81 (inv VWI for 1)");
        assert_eq!(rec[len - 3], 0x82, "TBS type byte should be 0x82");
        // Multibyte byte is before TBS
        assert_eq!(rec[len - 4], 0x00, "multibyte byte should be 0x00");
    }
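
    // Illustrative sketch of stripping the trailers checked above, based on a
    // plausible reading of the MOBI trailing-entry format (this is not the
    // crate's reader): a trailing entry ends in an inverted-VWI size byte that
    // counts the whole entry, and the multibyte-overlap entry is one byte
    // whose low two bits give the overlap count.
    #[test]
    fn tbs_trailer_strip_sketch() {
        // Fake record: text payload, then multibyte byte 0x00, then the 3-byte
        // TBS entry [0x82, 0x81, 0x83] whose final byte encodes its own size.
        let mut rec = b"hello".to_vec();
        rec.extend_from_slice(&[0x00, 0x82, 0x81, 0x83]);
        let tbs_size = (rec[rec.len() - 1] & 0x7F) as usize;
        assert_eq!(tbs_size, 3);
        let without_tbs = &rec[..rec.len() - tbs_size];
        let overlap = (without_tbs[without_tbs.len() - 1] & 0b11) as usize;
        assert_eq!(overlap, 0);
        let text = &without_tbs[..without_tbs.len() - 1 - overlap];
        assert_eq!(text, b"hello");
    }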

    #[test]
    fn build_kf8_section_records_are_well_formed() {
        // End-to-end sanity check: build a KF8 section for two simple
        // comic pages and verify all the top-level records exist and
        // have the correct magic bytes / tag counts.
        let parts = vec![
            make_comic_page("0", "p1.jpg"),
            make_comic_page("0", "p2.jpg"),
        ];
        let css = "body{margin:0}";
        let href_to_recindex = std::collections::HashMap::new();
        let spine_items: Vec<(String, String)> = Vec::new();
        let section = build_kf8_section(
            &parts,
            css,
            &href_to_recindex,
            &spine_items,
            true,  // no_compress for deterministic byte counts
            false, // kindlegen_parity
            "Test Book",
        );
        assert!(!section.text_records.is_empty());
        assert_eq!(&section.fdst[0..4], b"FDST");
        assert_eq!(section.flow_count, 2);
        assert_eq!(section.fragment_indx.len(), 2); // primary + data
        assert_eq!(section.skeleton_indx.len(), 2);
        assert_eq!(section.ncx_indx.len(), 2);
        assert_eq!(section.cncx_records.len(), 1, "one CNCX record for the deduped selector");
        // Skeleton and fragment primaries should declare 2 entries each.
        assert_eq!(
            u32::from_be_bytes(section.skeleton_indx[0][36..40].try_into().unwrap()),
            2
        );
        assert_eq!(
            u32::from_be_bytes(section.fragment_indx[0][36..40].try_into().unwrap()),
            2
        );
        // Fragment primary must reference one CNCX record at offset 52.
        assert_eq!(
            u32::from_be_bytes(section.fragment_indx[0][52..56].try_into().unwrap()),
            1
        );
        // Text records end with TBS trailing bytes: the first record's TBS is
        // a 3-byte entry ending in size byte 0x83, while each subsequent
        // record carries a lone TBS size byte 0x81.
        for (i, r) in section.text_records.iter().enumerate() {
            let last = *r.last().unwrap();
            if i == 0 {
                assert_eq!(last, 0x83, "first text record TBS size byte should be 0x83");
            } else {
                assert_eq!(last, 0x81, "subsequent text record TBS size byte should be 0x81");
            }
        }
    }

    #[test]
    fn cncx_uses_inverted_vwi() {
        // CNCX string lengths must use inverted VWI (high bit = last byte).
        // Forward VWI causes Kindle to misparse fragment selector strings.
        let mut b = crate::cncx::CncxBuilder::new();
        b.add("P-//*[@aid='0']"); // 15-byte string
        let recs = b.into_records();
        assert_eq!(recs.len(), 1);
        // First byte should be 0x8F (inverted VWI for 15: 0x0F | 0x80)
        assert_eq!(recs[0][0], 0x8F,
            "CNCX length prefix must be inverted VWI (0x8F for len 15), got 0x{:02X}",
            recs[0][0]);
    }
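
    // Illustrative sketch of the "inverted VWI" convention the CNCX tests in
    // this module depend on (not the crate's encoder): 7 bits per byte,
    // big-endian, with the stop bit (0x80) on the final byte, so values below
    // 128 collapse to a single byte 0x80 | value.
    #[test]
    fn inverted_vwi_sketch_matches_expected_prefixes() {
        fn encode(mut value: u32) -> Vec<u8> {
            // The last byte carries the low 7 bits plus the stop bit.
            let mut out = vec![(value & 0x7F) as u8 | 0x80];
            value >>= 7;
            while value != 0 {
                // Higher 7-bit groups are prepended without the stop bit.
                out.insert(0, (value & 0x7F) as u8);
                value >>= 7;
            }
            out
        }
        assert_eq!(encode(15), vec![0x8F]); // prefix for the 15-byte selector
        assert_eq!(encode(14), vec![0x8E]); // prefix for "My Comic Title"
    }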

    #[test]
    fn ncx_indx_has_five_tags() {
        // NCX INDX must have 5 tags (offset, length, label, depth, pos_fid)
        // for Kindle firmware to render content. A 1-tag stub crashes the renderer.
        let skels = vec![SkeletonEntry {
            label: "SKEL0000000000".to_string(),
            start_pos: 0,
            length: 100,
            chunk_count: 1,
        }];
        let (ncx_recs, ncx_cncx) = build_ncx_indx("Test", &skels, 500);

        // Should produce primary + data records
        assert_eq!(ncx_recs.len(), 2, "NCX should have primary + data");
        // Should produce CNCX records
        assert!(!ncx_cncx.is_empty(), "NCX should have CNCX for labels");

        // Primary record: check TAGX has 5 tag definitions
        let primary = &ncx_recs[0];
        let tagx_off = u32::from_be_bytes(primary[180..184].try_into().unwrap()) as usize;
        assert_eq!(&primary[tagx_off..tagx_off+4], b"TAGX");
        let tagx_len = u32::from_be_bytes(primary[tagx_off+4..tagx_off+8].try_into().unwrap()) as usize;
        // Count tags (4 bytes each, ending with sentinel [0,0,0,1])
        let mut tag_count = 0;
        let mut pos = tagx_off + 12;
        while pos < tagx_off + tagx_len {
            if primary[pos+3] == 1 { break; } // sentinel
            tag_count += 1;
            pos += 4;
        }
        assert_eq!(tag_count, 5, "NCX TAGX must define 5 tags, got {}", tag_count);

        // Primary: num_cncx should be >= 1
        let num_cncx = u32::from_be_bytes(primary[52..56].try_into().unwrap());
        assert!(num_cncx >= 1, "NCX num_cncx must be >= 1, got {}", num_cncx);

        // Primary: total entries = 1
        let total = u32::from_be_bytes(primary[36..40].try_into().unwrap());
        assert_eq!(total, 1, "NCX should have 1 entry for simple book");
    }
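
    // Illustrative sketch of the TAGX layout the counting loop above walks:
    // "TAGX", total block length, control byte count, then 4-byte entries
    // (tag, values_per_entry, mask, end_flag) closed by the sentinel
    // (0, 0, 0, 1). The tag numbers, masks, and values_per_entry below are
    // assumed for illustration, not read from the crate's NCX writer.
    #[test]
    fn tagx_tag_counting_sketch() {
        let mut tagx = Vec::new();
        tagx.extend_from_slice(b"TAGX");
        tagx.extend_from_slice(&36u32.to_be_bytes()); // 12-byte header + 6 entries
        tagx.extend_from_slice(&1u32.to_be_bytes()); // one control byte
        for entry in [
            [1u8, 1, 1, 0], // offset
            [2, 1, 2, 0],   // length
            [3, 1, 4, 0],   // label (CNCX offset)
            [4, 1, 8, 0],   // depth
            [6, 2, 16, 0],  // pos:fid
            [0, 0, 0, 1],   // sentinel
        ] {
            tagx.extend_from_slice(&entry);
        }
        // Same counting logic as ncx_indx_has_five_tags.
        let mut tag_count = 0;
        let mut pos = 12;
        while pos < tagx.len() {
            if tagx[pos + 3] == 1 { break; } // sentinel
            tag_count += 1;
            pos += 4;
        }
        assert_eq!(tag_count, 5);
    }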

    #[test]
    fn ncx_cncx_contains_title() {
        let skels = vec![SkeletonEntry {
            label: "SKEL0000000000".to_string(),
            start_pos: 0,
            length: 100,
            chunk_count: 1,
        }];
        let (_, ncx_cncx) = build_ncx_indx("My Comic Title", &skels, 500);
        assert_eq!(ncx_cncx.len(), 1);
        // CNCX should contain the title with inverted VWI length prefix
        let cncx = &ncx_cncx[0];
        let title = b"My Comic Title";
        // First byte: inverted VWI for 14 = 0x0E | 0x80 = 0x8E
        assert_eq!(cncx[0], 0x8E,
            "CNCX title length should be 0x8E (inv VWI for 14), got 0x{:02X}", cncx[0]);
        assert_eq!(&cncx[1..1+title.len()], title);
    }

    #[test]
    fn datp_is_32_bytes_with_content() {
        let datp = build_datp();
        assert_eq!(datp.len(), 32, "DATP must be 32 bytes (not 152-byte stub)");
        assert_eq!(&datp[0..4], b"DATP");
        // Must NOT be all zeros after header (crashes Kindle renderer)
        let content = &datp[8..];
        assert!(content.iter().any(|&b| b != 0),
            "DATP content must not be all zeros");
    }

    #[test]
    fn kf8_section_has_ncx_cncx_records() {
        let parts = vec![make_comic_page("0", "p1.jpg")];
        let section = build_kf8_section(
            &parts, "", &std::collections::HashMap::new(),
            &Vec::new(), true, false, "Test",
        );
        assert!(!section.ncx_cncx_records.is_empty(),
            "KF8 section must include NCX CNCX records");
    }

    #[test]
    fn kindle_embed_uses_image_jpg_not_jpeg() {
        // Kindle firmware doesn't recognize image/jpeg in kindle:embed URLs.
        // Must be image/jpg (non-standard but required by Kindle).
        let mut href_to_recindex = std::collections::HashMap::new();
        href_to_recindex.insert("p1.jpg".to_string(), 1usize);
        href_to_recindex.insert("p2.jpg".to_string(), 2usize);

        let parts = vec![
            make_comic_page("0", "p1.jpg"),
            make_comic_page("0", "p2.jpg"),
        ];
        let spine_items: Vec<(String, String)> = Vec::new();

        let section = build_kf8_section(
            &parts,
            "body{margin:0}",
            &href_to_recindex,
            &spine_items,
            true,  // no_compress for readable output
            false, // kindlegen_parity off (test normal mode)
            "Test Book",
        );

        // With no_compress=true the text records hold raw bytes but still
        // carry trailing bytes, so check html_bytes instead: it is the
        // snapshot of the uncompressed HTML flow.
        let html = std::str::from_utf8(&section.html_bytes)
            .expect("KF8 HTML should be valid UTF-8");

        // All kindle:embed references should use image/jpg
        assert!(
            html.contains("image/jpg"),
            "KF8 HTML must contain 'image/jpg' kindle:embed references"
        );
        assert!(
            !html.contains("image/jpeg"),
            "KF8 HTML must NOT contain 'image/jpeg' - Kindle firmware rejects it"
        );
    }
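
    // Illustrative sketch of the kindle:embed reference shape the test above
    // asserts on indirectly. The exact URL form here (four base-32 digits for
    // the 1-based image record index) is an assumption for illustration, not
    // something read from the crate; only the image/jpg vs image/jpeg
    // distinction is checked against real output above.
    #[test]
    fn kindle_embed_mime_normalisation_sketch() {
        let href = "kindle:embed:0001?mime=image/jpeg";
        let normalised = href.replace("image/jpeg", "image/jpg");
        assert_eq!(normalised, "kindle:embed:0001?mime=image/jpg");
        assert!(!normalised.contains("image/jpeg"));
    }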
}