//! Encode some input bytes into the Bao format, or slice an existing encoding.
//!
//! The Bao encoding format makes it possible to stream content bytes while verifying that they
//! match the root hash. It also supports extracting encoded slices that can be verified apart from
//! the rest of the encoding. This module handles the sending side of these operations. For the
//! receiving side, see the `decode` module.
//!
//! There are two modes of encoding, combined (the default) and outboard. The combined mode mixes
//! subtree hashes together with the input bytes, producing a single file that can be decoded by
//! itself. The outboard mode avoids copying any input bytes. The outboard encoding is much
//! smaller, but it can only be used together with the original input file.
//!
//! # Example
//!
//! ```
//! # fn main() -> Result<(), Box<std::error::Error>> {
//! use std::io::prelude::*;
//!
//! let input = b"some input";
//! let expected_hash = bao::hash::hash(input);
//!
//! let (hash, encoded_at_once) = bao::encode::encode_to_vec(b"some input");
//! assert_eq!(expected_hash, hash);
//!
//! let mut encoded_incrementally = Vec::new();
//! {
//!     // The inner block here limits the lifetime of this mutable borrow.
//!     let encoded_cursor = std::io::Cursor::new(&mut encoded_incrementally);
//!     let mut encoder = bao::encode::Writer::new(encoded_cursor);
//!     encoder.write_all(b"some input")?;
//!     let hash = encoder.finish()?;
//!     assert_eq!(expected_hash, hash);
//! }
//!
//! assert_eq!(encoded_at_once, encoded_incrementally);
//! # Ok(())
//! # }
//! ```
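//!
//! Encoding in the outboard mode works the same way. As a minimal sketch of the difference in
//! output (relying only on functions from this module):
//!
//! ```
//! let input = b"some input";
//! let (hash, combined) = bao::encode::encode_to_vec(input);
//! let (outboard_hash, outboard) = bao::encode::encode_outboard_to_vec(input);
//! assert_eq!(hash, outboard_hash);
//! // The outboard encoding stores only the header and subtree hashes, not the
//! // input bytes themselves, so the two sizes differ by exactly the input length.
//! assert_eq!(combined.len(), input.len() + outboard.len());
//! ```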

use arrayvec::ArrayVec;
use blake2b_simd;
use copy_in_place::copy_in_place;
use core::cmp;
use core::fmt;
use hash::Finalization::{self, NotRoot, Root};
use hash::{self, Hash, CHUNK_SIZE, HASH_SIZE, HEADER_SIZE, PARENT_SIZE};
#[cfg(feature = "std")]
use rayon;
#[cfg(feature = "std")]
use std::io;
#[cfg(feature = "std")]
use std::io::prelude::*;
#[cfg(feature = "std")]
use std::io::SeekFrom::{End, Start};

/// Encode the input bytes in the combined mode. `output.len()` must be exactly
/// `encoded_size(input.len())`.
///
/// If the `std` feature is enabled, as it is by default, this will use multiple threads via Rayon.
///
/// # Panics
///
/// Panics if the output slice is the wrong length.
///
/// # Example
///
/// ```
/// let input = b"some bytes";
/// let encoded_size = bao::encode::encoded_size(input.len() as u64);
/// assert!(encoded_size <= usize::max_value() as u128);
/// // Note that if you're allocating a new Vec like this, encode_to_vec is more convenient.
/// let mut encoded = vec![0; encoded_size as usize];
/// bao::encode::encode(input, &mut encoded);
/// ```
pub fn encode(input: &[u8], output: &mut [u8]) -> Hash {
    let content_len = input.len() as u64;
    assert_eq!(
        output.len() as u128,
        encoded_size(content_len),
        "output is the wrong length"
    );
    output[..HEADER_SIZE].copy_from_slice(&hash::encode_len(content_len));
    #[cfg(feature = "std")]
    {
        if input.len() <= hash::MAX_SINGLE_THREADED {
            encode_recurse(input, &mut output[HEADER_SIZE..], Root(content_len))
        } else {
            encode_recurse_rayon(input, &mut output[HEADER_SIZE..], Root(content_len))
        }
    }
    #[cfg(not(feature = "std"))]
    {
        encode_recurse(input, &mut output[HEADER_SIZE..], Root(content_len))
    }
}

/// Encode the first `content_len` bytes from the input buffer in the combined mode, overwriting
/// the input buffer. `buf.len()` must be exactly `encoded_size(content_len as u64)`.
///
/// If the `std` feature is enabled, as it is by default, this will use multiple threads via Rayon.
/// This function is slower than `encode`, however, because only the hashing can be parallelized;
/// copying the input bytes around has to be done on a single thread.
///
/// # Panics
///
/// Panics if the buffer is the wrong length.
///
/// # Example
///
/// ```
/// let input = b"some bytes";
/// let encoded_size = bao::encode::encoded_size(input.len() as u64);
/// assert!(encoded_size <= usize::max_value() as u128);
/// let mut buffer = input.to_vec();
/// buffer.resize(encoded_size as usize, 0);
/// bao::encode::encode_in_place(&mut buffer, input.len());
/// ```
pub fn encode_in_place(buf: &mut [u8], content_len: usize) -> Hash {
    // Note that if you change anything in this function, you should probably
    // also update benchmarks::encode_in_place_fake.
    assert_eq!(
        buf.len() as u128,
        encoded_size(content_len as u64),
        "buf is the wrong length"
    );
    layout_chunks_in_place(buf, 0, HEADER_SIZE, content_len);
    let (header, rest) = buf.split_at_mut(HEADER_SIZE);
    header.copy_from_slice(&hash::encode_len(content_len as u64));
    #[cfg(feature = "std")]
    {
        if content_len <= hash::MAX_SINGLE_THREADED {
            write_parents_in_place(rest, content_len, Root(content_len as u64))
        } else {
            write_parents_in_place_rayon(rest, content_len, Root(content_len as u64))
        }
    }
    #[cfg(not(feature = "std"))]
    {
        write_parents_in_place(rest, content_len, Root(content_len as u64))
    }
}

/// Encode the input bytes in the outboard mode. `output.len()` must be exactly
/// `outboard_size(input.len())`.
///
/// If the `std` feature is enabled, as it is by default, this will use multiple threads via Rayon.
///
/// # Panics
///
/// Panics if the output slice is the wrong length.
///
/// # Example
///
/// ```
/// let input = b"some bytes";
/// let outboard_size = bao::encode::outboard_size(input.len() as u64);
/// assert!(outboard_size <= usize::max_value() as u128);
/// // Note that if you're allocating a new Vec like this, encode_outboard_to_vec is more convenient.
/// let mut outboard = vec![0; outboard_size as usize];
/// bao::encode::encode_outboard(input, &mut outboard);
/// ```
pub fn encode_outboard(input: &[u8], output: &mut [u8]) -> Hash {
    let content_len = input.len() as u64;
    assert_eq!(
        output.len() as u128,
        outboard_size(content_len),
        "output is the wrong length"
    );
    output[..HEADER_SIZE].copy_from_slice(&hash::encode_len(content_len));
    #[cfg(feature = "std")]
    {
        if input.len() <= hash::MAX_SINGLE_THREADED {
            encode_outboard_recurse(input, &mut output[HEADER_SIZE..], Root(content_len))
        } else {
            encode_outboard_recurse_rayon(input, &mut output[HEADER_SIZE..], Root(content_len))
        }
    }
    #[cfg(not(feature = "std"))]
    {
        encode_outboard_recurse(input, &mut output[HEADER_SIZE..], Root(content_len))
    }
}

#[cfg(feature = "std")]
/// A convenience wrapper around `encode`, which allocates a new `Vec` to hold the encoding.
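///
/// # Example
///
/// ```
/// let (hash, encoded) = bao::encode::encode_to_vec(b"some bytes");
/// assert_eq!(hash, bao::hash::hash(b"some bytes"));
/// ```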
pub fn encode_to_vec(input: &[u8]) -> (Hash, Vec<u8>) {
    let size = encoded_size(input.len() as u64) as usize;
    // Unsafe code here could avoid the cost of initialization, but it's not much.
    let mut output = vec![0; size];
    let hash = encode(input, &mut output);
    (hash, output)
}

#[cfg(feature = "std")]
/// A convenience wrapper around `encode_outboard`, which allocates a new `Vec` to hold the
/// encoding.
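///
/// # Example
///
/// ```
/// let input = b"some bytes";
/// let (hash, outboard) = bao::encode::encode_outboard_to_vec(input);
/// assert_eq!(hash, bao::hash::hash(input));
/// // The outboard encoding omits the input bytes themselves.
/// assert_eq!(outboard.len() as u128, bao::encode::outboard_size(input.len() as u64));
/// ```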
pub fn encode_outboard_to_vec(input: &[u8]) -> (Hash, Vec<u8>) {
    let size = outboard_size(input.len() as u64) as usize;
    let mut output = vec![0; size];
    let hash = encode_outboard(input, &mut output);
    (hash, output)
}

fn encode_recurse(input: &[u8], output: &mut [u8], finalization: Finalization) -> Hash {
    debug_assert_eq!(
        output.len() as u128,
        encoded_subtree_size(input.len() as u64)
    );
    if input.len() <= CHUNK_SIZE {
        output.copy_from_slice(input);
        return hash::hash_chunk(input, finalization);
    }
    let left_len = hash::left_len(input.len() as u64);
    let (left_in, right_in) = input.split_at(left_len as usize);
    let (parent_out, rest) = output.split_at_mut(PARENT_SIZE);
    let (left_out, right_out) = rest.split_at_mut(encoded_subtree_size(left_len) as usize);
    let left_hash = encode_recurse(left_in, left_out, NotRoot);
    let right_hash = encode_recurse(right_in, right_out, NotRoot);
    parent_out[..HASH_SIZE].copy_from_slice(left_hash.as_bytes());
    parent_out[HASH_SIZE..].copy_from_slice(right_hash.as_bytes());
    hash::parent_hash(&left_hash, &right_hash, finalization)
}

#[cfg(feature = "std")]
fn encode_recurse_rayon(input: &[u8], output: &mut [u8], finalization: Finalization) -> Hash {
    debug_assert_eq!(
        output.len() as u128,
        encoded_subtree_size(input.len() as u64)
    );
    if input.len() <= CHUNK_SIZE {
        output.copy_from_slice(input);
        return hash::hash_chunk(input, finalization);
    }
    let left_len = hash::left_len(input.len() as u64);
    let (left_in, right_in) = input.split_at(left_len as usize);
    let (parent_out, rest) = output.split_at_mut(PARENT_SIZE);
    let (left_out, right_out) = rest.split_at_mut(encoded_subtree_size(left_len) as usize);
    let (left_hash, right_hash) = rayon::join(
        || encode_recurse_rayon(left_in, left_out, NotRoot),
        || encode_recurse_rayon(right_in, right_out, NotRoot),
    );
    parent_out[..HASH_SIZE].copy_from_slice(left_hash.as_bytes());
    parent_out[HASH_SIZE..].copy_from_slice(right_hash.as_bytes());
    hash::parent_hash(&left_hash, &right_hash, finalization)
}

fn encode_outboard_recurse(input: &[u8], output: &mut [u8], finalization: Finalization) -> Hash {
    debug_assert_eq!(
        output.len() as u128,
        outboard_subtree_size(input.len() as u64)
    );
    if input.len() <= CHUNK_SIZE {
        return hash::hash_chunk(input, finalization);
    }
    let left_len = hash::left_len(input.len() as u64);
    let (left_in, right_in) = input.split_at(left_len as usize);
    let (parent_out, rest) = output.split_at_mut(PARENT_SIZE);
    let (left_out, right_out) = rest.split_at_mut(outboard_subtree_size(left_len) as usize);
    let left_hash = encode_outboard_recurse(left_in, left_out, NotRoot);
    let right_hash = encode_outboard_recurse(right_in, right_out, NotRoot);
    parent_out[..HASH_SIZE].copy_from_slice(left_hash.as_bytes());
    parent_out[HASH_SIZE..].copy_from_slice(right_hash.as_bytes());
    hash::parent_hash(&left_hash, &right_hash, finalization)
}

#[cfg(feature = "std")]
fn encode_outboard_recurse_rayon(
    input: &[u8],
    output: &mut [u8],
    finalization: Finalization,
) -> Hash {
    debug_assert_eq!(
        output.len() as u128,
        outboard_subtree_size(input.len() as u64)
    );
    if input.len() <= CHUNK_SIZE {
        return hash::hash_chunk(input, finalization);
    }
    let left_len = hash::left_len(input.len() as u64);
    let (left_in, right_in) = input.split_at(left_len as usize);
    let (parent_out, rest) = output.split_at_mut(PARENT_SIZE);
    let (left_out, right_out) = rest.split_at_mut(outboard_subtree_size(left_len) as usize);
    let (left_hash, right_hash) = rayon::join(
        || encode_outboard_recurse_rayon(left_in, left_out, NotRoot),
        || encode_outboard_recurse_rayon(right_in, right_out, NotRoot),
    );
    parent_out[..HASH_SIZE].copy_from_slice(left_hash.as_bytes());
    parent_out[HASH_SIZE..].copy_from_slice(right_hash.as_bytes());
    hash::parent_hash(&left_hash, &right_hash, finalization)
}

// This function doesn't check for adequate space. Its caller should check.
fn layout_chunks_in_place(
    buf: &mut [u8],
    read_offset: usize,
    write_offset: usize,
    content_len: usize,
) {
    if content_len <= CHUNK_SIZE {
        copy_in_place(buf, read_offset..read_offset + content_len, write_offset);
    } else {
        let left_len = hash::left_len(content_len as u64) as usize;
        let left_write_offset = write_offset + PARENT_SIZE;
        let right_len = content_len - left_len;
        let right_read_offset = read_offset + left_len;
        let right_write_offset = left_write_offset + encoded_subtree_size(left_len as u64) as usize;
        // Encoding the left side will overwrite some of the space occupied by the right, so do the
        // right side first.
        layout_chunks_in_place(buf, right_read_offset, right_write_offset, right_len);
        layout_chunks_in_place(buf, read_offset, left_write_offset, left_len);
    }
}

// This function doesn't check for adequate space. Its caller should check.
fn write_parents_in_place(buf: &mut [u8], content_len: usize, finalization: Finalization) -> Hash {
    if content_len <= CHUNK_SIZE {
        debug_assert_eq!(content_len, buf.len());
        hash::hash_chunk(buf, finalization)
    } else {
        let left_len = hash::left_len(content_len as u64) as usize;
        let right_len = content_len - left_len;
        let split = encoded_subtree_size(left_len as u64) as usize;
        let (parent, rest) = buf.split_at_mut(PARENT_SIZE);
        let (left_slice, right_slice) = rest.split_at_mut(split);
        let left_hash = write_parents_in_place(left_slice, left_len, NotRoot);
        let right_hash = write_parents_in_place(right_slice, right_len, NotRoot);
        *array_mut_ref!(parent, 0, HASH_SIZE) = *left_hash.as_bytes();
        *array_mut_ref!(parent, HASH_SIZE, HASH_SIZE) = *right_hash.as_bytes();
        hash::parent_hash(&left_hash, &right_hash, finalization)
    }
}

// This function doesn't check for adequate space. Its caller should check.
#[cfg(feature = "std")]
fn write_parents_in_place_rayon(
    buf: &mut [u8],
    content_len: usize,
    finalization: Finalization,
) -> Hash {
    if content_len <= CHUNK_SIZE {
        debug_assert_eq!(content_len, buf.len());
        hash::hash_chunk(buf, finalization)
    } else {
        let left_len = hash::left_len(content_len as u64) as usize;
        let right_len = content_len - left_len;
        let split = encoded_subtree_size(left_len as u64) as usize;
        let (parent, rest) = buf.split_at_mut(PARENT_SIZE);
        let (left_slice, right_slice) = rest.split_at_mut(split);
        let (left_hash, right_hash) = rayon::join(
            || write_parents_in_place_rayon(left_slice, left_len, NotRoot),
            || write_parents_in_place_rayon(right_slice, right_len, NotRoot),
        );
        *array_mut_ref!(parent, 0, HASH_SIZE) = *left_hash.as_bytes();
        *array_mut_ref!(parent, HASH_SIZE, HASH_SIZE) = *right_hash.as_bytes();
        hash::parent_hash(&left_hash, &right_hash, finalization)
    }
}

/// Compute the size of a combined encoding, given the size of the input. Note that for input sizes
/// close to `u64::MAX`, the result can overflow a `u64`.
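///
/// For example, with the current parameters (4096 byte chunks, an 8 byte length header, and 64
/// byte parent nodes), `encoded_size(4096)` is 4096 + 8, and `encoded_size(8192)` is
/// 8192 + 64 + 8.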
pub fn encoded_size(content_len: u64) -> u128 {
    content_len as u128 + outboard_size(content_len)
}

/// Compute the size of an outboard encoding, given the size of the input.
pub fn outboard_size(content_len: u64) -> u128 {
    // Should the return type here really be u128? Two reasons: 1) It's convenient to use the same
    // type as encoded_size(), and 2) if we're ever experimenting with very small chunk sizes, we
    // could indeed overflow u64.
    outboard_subtree_size(content_len) + HEADER_SIZE as u128
}

pub(crate) fn encoded_subtree_size(content_len: u64) -> u128 {
    content_len as u128 + outboard_subtree_size(content_len)
}

pub(crate) fn outboard_subtree_size(content_len: u64) -> u128 {
    // The number of parent nodes is always the number of chunks minus one. To see why this is true,
    // start with a single chunk and incrementally add chunks to the tree. Each new chunk always
    // brings one parent node along with it.
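    // For example, a 3 chunk tree has 2 parent nodes: the parent of the first two chunks, and the
    // root above it.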
    let num_parents = count_chunks(content_len) - 1;
    num_parents as u128 * PARENT_SIZE as u128
}

pub(crate) fn count_chunks(content_len: u64) -> u64 {
    // Two things to watch out for here: the 0-length input still counts as 1 chunk, and we don't
    // want to overflow when content_len is u64::max_value().
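    // For example, with 4096 byte chunks: count_chunks(0) == 1, count_chunks(4096) == 1, and
    // count_chunks(4097) == 2.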
    let full_chunks: u64 = content_len / CHUNK_SIZE as u64;
    let has_partial_chunk: bool = (content_len % CHUNK_SIZE as u64) != 0;
    cmp::max(1, full_chunks + has_partial_chunk as u64)
}

pub(crate) fn chunk_size(chunk: u64, content_len: u64) -> usize {
    let chunk_start = chunk * CHUNK_SIZE as u64;
    cmp::min(CHUNK_SIZE, (content_len - chunk_start) as usize)
}

/// Prior to the final chunk, to calculate the number of post-order parent nodes for a chunk, we
/// need to know the height of the subtree for which the chunk is the rightmost. This is the same as
/// the number of trailing ones in the chunk index (counting from 0). For example, chunk number 11
/// (0b1011) has two trailing ones, and thus two post-order parent nodes.
///
/// Note that this is closely related to the trick we're using in hash::State::needs_merge. The
/// number of trailing zeroes at a given index is the same as the number of ones that switched off
/// when we moved rightward from the previous index.
fn post_order_parent_nodes_nonfinal(chunk: u64) -> u8 {
    (!chunk).trailing_zeros() as u8
}

/// The final chunk of a post order tree has to have a parent node for each of the not yet merged
/// subtrees behind it. This is the same as the total number of ones in the chunk index (counting
/// from 0).
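/// For example, if chunk number 11 (0b1011) is the final chunk, it has three parent nodes.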
fn post_order_parent_nodes_final(chunk: u64) -> u8 {
    chunk.count_ones() as u8
}

/// In pre-order there are a couple considerations for counting the number of parent nodes:
///
/// - The number of parents for the first chunk in a tree is equal to the bit length of the index
///   of the final chunk (counting from 0). For example, a tree of 16 chunks (final chunk index 15
///   or 0b1111) has 4 leading parent nodes, while a tree of 17 chunks has 5.
/// - In the interior of the tree -- ignoring the chunks near the end for a moment -- the number of
///   parent nodes is the height of the tree for which the given chunk is the leftmost. This is
///   equal to the number of trailing zeros in the chunk index. This ends up being similar to the
///   post_order_parent_nodes_nonfinal calculation above, except offset by one.
///
/// Unlike the post-order logic above, where all the subtrees we're looking at before the final
/// chunk are complete, the pre-order case has to account for partial subtrees. For example, chunk 4
/// would normally (in any tree of 8 or more chunks) be the start of a subtree of size 4 and height
/// 2. However, if the tree has a total of 7 chunks, then the subtree starting at chunk 4 is only of
/// size 3. And if the tree has a total of 5 chunks, then chunk 4 is the final chunk and the only
/// chunk in its subtree.
///
/// To account for this, for every chunk after the first, we take the minimum of both rules, with
/// respect to the number of chunks *remaining*. For example, in the 7 chunk tree, chunk 4 starts a
/// subtree of the 3 remaining chunks. That means it still has 2 parent nodes, because a 3 chunk
/// tree is still of height 2. But in the 5 chunk tree, chunk 4 has no parent nodes at all, because
/// a 1 chunk tree is of height 0.
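///
/// As a quick check of the examples above: for chunk 4 of a 7 chunk tree, 3 chunks remain, so the
/// starting bound is the bit length of 2, which is 2, and the interior bound is the number of
/// trailing zeros in 4, also 2; the minimum is 2. For chunk 4 of a 5 chunk tree, 1 chunk remains,
/// so the starting bound is the bit length of 0, which is 0, and the minimum is 0.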
pub(crate) fn pre_order_parent_nodes(chunk: u64, content_len: u64) -> u8 {
    let total_chunks = count_chunks(content_len);
    debug_assert!(
        chunk < total_chunks,
        "attempted to count parent nodes after EOF"
    );
    let remaining = total_chunks - chunk;
    let starting_bound = 64 - (remaining - 1).leading_zeros();
    let interior_bound = chunk.trailing_zeros();
    cmp::min(starting_bound, interior_bound) as u8
}

#[derive(Clone)]
struct FlipperState {
    parents: ArrayVec<[hash::ParentNode; hash::MAX_DEPTH]>,
    content_len: u64,
    chunk_moved: u64,
    parents_needed: u8,
    parents_available: u8,
}

impl FlipperState {
    pub fn new(content_len: u64) -> Self {
        let total_chunks = count_chunks(content_len);
        Self {
            parents: ArrayVec::new(),
            content_len,
            chunk_moved: total_chunks,
            parents_needed: post_order_parent_nodes_final(total_chunks - 1),
            parents_available: 0,
        }
    }

    pub fn next(&self) -> FlipperNext {
        // chunk_moved() adds both the parents_available for the chunk just moved and the
        // parents_needed for the chunk to its left, so we have to do TakeParent first.
        if self.parents_available > 0 {
            FlipperNext::TakeParent
        } else if self.parents_needed > 0 {
            FlipperNext::FeedParent
        } else if self.chunk_moved > 0 {
            FlipperNext::Chunk(chunk_size(self.chunk_moved - 1, self.content_len))
        } else {
            FlipperNext::Done
        }
    }

    pub fn chunk_moved(&mut self) {
        // Add the pre-order parents available for the chunk that just moved and the post-order
        // parents needed for the chunk to its left.
        debug_assert!(self.chunk_moved > 0);
        debug_assert_eq!(self.parents_available, 0);
        debug_assert_eq!(self.parents_needed, 0);
        self.chunk_moved -= 1;
        self.parents_available = pre_order_parent_nodes(self.chunk_moved, self.content_len);
        if self.chunk_moved > 0 {
            self.parents_needed = post_order_parent_nodes_nonfinal(self.chunk_moved - 1);
        }
    }

    pub fn feed_parent(&mut self, parent: hash::ParentNode) {
        debug_assert!(self.chunk_moved > 0);
        debug_assert_eq!(self.parents_available, 0);
        debug_assert!(self.parents_needed > 0);
        self.parents_needed -= 1;
        self.parents.push(parent);
    }

    pub fn take_parent(&mut self) -> hash::ParentNode {
        debug_assert!(self.parents_available > 0);
        self.parents_available -= 1;
        self.parents.pop().expect("took too many parents")
    }
}

impl fmt::Debug for FlipperState {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "FlipperState {{ parents: {}, content_len: {}, chunk_moved: {}, parents_needed: {}, parents_available: {} }}",
               self.parents.len(), self.content_len, self.chunk_moved, self.parents_needed, self.parents_available)
    }
}

#[derive(Clone, Copy, Debug)]
enum FlipperNext {
    FeedParent,
    TakeParent,
    Chunk(usize),
    Done,
}

/// An incremental encoder. Note that you must call `finish` after you're done writing.
///
/// `Writer` supports both combined and outboard encoding, depending on which constructor you use.
///
/// `Writer` is currently only available when `std` is enabled, because `std::io::Write` is a
/// required part of its interface. However, it could be extended to support `no_std`-compatible
/// traits outside of the standard library too. Please reach out to me if you need that.
///
/// This implementation is single-threaded.
///
/// # Example
///
/// ```
/// # fn main() -> Result<(), Box<std::error::Error>> {
/// use std::io::prelude::*;
///
/// let mut encoded_incrementally = Vec::new();
/// let encoded_cursor = std::io::Cursor::new(&mut encoded_incrementally);
/// let mut encoder = bao::encode::Writer::new(encoded_cursor);
/// encoder.write_all(b"some input")?;
/// encoder.finish()?;
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "std")]
#[derive(Clone, Debug)]
pub struct Writer<T: Read + Write + Seek> {
    inner: T,
    total_len: u64,
    chunk_state: blake2b_simd::State,
    tree_state: hash::State,
    outboard: bool,
}

#[cfg(feature = "std")]
impl<T: Read + Write + Seek> Writer<T> {
    /// Create a new `Writer` that will produce a combined encoding. The encoding will contain all
    /// the input bytes, so that it can be decoded without the original input file. This is what
    /// you get from `bao encode`.
    pub fn new(inner: T) -> Self {
        Self {
            inner,
            total_len: 0,
            chunk_state: hash::new_chunk_state(),
            tree_state: hash::State::new(),
            outboard: false,
        }
    }

    /// Create a new `Writer` for making an outboard encoding. That means that the encoding won't
    /// include any input bytes. Instead, the input will need to be supplied as a separate argument
    /// when the outboard encoding is later decoded. This is what you get from `bao encode
    /// --outboard`.
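    ///
    /// # Example
    ///
    /// ```
    /// # fn main() -> Result<(), Box<std::error::Error>> {
    /// use std::io::prelude::*;
    ///
    /// let mut outboard = Vec::new();
    /// {
    ///     let cursor = std::io::Cursor::new(&mut outboard);
    ///     let mut encoder = bao::encode::Writer::new_outboard(cursor);
    ///     encoder.write_all(b"some input")?;
    ///     encoder.finish()?;
    /// }
    /// # Ok(())
    /// # }
    /// ```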
    pub fn new_outboard(inner: T) -> Self {
        let mut writer = Self::new(inner);
        writer.outboard = true;
        writer
    }

    /// Finalize the encoding, after all the input has been written. You can't use this type again
    /// after calling `finish`.
    ///
    /// The underlying strategy of the `Writer` is to first store the tree in a post-order layout,
    /// and then to go back and flip the entire thing into pre-order. That makes it possible to
    /// stream input without knowing its length in advance, which is a core requirement of the
    /// `std::io::Write` interface. The downside is that `finish` is a relatively expensive step.
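    ///
    /// For example, a two chunk input is written in post-order as
    /// `[chunk0][chunk1][parent][header]`, and the flip turns it into the pre-order layout
    /// `[header][parent][chunk0][chunk1]`.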
    pub fn finish(&mut self) -> io::Result<Hash> {
        // First finish the post-order encoding.
        let root_hash;
        if self.total_len <= CHUNK_SIZE as u64 {
            root_hash = hash::finalize_hash(&mut self.chunk_state, Root(self.total_len));
        } else {
            let chunk_hash = hash::finalize_hash(&mut self.chunk_state, NotRoot);
            self.tree_state
                .push_subtree(&chunk_hash, self.chunk_state.count() as usize);
            loop {
                match self.tree_state.merge_finish() {
                    hash::StateFinish::Parent(parent) => self.inner.write_all(&parent)?,
                    hash::StateFinish::Root(root) => {
                        root_hash = root;
                        break;
                    }
                }
            }
        }
        self.inner.write_all(&hash::encode_len(self.total_len))?;

        // Then flip the tree to be pre-order.
        self.flip_post_order_stream()?;

        Ok(root_hash)
    }

    fn flip_post_order_stream(&mut self) -> io::Result<()> {
        let mut write_cursor = self.inner.seek(End(0))?;
        let mut read_cursor = write_cursor - HEADER_SIZE as u64;
        let mut header = [0; HEADER_SIZE];
        self.inner.seek(Start(read_cursor))?;
        self.inner.read_exact(&mut header)?;
        let content_len = hash::decode_len(&header);
        let mut flipper = FlipperState::new(content_len);
        loop {
            match flipper.next() {
                FlipperNext::FeedParent => {
                    let mut parent = [0; PARENT_SIZE];
                    self.inner.seek(Start(read_cursor - PARENT_SIZE as u64))?;
                    self.inner.read_exact(&mut parent)?;
                    read_cursor -= PARENT_SIZE as u64;
                    flipper.feed_parent(parent);
                }
                FlipperNext::TakeParent => {
                    let parent = flipper.take_parent();
                    self.inner.seek(Start(write_cursor - PARENT_SIZE as u64))?;
                    self.inner.write_all(&parent)?;
                    write_cursor -= PARENT_SIZE as u64;
                }
                FlipperNext::Chunk(size) => {
                    // In outboard mode, we skip over chunks.
                    if !self.outboard {
                        let mut chunk = [0; CHUNK_SIZE];
                        self.inner.seek(Start(read_cursor - size as u64))?;
                        self.inner.read_exact(&mut chunk[..size])?;
                        read_cursor -= size as u64;
                        self.inner.seek(Start(write_cursor - size as u64))?;
                        self.inner.write_all(&chunk[..size])?;
                        write_cursor -= size as u64;
                    }
                    flipper.chunk_moved();
                }
                FlipperNext::Done => {
                    debug_assert_eq!(HEADER_SIZE as u64, write_cursor);
                    self.inner.seek(Start(0))?;
                    self.inner.write_all(&header)?;
                    return Ok(());
                }
            }
        }
    }
}

#[cfg(feature = "std")]
impl<T: Read + Write + Seek> Write for Writer<T> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        if buf.is_empty() {
            // Without more bytes coming, we're not sure how to finalize.
            return Ok(0);
        }
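        // A full chunk is finalized lazily here, at the start of the next write, rather than as
        // soon as it fills. Until more input arrives, we can't know whether a full chunk is the
        // root and needs the Root finalization.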
        if self.chunk_state.count() as usize == CHUNK_SIZE {
            let chunk_hash = hash::finalize_hash(&mut self.chunk_state, NotRoot);
            self.chunk_state = hash::new_chunk_state();
            self.tree_state.push_subtree(&chunk_hash, CHUNK_SIZE);
            while let Some(parent) = self.tree_state.merge_parent() {
                self.inner.write_all(&parent)?;
            }
        }
        let want = CHUNK_SIZE - self.chunk_state.count() as usize;
        let take = cmp::min(want, buf.len());
        // The outboard mode skips writing content to the stream.
        let written = if self.outboard {
            take
        } else {
            self.inner.write(&buf[..take])?
        };
        self.chunk_state.update(&buf[..written]);
        self.total_len += written as u64;
        Ok(written)
    }

    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}

// This is in its own module to enforce privacy. For example, callers should only ever read
// content_len by calling len_next().
use self::parse_state::StateNext;
pub(crate) mod parse_state {
    use super::*;

    #[derive(Clone, Debug)]
    pub(crate) struct ParseState {
        content_len: Option<u64>,
        next_chunk: u64,
        upcoming_parents: u8,
        stack_depth: u8,
        encoded_offset: u128,
        length_verified: bool,
        at_root: bool,
    }

    impl ParseState {
        pub(crate) fn new() -> Self {
            Self {
                content_len: None,
                next_chunk: 0,
                upcoming_parents: 0,
                stack_depth: 1,
                encoded_offset: 0,
                length_verified: false,
                at_root: true,
            }
        }

        pub(crate) fn position(&self) -> u64 {
            if let Some(content_len) = self.content_len {
                cmp::min(
                    content_len,
                    self.next_chunk.saturating_mul(CHUNK_SIZE as u64),
                )
            } else {
                0
            }
        }

        // VerifyState needs this to know when to pop nodes during a seek.
        pub(crate) fn stack_depth(&self) -> usize {
            self.stack_depth as usize
        }

        // As with len_next, the ParseState doesn't strictly need to know about finalizations to do
        // its job. But its callers need to finalize, and we want to tightly gate access to
        // content_len (so that it doesn't get accidentally used without verifying it), so we
        // centralize the logic here.
        pub(crate) fn finalization(&self) -> Finalization {
            let content_len = self.content_len.expect("finalization with no len");
            if self.at_root {
                Root(content_len)
            } else {
                NotRoot
            }
        }

        fn reset_to_root(&mut self) {
            self.next_chunk = 0;
            self.upcoming_parents = pre_order_parent_nodes(0, self.content_len.unwrap());
            self.encoded_offset = HEADER_SIZE as u128;
            self.at_root = true;
            self.stack_depth = 1;
        }

        // Strictly speaking, since ParseState doesn't verify anything, it could just return the
        // content_len from a parsed header without any extra fuss. However, all of the users of this
        // struct either need to do verifying themselves (VerifyState) or will produce output that
        // needs to have all the verifiable data in it (SliceExtractor). So it makes sense to
        // centralize the length reading logic here.
        //
        // Note that if reading the length returns StateNext::Chunk (leading the caller to call
        // feed_chunk), the content position will no longer be at the start, as with a standard read.
        // All of our callers buffer the last chunk, so this won't ultimately cause the user to skip
        // over any input. But a caller that didn't buffer anything would need to account for this
        // somehow.
        pub(crate) fn len_next(&self) -> LenNext {
            match (self.content_len, self.length_verified) {
                (None, false) => LenNext::Next(StateNext::Header),
                (None, true) => unreachable!(),
                (Some(len), false) => {
                    if self.upcoming_parents > 0 {
                        LenNext::Next(StateNext::Parent)
                    } else {
                        LenNext::Next(StateNext::Chunk {
                            size: len as usize,
                            finalization: Root(len),
                        })
                    }
                }
                (Some(len), true) => LenNext::Len(len),
            }
        }

        fn is_eof(&self) -> bool {
            match self.len_next() {
                LenNext::Len(len) => self.next_chunk >= count_chunks(len),
                LenNext::Next(_) => false,
            }
        }

        pub(crate) fn read_next(&self) -> Option<StateNext> {
            let content_len = match self.len_next() {
                LenNext::Next(next) => return Some(next),
                LenNext::Len(len) => len,
            };
            if self.is_eof() {
                None
            } else if self.upcoming_parents > 0 {
                Some(StateNext::Parent)
            } else {
                Some(StateNext::Chunk {
                    size: chunk_size(self.next_chunk, content_len),
                    finalization: NotRoot,
                })
            }
        }

        // The buffered_bytes argument tells the parser how many content bytes immediately prior to
        // the next_chunk the caller is storing. (This is generally exactly the previous chunk, but
        // in a multi-threaded reader it could be the size of a larger pipeline of buffers.) If the
        // seek is into that region, it will tell the caller to just adjust its buffer start,
        // rather than seeking backwards and repeating reads.
        //
        // Returns (maybe buffer start, maybe seek, maybe state next). A None buffer start means
        // that the buffer needs to be purged (such that subsequent calls to seek would pass
        // buffered_bytes=0), otherwise the buffer should be retained and its cursor set to the new
        // start value. A non-None seek value means that the caller should execute a seek on the
        // underlying reader, with the offset measured from the start. No state next means the seek
        // is done, though the first two arguments still need to be respected first.
        pub(crate) fn seek_next(
            &mut self,
            seek_to: u64,
            buffered_bytes: usize,
        ) -> (Option<usize>, Option<u128>, Option<StateNext>) {
            let content_len = match self.len_next() {
                LenNext::Next(next) => {
                    debug_assert_eq!(0, buffered_bytes);
                    return (None, None, Some(next));
                }
                LenNext::Len(len) => len,
            };

            // Cap the seek_to at the content_len. This simplifies buffer adjustment and EOF
            // checking, since content_len is the max position().
            let seek_to = cmp::min(seek_to, content_len);

            // If the seek can be handled with just a buffer adjustment, do that. This includes
            // seeks into the middle of a chunk we just read, possibly as a result of the LenNext
            // above.
            let leftmost_buffered = self.position() - buffered_bytes as u64;
            if leftmost_buffered <= seek_to && seek_to <= self.position() {
                let new_buf_start = (seek_to - leftmost_buffered) as usize;
                return (Some(new_buf_start), None, None);
            }

            // If the seek is further to our left than just a buffer adjustment, reset the whole
            // parser and stack, so that we can re-seek from the beginning. Note that this is one
            // of the two cases (along with popping subtrees from the stack below) where the caller
            // will need to execute an actual seek in the underlying stream.
            let mut maybe_seek_offset = None;
            let leftmost_buffered = self.position() - buffered_bytes as u64;
            if seek_to < leftmost_buffered {
                self.reset_to_root();
                maybe_seek_offset = Some(self.encoded_offset);
            }

            loop {
                // If the target is the current position, the seek is finished. This includes EOF.
                if seek_to == self.position() {
                    return (None, maybe_seek_offset, None);
                }

                // If the target is within the current subtree, we either need to descend in the
                // tree or read the next chunk for a buffer adjustment.
                if seek_to < self.subtree_end() {
                    if self.upcoming_parents > 0 {
                        return (None, maybe_seek_offset, Some(StateNext::Parent));
                    } else {
                        debug_assert!(self.subtree_size() <= CHUNK_SIZE as u64);
                        return (
                            None,
                            maybe_seek_offset,
                            Some(StateNext::Chunk {
                                size: self.subtree_size() as usize,
                                finalization: self.finalization(),
                            }),
                        );
                    }
                }

                // Otherwise jump out of the current subtree and loop.
                self.stack_depth -= 1;
                self.encoded_offset += encoded_subtree_size(self.subtree_size());
                maybe_seek_offset = Some(self.encoded_offset);
                self.next_chunk += count_chunks(self.subtree_size());
                if !self.is_eof() {
                    // upcoming_parents is only meaningful if we're before EOF.
                    self.upcoming_parents = pre_order_parent_nodes(self.next_chunk, content_len);
                }
            }
        }

        pub(crate) fn feed_header(&mut self, header: &[u8; HEADER_SIZE]) {
            assert!(self.content_len.is_none(), "second call to feed_header");
            let content_len = hash::decode_len(header);
            self.content_len = Some(content_len);
            self.reset_to_root();
        }

        pub(crate) fn advance_parent(&mut self) {
            assert!(
                self.upcoming_parents > 0,
                "too many calls to advance_parent"
            );
            self.upcoming_parents -= 1;
            self.encoded_offset += PARENT_SIZE as u128;
            self.length_verified = true;
            self.at_root = false;
            self.stack_depth += 1;
        }

        pub(crate) fn advance_chunk(&mut self) {
            assert_eq!(
                0, self.upcoming_parents,
                "advance_chunk with non-zero upcoming parents"
            );
            self.encoded_offset += self.subtree_size() as u128;
            self.next_chunk += 1;
            self.length_verified = true;
            self.at_root = false;
            self.stack_depth -= 1;
            // Note that is_eof() depends on the flag changes we just made.
            if !self.is_eof() {
                // upcoming_parents is only meaningful if we're before EOF.
                self.upcoming_parents =
                    pre_order_parent_nodes(self.next_chunk, self.content_len.unwrap());
            }
        }

        fn subtree_size(&self) -> u64 {
            debug_assert!(!self.is_eof());
            let content_len = self.content_len.unwrap();
            // The following should avoid overflow even if content_len is 2^64-1. upcoming_parents was
            // computed from the chunk count, and as long as chunks are larger than 1 byte, it will
            // always be less than 64.
            let max_subtree_size = (1 << self.upcoming_parents) * CHUNK_SIZE as u64;
            cmp::min(content_len - self.position(), max_subtree_size)
        }

        fn subtree_end(&self) -> u64 {
            debug_assert!(!self.is_eof());
            self.position() + self.subtree_size()
        }
    }

    #[derive(Clone, Copy, Debug)]
    pub(crate) enum StateNext {
        Header,
        Parent,
        Chunk {
            size: usize,
            finalization: Finalization,
        },
    }

    #[derive(Clone, Copy, Debug)]
    pub(crate) enum LenNext {
        Len(u64),
        Next(StateNext),
    }
}

/// An incremental slice extractor, which reads encoded bytes and produces a slice.
///
/// `SliceExtractor` supports reading both the combined and outboard encoding, depending on which
/// constructor you use. Though to be clear, there's no such thing as an "outboard slice" per se.
/// Slices always include subtree hashes inline with the content, as a combined encoding does.
///
/// Note that slices always split the encoding at chunk boundaries. Bao's chunk size is currently
/// 4096 bytes, so using `slice_start` and `slice_len` arguments that are multiples of 4096 avoids
/// wasting space. Also, slicing when there's less than a full chunk of input is pointless.
///
/// Extracting a slice doesn't re-hash any of the bytes. As a result, it's fast compared to
/// decoding. You can quickly convert an outboard encoding to a combined encoding by "extracting" a
/// slice with a `slice_start` of zero and a `slice_len` equal to the original input length.
///
/// See the `decode` module for decoding slices.
///
/// # Example
///
/// ```
/// # fn main() -> Result<(), Box<std::error::Error>> {
/// use std::io::prelude::*;
///
/// let input = vec![0; 1_000_000];
/// let (_, encoded) = bao::encode::encode_to_vec(&input);
/// // These parameters are multiples of the chunk size, which avoids unnecessary overhead.
/// let slice_start = 65536;
/// let slice_len = 8192;
/// let encoded_cursor = std::io::Cursor::new(&encoded);
/// let mut extractor = bao::encode::SliceExtractor::new(encoded_cursor, slice_start, slice_len);
/// let mut slice = Vec::new();
/// extractor.read_to_end(&mut slice)?;
///
/// // The slice includes some overhead to store the necessary subtree hashes, but it's not much.
/// assert_eq!(8712, slice.len());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "std")]
pub struct SliceExtractor<T: Read + Seek, O: Read + Seek> {
    input: T,
    outboard: Option<O>,
    slice_start: u64,
    slice_len: u64,
    slice_bytes_read: u64,
    previous_chunk_size: usize,
    parser: parse_state::ParseState,
    buf: [u8; CHUNK_SIZE],
    buf_start: usize,
    buf_end: usize,
    seek_done: bool,
}

#[cfg(feature = "std")]
impl<T: Read + Seek> SliceExtractor<T, T> {
    /// Create a new `SliceExtractor` to read from a combined encoding. Note that `slice_start` and
    /// `slice_len` are with respect to the *content* of the encoding, that is, the *original*
    /// input bytes. This corresponds to `bao slice slice_start slice_len`.
    pub fn new(input: T, slice_start: u64, slice_len: u64) -> Self {
        // TODO: normalize zero-length slices?
        Self::new_inner(input, None, slice_start, slice_len)
    }
}

#[cfg(feature = "std")]
impl<T: Read + Seek, O: Read + Seek> SliceExtractor<T, O> {
    /// Create a new `SliceExtractor` to read from an unmodified input file and an outboard
    /// encoding of that same file (see `Writer::new_outboard`). As with `SliceExtractor::new`,
    /// `slice_start` and `slice_len` are with respect to the *content* of the encoding, that is,
    /// the *original* input bytes. This corresponds to `bao slice slice_start slice_len
    /// --outboard`.
    pub fn new_outboard(input: T, outboard: O, slice_start: u64, slice_len: u64) -> Self {
        Self::new_inner(input, Some(outboard), slice_start, slice_len)
    }

    fn new_inner(input: T, outboard: Option<O>, slice_start: u64, slice_len: u64) -> Self {
        Self {
            input,
            outboard,
            slice_start,
            slice_len,
            slice_bytes_read: 0,
            previous_chunk_size: 0,
            parser: parse_state::ParseState::new(),
            buf: [0; CHUNK_SIZE],
            buf_start: 0,
            buf_end: 0,
            seek_done: false,
        }
    }

    fn buf_len(&self) -> usize {
        self.buf_end - self.buf_start
    }

    // Note that unlike the regular Reader, the header bytes go into the output buffer.
    fn read_header(&mut self) -> io::Result<()> {
        let header = array_mut_ref!(self.buf, 0, HEADER_SIZE);
        if let Some(ref mut outboard) = self.outboard {
            outboard.read_exact(header)?;
        } else {
            self.input.read_exact(header)?;
        }
        self.buf_start = 0;
        self.buf_end = HEADER_SIZE;
        self.parser.feed_header(header);
        Ok(())
    }

    // Note that unlike the regular Reader, the parent bytes go into the output buffer.
    fn read_parent(&mut self) -> io::Result<()> {
        let parent = array_mut_ref!(self.buf, 0, PARENT_SIZE);
        if let Some(ref mut outboard) = self.outboard {
            outboard.read_exact(parent)?;
        } else {
            self.input.read_exact(parent)?;
        }
        self.buf_start = 0;
        self.buf_end = PARENT_SIZE;
        self.parser.advance_parent();
        Ok(())
    }

    fn read_chunk(&mut self, size: usize) -> io::Result<()> {
        debug_assert_eq!(0, self.buf_len(), "read_chunk with nonempty buffer");
        let chunk = &mut self.buf[..size];
        self.input.read_exact(chunk)?;
        self.buf_start = 0;
        self.buf_end = size;
        // After reading a chunk, increment slice_bytes_read. This will stop the read loop once
        // we've read everything the caller asked for. Note that if the seek indicates we should
        // skip partway into a chunk, we'll decrement slice_bytes_read to account for the skip.
        self.slice_bytes_read += size as u64;
        self.parser.advance_chunk();
        // Record the size of the chunk we just read. Unlike the other readers, because this one
        // keeps header and parent bytes in the output buffer, we can't just rely on buf_end.
        self.previous_chunk_size = size;
        Ok(())
    }

    fn make_progress_and_buffer_output(&mut self) -> io::Result<()> {
        // If we haven't finished the seek yet, do a step of that. That will buffer some output,
        // unless we just finished seeking.
        if !self.seek_done {
            // Also note that this reader, unlike the others, has to account for
            // previous_chunk_size separately from buf_end.
            let (maybe_start, maybe_seek_offset, maybe_next) = self
                .parser
                .seek_next(self.slice_start, self.previous_chunk_size);
            if let Some(start) = maybe_start {
                // If the seek needs us to skip into the middle of the buffer, we don't actually
                // skip bytes, because the recipient will need everything for decoding. However, we
                // decrement slice_bytes_read, so that the skipped bytes don't count against what
                // the caller asked for.
                self.slice_bytes_read -= start as u64;
            } else {
                // Seek never needs to clear the buffer, because there's only one seek.
                debug_assert_eq!(0, self.buf_len());
                debug_assert_eq!(0, self.previous_chunk_size);
            }
            if let Some(offset) = maybe_seek_offset {
                if let Some(ref mut outboard) = self.outboard {
                    // As with Reader in the outboard case, the outboard extractor has to seek both of
                    // its inner readers. The content position of the state goes into the content
                    // reader, and the rest of the reported seek offset goes into the outboard reader.
                    let content_position = self.parser.position();
                    self.input.seek(io::SeekFrom::Start(content_position))?;
                    let outboard_offset = offset - content_position as u128;
                    outboard.seek(io::SeekFrom::Start(cast_offset(outboard_offset)?))?;
                } else {
                    self.input.seek(io::SeekFrom::Start(cast_offset(offset)?))?;
                }
            }
            match maybe_next {
                Some(StateNext::Header) => return self.read_header(),
                Some(StateNext::Parent) => return self.read_parent(),
                Some(StateNext::Chunk {
                    size,
                    finalization: _,
                }) => return self.read_chunk(size),
                None => self.seek_done = true, // Fall through to read.
            }
        }

        // If we haven't finished the read yet, do a step of that. If we've already supplied all
        // the requested bytes, however, don't read any more.
        if self.slice_bytes_read < self.slice_len {
            match self.parser.read_next() {
                Some(StateNext::Header) => unreachable!(),
                Some(StateNext::Parent) => return self.read_parent(),
                Some(StateNext::Chunk {
                    size,
                    finalization: _,
                }) => return self.read_chunk(size),
                None => {} // EOF
            }
        }

        Ok(())
    }
}

#[cfg(feature = "std")]
impl<T: Read + Seek, O: Read + Seek> Read for SliceExtractor<T, O> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // If we don't have any output ready to go, try to read more.
        if self.buf_len() == 0 {
            self.make_progress_and_buffer_output()?;
        }

        // Unless we're at EOF, the buffer either already had some bytes or just got refilled.
        // Return as much as we can from it.
        let n = cmp::min(buf.len(), self.buf_len());
        buf[..n].copy_from_slice(&self.buf[self.buf_start..][..n]);
        self.buf_start += n;
        Ok(n)
    }
}

#[cfg(feature = "std")]
pub(crate) fn cast_offset(offset: u128) -> io::Result<u64> {
    if offset > u64::max_value() as u128 {
        Err(io::Error::new(
            io::ErrorKind::Other,
            "seek offset overflowed u64",
        ))
    } else {
        Ok(offset as u64)
    }
}

#[cfg(test)]
mod test {
    extern crate tempfile;

    use super::*;
    use decode::make_test_input;
    use std::io::Cursor;

    #[test]
    fn test_encoded_size() {
        for &case in hash::TEST_CASES {
            let input = make_test_input(case);
            let (_, encoded) = encode_to_vec(&input);
            assert_eq!(encoded.len() as u128, encoded_size(case as u64));
            assert_eq!(encoded.len(), encoded.capacity());
            assert_eq!(
                encoded.len() as u128,
                case as u128 + outboard_size(case as u64)
            );
        }
    }

    #[test]
    fn test_encode() {
        for &case in hash::TEST_CASES {
            println!("case {}", case);
            let input = make_test_input(case);
            let expected_hash = hash::hash(&input);
            let (to_vec_hash, output) = encode_to_vec(&input);
            assert_eq!(expected_hash, to_vec_hash);

            let mut serial_output = vec![0; encoded_subtree_size(case as u64) as usize];
            let serial_hash = encode_recurse(&input, &mut serial_output, Root(case as u64));
            assert_eq!(expected_hash, serial_hash);
            assert_eq!(&output[HEADER_SIZE..], &*serial_output);

            let mut parallel_output = vec![0; encoded_subtree_size(case as u64) as usize];
            let parallel_hash =
                encode_recurse_rayon(&input, &mut parallel_output, Root(case as u64));
            assert_eq!(expected_hash, parallel_hash);
            assert_eq!(&output[HEADER_SIZE..], &*parallel_output);

            let mut highlevel_output = vec![0; encoded_size(case as u64) as usize];
            let highlevel_hash = encode(&input, &mut highlevel_output);
            assert_eq!(expected_hash, highlevel_hash);
            assert_eq!(output, highlevel_output);

            let mut highlevel_in_place_output = input.clone();
            highlevel_in_place_output.resize(encoded_size(case as u64) as usize, 0);
            let highlevel_in_place_hash = encode_in_place(&mut highlevel_in_place_output, case);
            assert_eq!(expected_hash, highlevel_in_place_hash);
            assert_eq!(output, highlevel_in_place_output);

            let mut writer_output = Vec::new();
            {
                let mut writer = Writer::new(Cursor::new(&mut writer_output));
                writer.write_all(&input).unwrap();
                let writer_hash = writer.finish().unwrap();
                assert_eq!(expected_hash, writer_hash);
            }
            assert_eq!(output, writer_output);
        }
    }

    #[test]
    fn test_outboard_encode() {
        for &case in hash::TEST_CASES {
            println!("case {}", case);
            let input = make_test_input(case);
            let expected_hash = hash::hash(&input);
            let (to_vec_hash, outboard) = encode_outboard_to_vec(&input);
            assert_eq!(expected_hash, to_vec_hash);

            let mut serial_output = vec![0; outboard_subtree_size(case as u64) as usize];
            let serial_hash =
                encode_outboard_recurse(&input, &mut serial_output, Root(case as u64));
            assert_eq!(expected_hash, serial_hash);
            assert_eq!(&outboard[HEADER_SIZE..], &*serial_output);

            let mut parallel_outboard = vec![0; outboard_subtree_size(case as u64) as usize];
            let parallel_hash =
                encode_outboard_recurse_rayon(&input, &mut parallel_outboard, Root(case as u64));
            assert_eq!(expected_hash, parallel_hash);
            assert_eq!(&outboard[HEADER_SIZE..], &*parallel_outboard);

            let mut highlevel_outboard = vec![0; outboard_size(case as u64) as usize];
            let highlevel_hash = encode_outboard(&input, &mut highlevel_outboard);
            assert_eq!(expected_hash, highlevel_hash);
            assert_eq!(outboard, highlevel_outboard);

            let mut writer_outboard = Vec::new();
            {
                let mut writer = Writer::new_outboard(Cursor::new(&mut writer_outboard));
                writer.write_all(&input).unwrap();
                let writer_hash = writer.finish().unwrap();
                assert_eq!(expected_hash, writer_hash);
            }
            assert_eq!(outboard, writer_outboard);
        }
    }

    // This is another way to calculate the number of parent nodes, which takes longer but is less
    // magical. We use it for testing below.
    fn make_pre_post_list(total_chunks: u64) -> Vec<(u8, u8)> {
        fn recurse(start: u64, size: u64, answers: &mut Vec<(u8, u8)>) {
            assert!(size > 0);
            if size == 1 {
                return;
            }
            answers[start as usize].0 += 1;
            answers[(start + size - 1) as usize].1 += 1;
            let split = hash::largest_power_of_two_leq(size - 1);
            recurse(start, split, answers);
            recurse(start + split, size - split, answers);
        }
        let mut answers = vec![(0, 0); total_chunks as usize];
        recurse(0, total_chunks, &mut answers);
        answers
    }

    // Sanity check the helper above.
    #[test]
    fn test_make_pre_post_list() {
        assert_eq!(make_pre_post_list(1), vec![(0, 0)]);
        assert_eq!(make_pre_post_list(2), vec![(1, 0), (0, 1)]);
        assert_eq!(make_pre_post_list(3), vec![(2, 0), (0, 1), (0, 1)]);
        assert_eq!(make_pre_post_list(4), vec![(2, 0), (0, 1), (1, 0), (0, 2)]);
        assert_eq!(
            make_pre_post_list(5),
            vec![(3, 0), (0, 1), (1, 0), (0, 2), (0, 1)]
        );
    }

    #[test]
    fn test_parent_nodes() {
        for total_chunks in 1..100 {
            let content_len = total_chunks * CHUNK_SIZE as u64;
            let pre_post_list = make_pre_post_list(total_chunks);
            for chunk in 0..total_chunks {
                let (expected_pre, expected_post) = pre_post_list[chunk as usize];
                let pre = pre_order_parent_nodes(chunk, content_len);
                let post = if chunk < total_chunks - 1 {
                    post_order_parent_nodes_nonfinal(chunk)
                } else {
                    post_order_parent_nodes_final(chunk)
                };
                assert_eq!(
                    expected_pre, pre,
                    "incorrect pre-order parent nodes for chunk {} of total {}",
                    chunk, total_chunks
                );
                assert_eq!(
                    expected_post, post,
                    "incorrect post-order parent nodes for chunk {} of total {}",
                    chunk, total_chunks
                );
            }
        }
    }
}