1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
use std::{mem::MaybeUninit, ptr};
use memchr::memchr;
use crate::{types, NormalizeChunk, NormalizeChunkResult, Result};
/// Marker type selecting LF (`\n`) line-ending normalization.
///
/// Its [`NormalizeChunk`] implementation rewrites every CRLF pair and every lone CR
/// to a single LF; all other bytes, including existing LFs, are copied unchanged.
pub struct LF;
impl NormalizeChunk for LF {
    /// `true` when the previous chunk ended with a CR.
    ///
    /// That CR has already been emitted as an LF, so an LF at the very start of the
    /// next chunk is the second half of a split CRLF and must be dropped.
    type State = bool;

    /// Returns the output capacity `normalize_chunk` requires for `chunk_size` input bytes.
    ///
    /// Normalizing to LF never grows the data (a CRLF shrinks to one byte and a lone CR
    /// is replaced one-for-one), so the input length is always sufficient regardless of
    /// the carried state or whether this is the last chunk.
    fn max_output_size_for_chunk(
        chunk_size: usize,
        _state: Option<&Self::State>,
        _is_last_chunk: bool,
    ) -> usize {
        chunk_size
    }

    /// Copies `input` into `output`, replacing every CRLF and every lone CR with an LF.
    ///
    /// `state` is the state returned for the previous chunk (`None` is treated as
    /// "not preceded by a CR"). The result carries the number of bytes written to the
    /// start of `output` and the state to pass along with the next chunk.
    ///
    /// # Errors
    ///
    /// Returns `Error::OutputBufferTooSmall` when `output` is shorter than
    /// `max_output_size_for_chunk(input.len(), ..)`.
    fn normalize_chunk(
        input: &[u8],
        output: &mut [MaybeUninit<u8>],
        state: Option<&Self::State>,
        is_last_chunk: bool,
    ) -> Result<NormalizeChunkResult<Self::State>> {
        let output_required = Self::max_output_size_for_chunk(input.len(), state, is_last_chunk);
        if output.len() < output_required {
            return Err(crate::Error::OutputBufferTooSmall {
                required: output_required,
            });
        }

        let preceded_by_cr = state.copied().unwrap_or(false);
        if input.is_empty() {
            // Nothing to write. An empty non-final chunk forwards the incoming state
            // unchanged; an empty final chunk clears it.
            return Ok(NormalizeChunkResult::new(
                0,
                Some(preceded_by_cr && !is_last_chunk),
            ));
        }

        // A leading LF that completes a CRLF split across chunks is dropped: its LF was
        // already written when the trailing CR of the previous chunk was handled.
        let skip = if preceded_by_cr && input.first() == Some(&types::LF) {
            1
        } else {
            0
        };

        // `scan_pos`: where the next search for a CR starts.
        // `read_pos`: first input byte that has not been copied to `output` yet.
        // `write_pos`: number of output bytes produced so far.
        //
        // Invariants relied on by the unsafe blocks below:
        //   read_pos <= scan_pos <= input.len()
        //   write_pos <= read_pos   (output never gets ahead of the consumed input)
        //   input.len() <= output.len()   (checked at the top)
        let mut scan_pos = skip;
        let mut read_pos = skip;
        let mut write_pos = 0;

        loop {
            let cr = match memchr(types::CR, &input[scan_pos..]) {
                Some(offset) => scan_pos + offset,
                None => {
                    // No further CR: flush the remaining input verbatim and finish.
                    let tail = input.len() - read_pos;
                    // SAFETY: `read_pos + tail == input.len()`, so the source range is
                    // in-bounds. `write_pos <= read_pos` gives
                    // `write_pos + tail <= input.len() <= output.len()`, so the
                    // destination range is in-bounds. `input` is a shared borrow and
                    // `output` an exclusive one, so the regions cannot overlap.
                    unsafe {
                        ptr::copy_nonoverlapping(
                            input.as_ptr().add(read_pos),
                            output.as_mut_ptr().add(write_pos).cast::<u8>(),
                            tail,
                        );
                    }
                    break Ok(NormalizeChunkResult::new(write_pos + tail, Some(false)));
                }
            };

            // Both CR cases start by flushing the untouched bytes in front of the CR.
            let pending = cr - read_pos;
            // SAFETY: `read_pos <= scan_pos <= cr < input.len()`, so `read_pos..cr` is
            // in-bounds for `input`. `write_pos <= read_pos` gives
            // `write_pos + pending <= cr < output.len()`. `input` and `output` cannot
            // overlap because one is a shared and the other an exclusive borrow.
            unsafe {
                ptr::copy_nonoverlapping(
                    input.as_ptr().add(read_pos),
                    output.as_mut_ptr().add(write_pos).cast::<u8>(),
                    pending,
                );
            }
            write_pos += pending;

            match input.get(cr + 1).copied() {
                Some(types::LF) => {
                    // CRLF: drop the CR and leave the LF in place; it is copied as part
                    // of the next run because `read_pos` points at it.
                    scan_pos = cr + 2;
                    read_pos = cr + 1;
                }
                next => {
                    // Lone CR (followed by some other byte, or last byte of the chunk):
                    // emit an LF in its place.
                    // SAFETY: `write_pos <= cr < input.len() <= output.len()` after the
                    // flush above, so the index is in-bounds.
                    unsafe {
                        *output.get_unchecked_mut(write_pos) = MaybeUninit::new(types::LF);
                    }
                    write_pos += 1;

                    if next.is_none() {
                        // The chunk ends with a CR. Unless this is the last chunk, tell
                        // the next call so it can drop a leading LF.
                        break Ok(NormalizeChunkResult::new(write_pos, Some(!is_last_chunk)));
                    }
                    scan_pos = cr + 1;
                    read_pos = cr + 1;
                }
            }
        }
    }
}