1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
use std::{mem::MaybeUninit, ptr};
use memchr::memchr2;
use crate::{
formats::{NormalizeChunk, NormalizeChunkResult},
types::{CR, LF},
Error, Result,
};
/// CRLF normalization format implementation.
///
/// Will convert all line endings that are not CRLF (i.e. LF or CR alone) into CRLF.
pub struct CRLF;
impl NormalizeChunk for CRLF {
type State = bool;
fn max_output_size_for_chunk(
chunk_size: usize,
_state: Option<&Self::State>,
is_last_chunk: bool,
) -> usize {
// Worst case: every byte in input needs conversion, plus one extra byte for the
// trailing CR from last chunk.
//
// Example worst-case scenarios:
// - input: "\n\n\n", preceded_by_cr: false, is_last_chunk: any -> output: "\r\n\r\n\r\n" (2n)
// - input: "\n\n\n", preceded_by_cr: true, is_last_chunk: any -> output: "\n\r\n\r\n" (2n - 1)
// - input: "\r\r\r", preceded_by_cr: false, is_last_chunk: false -> output: "\r\n\r\n\r" (2n - 1)
// - input: "\r\r\r", preceded_by_cr: true, is_last_chunk: false -> output: "\n\r\n\r\n\r" (2n)
// - input: "\r\r\r", preceded_by_cr: false, is_last_chunk: true -> output: "\r\n\r\n\r\n" (2n)
// - input: "\r\r\r", preceded_by_cr: true, is_last_chunk: true -> output: "\n\r\n\r\n\r\n" (2n + 1)
//
// So the only case where we need 2n + 1 bytes is when `preceded_by_cr` is true, the last byte is a `\r` and
// `is_last_chunk` is true. In all other cases, 2n bytes is sufficient. However, for simpliucity we'll only
// look at `is_last_chunk`. We could just require 2n + 1 bytes always, but that would be surprising for
// callers that intuitively expect 2n to be sufficient in all cases, or at least when not processing the
// last chunk.
chunk_size * 2 + usize::from(is_last_chunk)
}
fn normalize_chunk(
input: &[u8],
output: &mut [MaybeUninit<u8>],
state: Option<&Self::State>,
is_last_chunk: bool,
) -> Result<NormalizeChunkResult<Self::State>> {
let output_required = Self::max_output_size_for_chunk(input.len(), state, is_last_chunk);
if output.len() < output_required {
return Err(Error::OutputBufferTooSmall {
required: output_required,
});
}
let preceded_by_cr = state.copied().unwrap_or(false);
if input.is_empty() && !is_last_chunk {
// Special case: empty input and not last chunk
return Ok(NormalizeChunkResult::new(0, Some(preceded_by_cr)));
}
let mut scan_pos = 0;
let mut read_pos = 0;
let mut write_pos = 0;
if input.first() == Some(&LF) && preceded_by_cr {
// We found:
// - a LF preceeded by a CR from the previous chunk
scan_pos = 1;
} else if preceded_by_cr {
// We found:
// - not a LF preceeded by a CR from the previous chunk, or
// - empty input preceeded by a CR from the previous chunk
output[0] = MaybeUninit::new(LF);
write_pos = 1;
}
loop {
if let Some(i) = memchr2(CR, LF, &input[scan_pos..]).map(|i| i + scan_pos) {
// SAFETY: i is in-bounds because it was found by memchr2.
let c = unsafe { *input.get_unchecked(i) };
match (c, input.get(i + 1).copied()) {
(CR, Some(LF)) => {
// We found:
// - a CR followed by a LF
// Intentionally don't copy now — advance scan_pos to skip the CRLF
// so we'll include the CRLF in a later large bulk copy from read_pos.
scan_pos = i + 2;
}
(CR, Some(_)) | (LF, _) => {
// We found:
// - a LF not preceeded by a CR, or
// - a CR not followed by a LF and not at the last position
let bytes_now = i - read_pos;
// SAFETY: read_pos..i is in-bounds because i was found by memchr2 and we've
// established at the top that output is large enough for worst-case expansion.
unsafe {
ptr::copy_nonoverlapping(
input.as_ptr().add(read_pos),
output.as_mut_ptr().add(write_pos).cast::<u8>(),
bytes_now,
);
*output.get_unchecked_mut(write_pos + bytes_now) = MaybeUninit::new(CR);
*output.get_unchecked_mut(write_pos + bytes_now + 1) =
MaybeUninit::new(LF);
}
read_pos = i + 1;
scan_pos = read_pos;
write_pos += bytes_now + 2;
}
(CR, None) => {
// We found:
// - a CR at the last position
let bytes_now = input.len() - read_pos;
// SAFETY: read_pos..end is in-bounds because 0 <= read_pos <= end and we've
// established at the top that output is large enough for worst-case expansion.
unsafe {
ptr::copy_nonoverlapping(
input.as_ptr().add(read_pos),
output.as_mut_ptr().add(write_pos).cast::<u8>(),
bytes_now,
);
if is_last_chunk {
*output.get_unchecked_mut(write_pos + bytes_now) =
MaybeUninit::new(LF);
}
}
break Ok(NormalizeChunkResult::new(
write_pos + bytes_now + usize::from(is_last_chunk),
Some(!is_last_chunk),
));
}
_ => unreachable!("unreachable pattern match case"),
}
} else {
// We found:
// - the end of the input
let bytes_now = input.len() - read_pos;
// SAFETY: read_pos..end is in-bounds because 0 <= read_pos <= end and we've
// established at the top that output is large enough for worst-case expansion.
unsafe {
ptr::copy_nonoverlapping(
input.as_ptr().add(read_pos),
output.as_mut_ptr().add(write_pos).cast::<u8>(),
bytes_now,
);
}
break Ok(NormalizeChunkResult::new(
write_pos + bytes_now,
Some(false),
));
}
}
}
}