litcheck_core/text/
mod.rs

1mod display;
2pub use self::display::DisplayCommaSeparated;
3
4use std::borrow::Cow;
5
6use crate::range::Range;
7
/// Describes how a line in a file is terminated.
#[derive(Clone, Copy, Debug)]
pub enum LineEnding {
    /// Unix-style terminator: a single line feed (`\n`)
    Lf,
    /// Windows-style terminator: carriage return followed by line feed (`\r\n`)
    Crlf,
    /// EOF was reached without encountering any line terminator
    None,
}
18
19/// Represents a newline location and style
20#[derive(Debug, Copy, Clone)]
21pub struct Newline {
22    pub offset: usize,
23    pub ty: LineEnding,
24}
25impl Newline {
26    #[inline(always)]
27    pub fn offset(&self) -> usize {
28        self.offset
29    }
30
31    #[inline]
32    pub fn next_line_start(&self) -> usize {
33        match self.ty {
34            LineEnding::Lf => self.offset + 1,
35            LineEnding::Crlf => self.offset + 2,
36            LineEnding::None => self.offset,
37        }
38    }
39
40    #[inline]
41    pub fn is_crlf(&self) -> bool {
42        matches!(self.ty, LineEnding::Crlf)
43    }
44
45    /// Find the next [Newline] from the start of `buffer`
46    pub fn next(buffer: &[u8]) -> Self {
47        Self::next_from(buffer, 0)
48    }
49
50    /// Find the [Newline] from `offset` in `buffer`
51    ///
52    /// If there are no more newlines, the newline index will be equal
53    /// to `buffer.len()`, i.e. end of file.
54    pub fn next_from(buffer: &[u8], offset: usize) -> Self {
55        match memchr::memchr(b'\n', &buffer[offset..]) {
56            Some(index) => {
57                let index = offset + index;
58                if index > 0 {
59                    let line_end = index - 1;
60                    match unsafe { *buffer.get_unchecked(line_end) } {
61                        b'\r' => Self {
62                            ty: LineEnding::Crlf,
63                            offset: line_end,
64                        },
65                        _ => Self {
66                            ty: LineEnding::Lf,
67                            offset: index,
68                        },
69                    }
70                } else {
71                    Self {
72                        ty: LineEnding::Lf,
73                        offset: index,
74                    }
75                }
76            }
77            None => Self {
78                ty: LineEnding::None,
79                offset: buffer.len(),
80            },
81        }
82    }
83
84    /// Find the last [Newline] in `buffer`
85    pub fn prev(buffer: &[u8]) -> Self {
86        Self::prev_from(buffer, 0)
87    }
88
89    /// Find the last [Newline] in `buffer`, searching backwards from `offset`.
90    ///
91    /// If there are no newlines, the newline offset returned will be zero,
92    /// i.e. beginning of file. Same if the buffer is empty.
93    pub fn prev_from(buffer: &[u8], offset: usize) -> Self {
94        match memchr::memrchr(b'\n', &buffer[..offset]) {
95            Some(index) => {
96                if index > 0 {
97                    let prev_line_end = index - 1;
98                    match unsafe { *buffer.get_unchecked(prev_line_end) } {
99                        b'\r' => Self {
100                            ty: LineEnding::Crlf,
101                            offset: prev_line_end,
102                        },
103                        _ => Self {
104                            ty: LineEnding::Crlf,
105                            offset: index,
106                        },
107                    }
108                } else {
109                    Self {
110                        ty: LineEnding::Lf,
111                        offset: index,
112                    }
113                }
114            }
115            None => Self {
116                ty: LineEnding::None,
117                offset: 0,
118            },
119        }
120    }
121}
122
123pub fn find_next_lf_or_eof(buffer: &[u8], range: Range<usize>) -> Option<usize> {
124    memchr::memchr(b'\n', &buffer[range]).map(|idx| range.start + idx)
125}
126
127pub fn find_next_crlf_or_eof(buffer: &[u8], range: Range<usize>) -> Option<usize> {
128    match memchr::memchr(b'\n', &buffer[range]) {
129        Some(0) => None,
130        Some(index) => {
131            let line_end = index - 1;
132            match unsafe { *buffer.get_unchecked(line_end) } {
133                b'\r' => Some(line_end),
134                _ => None,
135            }
136        }
137        None => None,
138    }
139}
140
141pub fn find_prev_lf_or_eof(buffer: &[u8], range: Range<usize>) -> Option<usize> {
142    memchr::memrchr(b'\n', &buffer[range]).map(|idx| range.start + idx)
143}
144
145pub fn find_prev_crlf_or_eof(buffer: &[u8], range: Range<usize>) -> Option<usize> {
146    match memchr::memrchr(b'\n', &buffer[range]) {
147        Some(0) => None,
148        Some(index) => {
149            let line_end = index - 1;
150            match unsafe { *buffer.get_unchecked(line_end) } {
151                b'\r' => Some(line_end),
152                _ => None,
153            }
154        }
155        None => None,
156    }
157}
158
/// Reports whether `offset` lies on a UTF-8 codepoint boundary within `buffer`.
///
/// An offset equal to `buffer.len()` counts as a boundary; any offset beyond
/// that does not.
///
/// The result is only meaningful when `buffer` holds valid UTF-8; no guarantees
/// are made otherwise.
pub fn is_char_boundary(buffer: &[u8], offset: usize) -> bool {
    // NOTE: This is inlined from regex_automata/src/util/utf8.rs (is_boundary)
    // which is dual-licensed Apache 2.0/MIT.
    const TOP_TWO_BITS: u8 = 0b1100_0000;
    const CONTINUATION_TAG: u8 = 0b1000_0000;

    match buffer.get(offset).copied() {
        // UTF-8 continuation bytes are exactly those of the form `10xxxxxx`.
        // Every other byte is either ASCII (most significant bit clear) or a
        // leading byte (two most significant bits set), and so begins a
        // codepoint.
        Some(byte) => (byte & TOP_TWO_BITS) != CONTINUATION_TAG,
        // Out of range: only the position one past the final byte is a boundary
        None => offset == buffer.len(),
    }
}
177
178pub fn canonicalize_horizontal_whitespace(
179    s: Cow<'_, str>,
180    strict_whitespace: bool,
181) -> Cow<'_, str> {
182    if strict_whitespace {
183        return s;
184    }
185
186    if s.contains(is_non_canonical_horizontal_whitespace) {
187        Cow::Owned(s.replace(is_non_canonical_horizontal_whitespace, " "))
188    } else {
189        s
190    }
191}
192
/// Returns true for horizontal whitespace characters other than the ASCII space:
/// the horizontal tab, plus every member of the Unicode Space_Separator category
/// except U+0020 (the character being canonicalized to).
#[inline]
fn is_non_canonical_horizontal_whitespace(c: char) -> bool {
    matches!(
        c,
        '\t' | '\u{00A0}'
            | '\u{1680}'
            | '\u{2000}'..='\u{200A}'
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}'
    )
}