floem_editor_core/
line_ending.rs

1use std::{iter::Peekable, ops::Range};
2
3use lapce_xi_rope::{DeltaBuilder, Rope, RopeDelta};
4use memchr::{memchr, memchr2};
5use std::sync::LazyLock;
6
// Cached ropes for the two line endings, so replacements clone an existing rope
// rather than building one from a string each time.

/// Rope containing the `\r\n` line ending.
static CR_LF: LazyLock<Rope> = LazyLock::new(|| Rope::from("\r\n"));
/// Rope containing the `\n` line ending.
static LF: LazyLock<Rope> = LazyLock::new(|| Rope::from("\n"));
10
/// The line ending that text is normalized to.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LineEnding {
    /// `\r\n` Windows
    CrLf,
    /// `\n` Unix
    Lf,
}
18impl LineEnding {
19    /// Replace the line endings (`\n`, `\r\n`, `\r`) used in `text` with the line ending named by
20    /// `self`.
21    pub fn normalize(self, text: &Rope) -> Rope {
22        self.normalize_delta(text)
23            .map(|d| d.apply(text))
24            .unwrap_or_else(|| text.clone())
25    }
26
27    pub fn normalize_delta(self, text: &Rope) -> Option<RopeDelta> {
28        let mut builder = DeltaBuilder::new(text.len());
29
30        let le = if self == LineEnding::Lf {
31            LF.clone()
32        } else {
33            CR_LF.clone()
34        };
35
36        let mut had_entries = false;
37        for (range, kind) in FullLeChunkSearch::new(text.iter_chunks(..)) {
38            had_entries = true;
39            match kind {
40                LeChunkKind::CrLf => {
41                    if self == LineEnding::Lf {
42                        builder.replace(range, LF.clone());
43                    }
44                }
45                LeChunkKind::Lf => {
46                    if self == LineEnding::CrLf {
47                        builder.replace(range, CR_LF.clone());
48                    }
49                }
50                LeChunkKind::Cr => {
51                    builder.replace(range, le.clone());
52                }
53            }
54        }
55
56        if had_entries {
57            let delta = builder.build();
58            Some(delta)
59        } else {
60            None
61        }
62    }
63
64    /// Only replace the carriage return line-endings.
65    pub fn normalize_limited(self, text: &Rope) -> Rope {
66        let mut builder = DeltaBuilder::new(text.len());
67
68        let le = if self == LineEnding::Lf {
69            LF.clone()
70        } else {
71            CR_LF.clone()
72        };
73
74        let mut had_entries = false;
75        for offset in LoneCrChunkSearch::new(text.iter_chunks(..)) {
76            had_entries = true;
77            builder.replace(offset..offset + 1, le.clone());
78        }
79
80        if had_entries {
81            let delta = builder.build();
82            delta.apply(text)
83        } else {
84            text.clone()
85        }
86    }
87
88    /// Get the name of the line ending
89    pub fn as_str(&self) -> &'static str {
90        match self {
91            LineEnding::CrLf => "CRLF",
92            LineEnding::Lf => "LF",
93        }
94    }
95}
96
/// The result of inspecting a text to find out which line ending it uses.
#[derive(Debug, Clone, Copy)]
pub enum LineEndingDetermination {
    /// Only `\r\n` line endings were detected.
    CrLf,
    /// Only `\n` line endings were detected.
    Lf,
    /// Both kinds were detected, or a `\r` that is not directly followed by `\n` was found.
    Mixed,
    /// No line ending was found.
    Unknown,
}
104impl LineEndingDetermination {
105    // TODO: should we just do a simpler routine of checking the first few lines?
106    // Based off of xi-rope's line-ending determination logic
107    pub fn determine(text: &Rope) -> Self {
108        let mut crlf = false;
109        let mut lf = false;
110
111        for chunk in text.iter_chunks(..) {
112            match LineEndingDetermination::determine_str(chunk) {
113                LineEndingDetermination::CrLf => crlf = true,
114                LineEndingDetermination::Lf => lf = true,
115                LineEndingDetermination::Mixed => {
116                    return LineEndingDetermination::Mixed;
117                }
118                LineEndingDetermination::Unknown => {}
119            }
120        }
121
122        match (crlf, lf) {
123            (true, true) => LineEndingDetermination::Mixed,
124            (true, false) => LineEndingDetermination::CrLf,
125            (false, true) => LineEndingDetermination::Lf,
126            (false, false) => LineEndingDetermination::Unknown,
127        }
128    }
129
130    fn determine_str(chunk: &str) -> LineEndingDetermination {
131        let bytes = chunk.as_bytes();
132        let newline = memchr2(b'\n', b'\r', bytes);
133        match newline {
134            Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => {
135                LineEndingDetermination::CrLf
136            }
137            Some(x) if bytes[x] == b'\n' => LineEndingDetermination::Lf,
138            Some(_) => LineEndingDetermination::Mixed,
139            None => LineEndingDetermination::Unknown,
140        }
141    }
142
143    pub fn unwrap_or(self, le: LineEnding) -> LineEnding {
144        match self {
145            LineEndingDetermination::CrLf => LineEnding::CrLf,
146            LineEndingDetermination::Lf => LineEnding::Lf,
147            LineEndingDetermination::Mixed | LineEndingDetermination::Unknown => le,
148        }
149    }
150}
151
/// The kind of line ending yielded by `FullLeChunkSearch`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum LeChunkKind {
    /// `\r\n`
    CrLf,
    /// `\n`
    Lf,
    /// A `\r` that is not followed by `\n`
    Cr,
}
158
/// Line ending chunk searcher
///
/// Iterates over a sequence of text chunks and yields the byte range and kind of every line
/// ending (`\n`, `\r\n`, or lone `\r`), with offsets counted from the start of the first chunk.
struct FullLeChunkSearch<'a, I: Iterator<Item = &'a str>> {
    /// Total length of the chunks that have already been consumed, i.e. the offset at which
    /// the current chunk starts
    offset: usize,
    /// Offset within the chunk itself
    chunk_pos: usize,
    /// Remaining chunks; the peeked element is the current chunk
    chunks: Peekable<I>,
}
166impl<'a, I: Iterator<Item = &'a str>> FullLeChunkSearch<'a, I> {
167    fn new(chunks: I) -> Self {
168        Self {
169            offset: 0,
170            chunk_pos: 0,
171            chunks: chunks.peekable(),
172        }
173    }
174
175    /// Get the current chunk, updating the current chunk if needed
176    fn get_chunk(&mut self) -> Option<&'a str> {
177        let chunk = self.chunks.peek()?;
178        if self.chunk_pos >= chunk.len() {
179            self.advance_chunk();
180            Some(*self.chunks.peek()?)
181        } else {
182            Some(chunk)
183        }
184    }
185
186    fn advance_chunk(&mut self) -> Option<()> {
187        let chunk = self.chunks.next()?;
188        self.offset += chunk.len();
189        self.chunk_pos = 0;
190
191        Some(())
192    }
193}
194impl<'a, I: Iterator<Item = &'a str>> Iterator for FullLeChunkSearch<'a, I> {
195    type Item = (Range<usize>, LeChunkKind);
196
197    fn next(&mut self) -> Option<Self::Item> {
198        let chunk = self.get_chunk()?;
199
200        let bytes = &chunk.as_bytes()[self.chunk_pos..];
201
202        let newline = memchr2(b'\n', b'\r', bytes);
203        match newline {
204            // CrLf
205            Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => {
206                let start = self.offset + self.chunk_pos + x;
207                let end = start + 2;
208
209                self.chunk_pos += x + 2;
210                Some((start..end, LeChunkKind::CrLf))
211            }
212            // Lf
213            Some(x) if bytes[x] == b'\n' => {
214                let start = self.offset + self.chunk_pos + x;
215                let end = start + 1;
216
217                self.chunk_pos += x + 1;
218                Some((start..end, LeChunkKind::Lf))
219            }
220            Some(x) => {
221                // Typically this only occurs for a lone `\r`.
222                // However, we need to handle the case where the `\r` is the last character in the
223                // chunk whilst the next chunk starts with a `\n`.
224                assert_eq!(bytes[x], b'\r');
225
226                let start = self.offset + self.chunk_pos + x;
227                self.chunk_pos += x + 1;
228
229                let v = if self.chunk_pos == chunk.len() {
230                    if let Some(next_chunk) = self.get_chunk() {
231                        let next_chunk = &next_chunk.as_bytes()[self.chunk_pos..];
232                        if next_chunk.starts_with(b"\n") {
233                            self.chunk_pos += 1;
234                            Some((start..start + 2, LeChunkKind::CrLf))
235                        } else {
236                            None
237                        }
238                    } else {
239                        None
240                    }
241                } else {
242                    None
243                };
244
245                Some(v.unwrap_or_else(|| {
246                    // There is no \n so it is a lone `\r`
247                    // (Which is used in MacOS, or sometimes due to bugged line endings)
248                    let end = start + 1;
249                    (start..end, LeChunkKind::Cr)
250                }))
251            }
252            None => {
253                self.advance_chunk();
254                self.next()
255            }
256        }
257    }
258}
259
/// Iterator that searches for lone carriage returns ('\r') in chunks of text.
///
/// Yields the offset of every `\r` that is not followed by `\n`, counted from the start of the
/// first chunk.
struct LoneCrChunkSearch<'a, I: Iterator<Item = &'a str>> {
    /// Offset of the start of the current chunk
    offset: usize,
    /// Offset within the current chunk
    chunk_pos: usize,
    /// Remaining chunks; the peeked element is the current chunk
    chunks: Peekable<I>,
}
267
268impl<'a, I: Iterator<Item = &'a str>> LoneCrChunkSearch<'a, I> {
269    fn new(chunks: I) -> Self {
270        Self {
271            offset: 0,
272            chunk_pos: 0,
273            chunks: chunks.peekable(),
274        }
275    }
276
277    /// Get the current chunk, or if chunk pos is past the end of the chunk, then
278    /// advance to the next chunk and get it.
279    fn get_chunk(&mut self) -> Option<&'a str> {
280        let chunk = self.chunks.peek()?;
281        if self.chunk_pos >= chunk.len() {
282            self.advance_chunk();
283            Some(*self.chunks.peek()?)
284        } else {
285            Some(chunk)
286        }
287    }
288
289    fn advance_chunk(&mut self) -> Option<()> {
290        let chunk = self.chunks.next()?;
291        self.offset += chunk.len();
292        self.chunk_pos = 0;
293
294        Some(())
295    }
296}
297
298impl<'a, I: Iterator<Item = &'a str>> Iterator for LoneCrChunkSearch<'a, I> {
299    type Item = usize;
300
301    fn next(&mut self) -> Option<Self::Item> {
302        loop {
303            let chunk = self.get_chunk()?;
304
305            let bytes = &chunk.as_bytes()[self.chunk_pos..];
306
307            let newline = memchr(b'\r', bytes);
308            match newline {
309                Some(x) => {
310                    let offset = self.offset + self.chunk_pos + x;
311
312                    // Check if the next character is '\n' (indicating \r\n)
313                    self.chunk_pos += x + 1;
314                    if self.chunk_pos < chunk.len() && chunk.as_bytes()[self.chunk_pos] == b'\n' {
315                        // Skip \r\n sequences
316                        self.chunk_pos += 1;
317                    } else if let Some(chunk_b) = self.get_chunk() {
318                        let chunk_b = &chunk_b.as_bytes()[self.chunk_pos..];
319                        if chunk_b.starts_with(b"\n") {
320                            // Skip \r\n sequences across chunks
321                            self.chunk_pos += 1;
322                        } else {
323                            // Lone \r
324                            return Some(offset);
325                        }
326                    } else {
327                        // Lone \r at the end
328                        return Some(offset);
329                    }
330                }
331                None => {
332                    self.advance_chunk();
333                }
334            }
335        }
336    }
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize() {
        let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(
            normalized.slice_to_cow(..),
            "hello\r\nworld toast and jam\r\nthe end\r\nhi"
        );

        let text = Rope::from("\n");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\n");

        let text = Rope::from("\r\n");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\n");

        // `\r` is always normalized to the line ending of the file
        let text = Rope::from("\r");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\n");
        let normalized = LineEnding::Lf.normalize_limited(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");

        let text = Rope::from("\rtest");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\ntest");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\ntest");
        let normalized = LineEnding::Lf.normalize_limited(&text);
        assert_eq!(normalized.slice_to_cow(..), "\ntest");
    }

    #[test]
    fn normalize_without_line_endings() {
        // Text without any line ending needs no delta and is returned unchanged
        let text = Rope::from("abc");
        assert!(LineEnding::Lf.normalize_delta(&text).is_none());
        assert!(LineEnding::CrLf.normalize_delta(&text).is_none());
        assert_eq!(LineEnding::Lf.normalize(&text).slice_to_cow(..), "abc");
        assert_eq!(
            LineEnding::CrLf.normalize_limited(&text).slice_to_cow(..),
            "abc"
        );
    }

    #[test]
    fn normalize_limited_keeps_existing_endings() {
        // Only the lone `\r` is rewritten; `\r\n` and `\n` stay as they are
        let text = Rope::from("a\r\nb\rc\n");
        let normalized = LineEnding::Lf.normalize_limited(&text);
        assert_eq!(normalized.slice_to_cow(..), "a\r\nb\nc\n");
        let normalized = LineEnding::CrLf.normalize_limited(&text);
        assert_eq!(normalized.slice_to_cow(..), "a\r\nb\r\nc\n");
    }

    #[test]
    fn determine() {
        assert!(matches!(
            LineEndingDetermination::determine(&Rope::from("a\r\nb")),
            LineEndingDetermination::CrLf
        ));
        assert!(matches!(
            LineEndingDetermination::determine(&Rope::from("a\nb\n")),
            LineEndingDetermination::Lf
        ));
        assert!(matches!(
            LineEndingDetermination::determine(&Rope::from("abc")),
            LineEndingDetermination::Unknown
        ));
        // A `\r` that is not followed by `\n` is reported as mixed
        assert!(matches!(
            LineEndingDetermination::determine(&Rope::from("a\rb")),
            LineEndingDetermination::Mixed
        ));
    }

    #[test]
    fn determination_unwrap_or() {
        assert_eq!(
            LineEndingDetermination::CrLf.unwrap_or(LineEnding::Lf),
            LineEnding::CrLf
        );
        assert_eq!(
            LineEndingDetermination::Lf.unwrap_or(LineEnding::CrLf),
            LineEnding::Lf
        );
        assert_eq!(
            LineEndingDetermination::Mixed.unwrap_or(LineEnding::CrLf),
            LineEnding::CrLf
        );
        assert_eq!(
            LineEndingDetermination::Unknown.unwrap_or(LineEnding::Lf),
            LineEnding::Lf
        );
    }

    #[test]
    fn names() {
        assert_eq!(LineEnding::CrLf.as_str(), "CRLF");
        assert_eq!(LineEnding::Lf.as_str(), "LF");
    }

    #[test]
    fn chunk_search() {
        let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi");
        let c = FullLeChunkSearch::new(text.iter_chunks(..));
        assert_eq!(
            c.collect::<Vec<_>>(),
            vec![
                (5..7, LeChunkKind::CrLf),
                (26..27, LeChunkKind::Lf),
                (34..35, LeChunkKind::Lf),
            ]
        );
        let c = LoneCrChunkSearch::new(text.iter_chunks(..));
        assert_eq!(c.collect::<Vec<_>>(), Vec::new());

        // Test searching across different chunks of text
        // (Using a non-Rope iterator to simplify creation, however it should behave the same)
        let text = ["a\n", "\n5", "\r\ne\r", "\ntest\r", "\rv"];
        let multi_chunk = FullLeChunkSearch::new(text.into_iter());
        assert_eq!(
            multi_chunk.collect::<Vec<_>>(),
            vec![
                (1..2, LeChunkKind::Lf),
                (2..3, LeChunkKind::Lf),
                (4..6, LeChunkKind::CrLf),
                (7..9, LeChunkKind::CrLf),
                (13..14, LeChunkKind::Cr),
                (14..15, LeChunkKind::Cr),
            ]
        );

        let multi_chunk = LoneCrChunkSearch::new(text.into_iter());
        assert_eq!(multi_chunk.collect::<Vec<_>>(), vec![13, 14]);

        let text = ["\n\rb"];
        let chunks = FullLeChunkSearch::new(text.into_iter());
        assert_eq!(
            chunks.collect::<Vec<_>>(),
            vec![(0..1, LeChunkKind::Lf), (1..2, LeChunkKind::Cr)]
        );

        let text = ["\n\rb"];
        let chunks = LoneCrChunkSearch::new(text.into_iter());
        assert_eq!(chunks.collect::<Vec<_>>(), vec![1]);
    }
}