Skip to main content

duat_core/text/strs/
mod.rs

1//! The equivalent of [`str`] for Duat.
2//!
3//! This module defines the [`Strs`] struct, which is a type that,
4//! much like `str`, only shows up as `&Strs`. This is the only avenue
5//! of direct interaction that the end user will get in regards to the
6//! bytes of the [`Text`].
7//!
8//! The `Strs` are backed by an [`StrsBuf`] struct, which owns the
9//! allocation of the [`GapBuffer`] of the utf-8 text.
10//!
11//! [`Text`]: super::Text
12//! [`StrsBuf`]: buf::StrsBuf
13//! [`GapBuffer`]: gap_buf::GapBuffer
14use std::{ops::Range, sync::LazyLock};
15
16pub(crate) use crate::text::strs::buf::StrsBuf;
17use crate::{
18    opts::PrintOpts,
19    text::{Point, TextIndex, TextRange, strs::buf::assert_utf8_boundary},
20};
21
22mod buf;
23mod line_ranges;
24
25/// The [`str`] equivalent for Duat.
26///
27/// Due to Duat's use of a [gap buffer] to represent the [`Text`],
28/// this struct doesn't represent just one slice like a `&str` does.
29/// Instead, it represents two slices (either being possibly empty).
30///
31/// This is done by making use of a custom dynamically sized type,
32/// allowing for near identical utilization.
33///
34/// [gap buffer]: https://en.wikipedia.org/wiki/Gap_buffer
35/// [`Text`]: super::Text
36// I need it to be like this so it is a wide pointer, which can be
37// turned into a reference that can be used for indexing and std::ops::Deref.
38#[repr(transparent)]
39pub struct Strs(StrsDST);
40
41impl Strs {
42    /// Returns a new `Strs`.
43    pub(super) fn new(bytes: &StrsBuf, start: u32, len: u32) -> &Self {
44        // Safety: Since the ordering of fields shouldn't change, this should
45        // be fine.
46        let start_and_len = unsafe { std::mem::transmute::<[u32; 2], usize>([start, len]) };
47        let ptr = std::ptr::slice_from_raw_parts(bytes as *const StrsBuf, start_and_len);
48
49        // Safety: Since the address is valid, this should be converted to a
50        // quasi wide pointer, where the metadata is size-like, but encodes
51        // two values instead of one.
52        unsafe { &*(ptr as *const Self) }
53    }
54
55    ////////// Querying functions
56
57    /// The [`Point`] at the end of the `Strs`.
58    ///
59    /// This is the equivalent of `strs.range().start`.
60    pub fn start_point(&self) -> Point {
61        let formed = FormedStrs::new(self);
62        if formed.start == 0 {
63            Point::default()
64        } else {
65            let slices = unsafe {
66                let (s0, s1) = formed.buf.gapbuf.as_slices();
67                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
68            };
69            formed
70                .buf
71                .line_ranges
72                .point_by_key(formed.start as usize, |[b, _]| b, slices)
73                .unwrap_or_else(|| formed.buf.line_ranges.max(slices))
74        }
75    }
76
77    /// The [`Point`] at the end of the `Strs`.
78    ///
79    /// This is the equivalent of `strs.range().end`.
80    pub fn end_point(&self) -> Point {
81        let formed = FormedStrs::new(self);
82
83        let slices = unsafe {
84            let (s0, s1) = formed.buf.gapbuf.as_slices();
85            [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
86        };
87
88        let byte = formed.start as usize + formed.len as usize;
89        if byte == formed.buf.gapbuf.len() {
90            formed.buf.line_ranges.max(slices)
91        } else {
92            formed
93                .buf
94                .line_ranges
95                .point_by_key(byte, |[b, _]| b, slices)
96                .unwrap_or_else(|| formed.buf.line_ranges.max(slices))
97        }
98    }
99
100    /// The `char` at a given position.
101    ///
102    /// This position can either be a [`Point`] or a byte index. Will
103    /// return [`None`] if the position is greater than or equal to
104    /// `self.len()` or if it is not located in a utf8 character
105    /// boundary.
106    #[track_caller]
107    pub fn char_at(&self, p: impl TextIndex) -> Option<char> {
108        let formed = FormedStrs::new(self);
109        let range = formed
110            .buf
111            .gapbuf
112            .range(formed.start as usize..formed.start as usize + formed.len as usize);
113
114        if range
115            .get(p.to_byte_index())
116            .is_none_or(|b| utf8_char_width(*b) == 0)
117        {
118            return None;
119        }
120
121        let [s0, s1] = self.to_array();
122        Some(if p.to_byte_index() < s0.len() {
123            s0[p.to_byte_index()..].chars().next().unwrap()
124        } else {
125            s1[p.to_byte_index() - s0.len()..]
126                .chars()
127                .next()
128                .unwrap_or_else(|| panic!("{self:#?}"))
129        })
130    }
131
132    /// The [`Point`] corresponding to the byte position, 0 indexed.
133    ///
134    /// If the byte position would fall in between two characters
135    /// (because the first one comprises more than one byte), the
136    /// first character is chosen as the [`Point`] where the byte is
137    /// located.
138    ///
139    /// # Panics
140    ///
141    /// Will panic if `b` is greater than the length of the text.
142    #[inline(always)]
143    #[track_caller]
144    pub fn point_at_byte(&self, byte: usize) -> Point {
145        assert!(
146            byte <= self.len(),
147            "byte out of bounds: the len is {}, but the byte is {byte}",
148            self.len()
149        );
150
151        let formed = FormedStrs::new(self);
152
153        if byte == self.len() {
154            self.end_point()
155        } else if byte == 0 {
156            Point::default()
157        } else {
158            let slices = unsafe {
159                let (s0, s1) = formed.buf.gapbuf.as_slices();
160                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
161            };
162            formed
163                .buf
164                .line_ranges
165                .point_by_key(byte + formed.start as usize, |[b, _]| b, slices)
166                .unwrap()
167        }
168    }
169
170    /// The [`Point`] associated with the `c`th char.
171    ///
172    /// # Panics
173    ///
174    /// Will panic if `c` is greater than the number of chars in the
175    /// text.
176    #[inline(always)]
177    #[track_caller]
178    pub fn point_at_char(&self, char: usize) -> Point {
179        let end_point = self.end_point();
180        assert!(
181            char <= end_point.char(),
182            "char out of bounds: the len is {}, but the char is {char}",
183            end_point.char()
184        );
185
186        let formed = FormedStrs::new(self);
187
188        if char == end_point.char() {
189            end_point
190        } else {
191            let slices = unsafe {
192                let (s0, s1) = formed.buf.gapbuf.as_slices();
193                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
194            };
195
196            let start = formed
197                .buf
198                .line_ranges
199                .point_by_key(formed.start as usize, |[b, _]| b, slices)
200                .unwrap();
201
202            formed
203                .buf
204                .line_ranges
205                .point_by_key(start.char() + char, |[_, c]| c, slices)
206                .unwrap()
207        }
208    }
209
210    /// The [`Point`] where the `l`th line starts, 0 indexed.
211    ///
212    /// If `l == number_of_lines`, returns the last point of the
213    /// `Strs`.
214    ///
215    /// # Panics
216    ///
217    /// Will panic if the number `l` is greater than the number of
218    /// lines on the text.
219    #[inline(always)]
220    #[track_caller]
221    pub fn point_at_coords(&self, line: usize, column: usize) -> Point {
222        let end_point = self.end_point();
223        assert!(
224            line <= end_point.line(),
225            "line out of bounds: the len is {}, but the line is {line}",
226            end_point.line()
227        );
228
229        let formed = FormedStrs::new(self);
230
231        if line == end_point.line() {
232            end_point
233        } else {
234            let slices = unsafe {
235                let (s0, s1) = formed.buf.gapbuf.as_slices();
236                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
237            };
238            let line = {
239                let start = self.point_at_byte(formed.start as usize);
240                start.line() + line
241            };
242
243            let point = formed.buf.line_ranges.point_at_coords(line, column, slices);
244
245            if let Some(point) = point {
246                point
247            } else {
248                let next_line_start = if line + 1 == end_point.line() {
249                    end_point
250                } else {
251                    formed
252                        .buf
253                        .line_ranges
254                        .point_at_coords(line + 1, 0, slices)
255                        .unwrap()
256                };
257                next_line_start.rev('\n')
258            }
259        }
260    }
261
262    /// The `Strs` for the `n`th line.
263    ///
264    /// # Panics
265    ///
266    /// Panics if `n >= self.len().line()`
267    #[track_caller]
268    pub fn line(&self, n: usize) -> &Strs {
269        let end_point = self.end_point();
270        assert!(
271            n < end_point.line(),
272            "line out of bounds: the len is {}, but the line is {n}",
273            end_point.line()
274        );
275
276        let start = self.point_at_coords(n, 0);
277        let end = if n + 1 == end_point.line() {
278            end_point
279        } else {
280            self.point_at_coords(n + 1, 0)
281        };
282
283        &self[start..end]
284    }
285
286    /// Returns an `Strs` for the whole [`Text`] that this one belongs
287    /// to.
288    ///
289    /// You should use this function if you want to create an api that
290    /// requires the whole `Strs` to be used as an argument. You can
291    /// accept any `Strs`, then just transform it to the full one
292    /// using this function.
293    ///
294    /// If this `Strs` is from `Strs::empty`, then this will
295    /// return itself.
296    ///
297    /// [`Text`]: super::Text
298    pub fn full(&self) -> &Strs {
299        FormedStrs::new(self).buf
300    }
301
302    /// The last [`Point`] associated with a `char`
303    ///
304    /// This function takes into account the whole [`Text`], not just
305    /// the parts contained in the `Strs`. And since a `Text` can't be
306    /// empty, it will always return a [`Point`] associated with the
307    /// `\n` character.
308    ///
309    /// [`len`]: Self::len
310    /// [`Text`]: crate::text::Text
311    pub fn last_point(&self) -> Point {
312        let formed = FormedStrs::new(self);
313        formed.buf.end_point().rev('\n')
314    }
315
316    /// Tries to get a subslice of the `Strs`
317    ///
318    /// It will return [`None`] if the range does not start or end in
319    /// valid utf8 boundaries. If you expect the value to alway be
320    /// `Some`, consider using the index operator (`[]`) instead.
321    ///
322    /// This method is conceptually similar to [`&str::get`], but the
323    /// reference is to an `Strs` struct. This struct points to a
324    /// subslice of the `Strs`s, which is actually two slices,
325    /// given the internal gap buffer representation.
326    ///
327    /// This type is supposed to act nearly identically with the
328    /// [`str`] type, only differing in the fact that its maximum
329    /// lenght is [`u32::MAX`], not [`usize::MAX`].
330    ///
331    /// [`&str`]: str
332    /// [`Text`]: crate::text::Text
333    /// [range]: TextRange
334    /// [`&str::get`]: str::get
335    pub fn get(&self, range: impl TextRange) -> Option<&Strs> {
336        let formed = FormedStrs::new(self);
337        let range = {
338            let range = range.try_to_range(formed.len as usize)?;
339            range.start + formed.start as usize..range.end + formed.start as usize
340        };
341
342        let (s0, s1) = formed.buf.gapbuf.range(range.clone()).as_slices();
343
344        // Check if the slices match utf8 boundaries.
345        if s0.first().is_some_and(|b| utf8_char_width(*b) == 0)
346            || s1.first().is_some_and(|b| utf8_char_width(*b) == 0)
347            || formed
348                .buf
349                .gapbuf
350                .get(range.end)
351                .is_some_and(|b| utf8_char_width(*b) == 0)
352        {
353            return None;
354        }
355
356        Some(Strs::new(
357            formed.buf,
358            range.start as u32,
359            range.len() as u32,
360        ))
361    }
362
363    /// An empty `Strs`, useful in some circumstances.
364    ///
365    /// This is the equivalent of a literal `""`.
366    pub fn empty() -> &'static Self {
367        Strs::new(&EMPTY_BYTES, 0, 0)
368    }
369
370    /// Returns a struct of two `&[u8]` representing a [`TextRange`]
371    /// from the slice.
372    #[track_caller]
373    #[inline]
374    pub fn slices(&self, range: impl TextRange) -> [&[u8]; 2] {
375        let formed = FormedStrs::new(self);
376        let range = {
377            let range = range.to_range(formed.len as usize);
378            range.start + formed.start as usize..range.end + formed.start as usize
379        };
380
381        let (s0, s1) = formed.buf.gapbuf.range(range).as_slices();
382        [s0, s1]
383    }
384
385    /// Converts this `Strs` into an array of its two parts.
386    pub fn to_array(&self) -> [&str; 2] {
387        let formed = FormedStrs::new(self);
388        let range = formed.start as usize..(formed.start + formed.len) as usize;
389
390        let (s0, s1) = formed.buf.gapbuf.range(range).as_slices();
391
392        // Safety: The creation of a &Strs necessitates a valid utf8 range.
393        [unsafe { std::str::from_utf8_unchecked(s0) }, unsafe {
394            std::str::from_utf8_unchecked(s1)
395        }]
396    }
397
398    /// Returns an iterator over the lines in a given range
399    ///
400    /// The lines are inclusive, that is, it will iterate over the
401    /// whole line, not just the parts within the range.
402    ///
403    /// [range]: TextRange
404    pub fn lines(&self) -> Lines<'_> {
405        let formed = FormedStrs::new(self);
406        Lines::new(
407            formed.buf,
408            formed.start as usize,
409            (formed.start + formed.len) as usize,
410        )
411    }
412
413    /// Returns and [`Iterator`] over the [bytes] of this `Strs`
414    ///
415    /// [bytes]: u8
416    #[inline]
417    pub fn bytes(&self) -> impl DoubleEndedIterator<Item = u8> {
418        self.slices(..)
419            .into_iter()
420            .flat_map(|slice| slice.iter().copied())
421    }
422
423    /// Returns and [`Iterator`] over the [`char`]s of this `Strs`
424    pub fn chars(&self) -> impl DoubleEndedIterator<Item = char> {
425        self.to_array().into_iter().flat_map(str::chars)
426    }
427
428    /// Returns an [`Iterator`] over the [`char`]s and their indices.
429    pub fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> {
430        let [s0, s1] = self.to_array();
431        s0.char_indices()
432            .chain(s1.char_indices().map(move |(b, c)| (b + s0.len(), c)))
433    }
434
435    /// A [`Range<usize>`] of the bytes on this `Strs`.
436    pub fn byte_range(&self) -> Range<usize> {
437        let formed = FormedStrs::new(self);
438        formed.start as usize..(formed.start + formed.len) as usize
439    }
440
441    /// A [`Range<Point>`] of the bytes on this `Strs`.
442    ///
443    /// If you just care about the byte indices (most likely), check
444    /// out [`Strs::byte_range`] isntead.
445    pub fn range(&self) -> Range<Point> {
446        let formed = FormedStrs::new(self);
447        let range = self.byte_range();
448
449        formed.buf.point_at_byte(range.start)..formed.buf.point_at_byte(range.end)
450    }
451
452    /// Gets the indentation level of this `Strs`
453    ///
454    /// This assumes that it is a line in the `Strs`, ending with
455    /// `\n` or `\r\n`.
456    ///
457    /// This is the total "amount of spaces", that is, how many `' '`
458    /// character equivalents are here. This depends on your
459    /// [`PrintOpts`] because of the `tabstop` field.
460    #[track_caller]
461    #[inline]
462    pub fn indent(&self, opts: PrintOpts) -> usize {
463        self.bytes()
464            .take_while(|&byte| byte == b' ' || byte == b'\t')
465            .fold(0, |sum, byte| {
466                if byte == b' ' {
467                    sum + 1
468                } else {
469                    sum + opts.tabstop as usize - (sum % opts.tabstop as usize)
470                }
471            })
472    }
473
474    /// The lenght of this `Strs`, in bytes.
475    pub fn len(&self) -> usize {
476        FormedStrs::new(self).len as usize
477    }
478
479    /// Wether the len of this `Strs` is equal to 0.
480    #[must_use]
481    pub fn is_empty(&self) -> bool {
482        self.len() == 0
483    }
484
485    /// Wether this is an empty line or not
486    ///
487    /// An empty line is either `\n` or `\r\n`. If you want to check
488    /// wether a line is just whitespace, you can do this:
489    ///
490    /// ```
491    /// # duat_core::doc_duat!(duat);
492    /// use duat::prelude::*;
493    ///
494    /// fn is_whitespace(line: &Strs) -> bool {
495    ///     line.chars().all(|char| char.is_whitespace())
496    /// }
497    /// ```
498    pub fn is_empty_line(&self) -> bool {
499        self == "\n" || self == "\r\n"
500    }
501
502    /// Get the current version of the `StrsBuf`
503    ///
504    /// This version is irrespective of undos/redos, that is, an undo
505    /// will also bump the version number up. You can use this for
506    /// communicating version changes, such as with an LSP.
507    pub fn version(&self) -> u64 {
508        FormedStrs::new(self).buf.version()
509    }
510}
511
512impl<Idx: TextRange> std::ops::Index<Idx> for Strs {
513    type Output = Self;
514
515    fn index(&self, index: Idx) -> &Self::Output {
516        let formed = FormedStrs::new(self);
517        let range = index.to_range(formed.len as usize);
518
519        assert_utf8_boundary(formed.buf, range.start);
520        assert_utf8_boundary(formed.buf, range.end);
521
522        Self::new(
523            formed.buf,
524            range.start as u32 + formed.start,
525            range.len() as u32,
526        )
527    }
528}
529
530impl std::fmt::Display for Strs {
531    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
532        let [s0, s1] = self.to_array();
533        write!(f, "{s0}{s1}")
534    }
535}
536
537impl std::fmt::Debug for Strs {
538    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
539        let [s0, s1] = self.to_array();
540        write!(f, "{:?}", format!("{s0}{s1}").as_str())
541    }
542}
543
544static EMPTY_BYTES: LazyLock<StrsBuf> = LazyLock::new(StrsBuf::default);
545
546/// Given a first byte, determines how many bytes are in this UTF-8
547/// character.
548#[must_use]
549#[inline]
550pub const fn utf8_char_width(b: u8) -> usize {
551    // https://tools.ietf.org/html/rfc3629
552    const UTF8_CHAR_WIDTH: &[u8; 256] = &[
553        // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
554        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
555        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
556        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
557        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
558        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
559        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
560        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
561        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
562        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
563        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
564        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
565        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
566        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
567        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
568        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
569        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
570    ];
571    UTF8_CHAR_WIDTH[b as usize] as usize
572}
573
574/// An [`Iterator`] over the lines on an [`Strs`]
575pub struct Lines<'b> {
576    buf: &'b StrsBuf,
577    start: usize,
578    end: usize,
579    finger_back: usize,
580}
581
582impl<'b> Lines<'b> {
583    fn new(bytes: &'b StrsBuf, start: usize, end: usize) -> Self {
584        Self { buf: bytes, start, end, finger_back: end }
585    }
586
587    fn next_match_back(&mut self) -> Option<usize> {
588        let range = self.start..self.finger_back;
589        let (s0, s1) = self.buf.gapbuf.range(range).as_slices();
590
591        let pos = s0.iter().chain(s1.iter()).rev().position(|b| *b == b'\n');
592        match pos {
593            Some(pos) => {
594                self.finger_back -= pos + 1;
595                Some(self.finger_back)
596            }
597            None => {
598                self.finger_back = self.start;
599                None
600            }
601        }
602    }
603}
604
605impl<'b> Iterator for Lines<'b> {
606    type Item = &'b Strs;
607
608    fn next(&mut self) -> Option<Self::Item> {
609        if self.start == self.end {
610            return None;
611        }
612
613        let range = self.start..self.finger_back;
614        let (s0, s1) = self.buf.gapbuf.range(range.clone()).as_slices();
615
616        Some(match s0.iter().chain(s1.iter()).position(|b| *b == b'\n') {
617            Some(pos) => {
618                let line = Strs::new(self.buf, self.start as u32, pos as u32 + 1);
619                self.start += pos + 1;
620                line
621            }
622            None => {
623                let len = self.end - self.start;
624                let line = Strs::new(self.buf, self.start as u32, len as u32);
625                self.start = self.end;
626                line
627            }
628        })
629    }
630
631    fn last(mut self) -> Option<Self::Item> {
632        self.next_back()
633    }
634}
635
636impl DoubleEndedIterator for Lines<'_> {
637    fn next_back(&mut self) -> Option<Self::Item> {
638        if self.start == self.end {
639            return None;
640        }
641
642        Some(match self.next_match_back() {
643            Some(start) => {
644                if start + 1 == self.end {
645                    let start = match self.next_match_back() {
646                        Some(start) => start + 1,
647                        None => self.start,
648                    };
649                    let len = self.end - start;
650                    let line = Strs::new(self.buf, start as u32, len as u32);
651                    self.end = start;
652                    line
653                } else {
654                    let len = self.end - (start + 1);
655                    let line = Strs::new(self.buf, start as u32 + 1, len as u32);
656                    self.end = start + 1;
657                    line
658                }
659            }
660            None => {
661                let len = self.end - self.start;
662                self.end = self.start;
663                Strs::new(self.buf, self.start as u32, len as u32)
664            }
665        })
666    }
667}
668
669impl std::iter::FusedIterator for Lines<'_> {}
670
671/// A deconstructed internal representation of an [`Strs`],
672/// useful for not doing a bunch of unsafe operations.
673struct FormedStrs<'b> {
674    buf: &'b StrsBuf,
675    start: u32,
676    len: u32,
677}
678
679impl<'b> FormedStrs<'b> {
680    fn new(strs: &'b Strs) -> Self {
681        let ptr = strs as *const Strs as *const [StrsBuf];
682        // Safety: Since the len was created by the inverted operation, this
683        // should be fine.
684        let [start, len] = unsafe { std::mem::transmute::<usize, [u32; 2]>(ptr.len()) };
685
686        Self {
687            // Safety: When creating the Strs, a valid StrsBuf was at the address.
688            buf: unsafe { &*(ptr as *const StrsBuf) },
689            start,
690            len,
691        }
692    }
693}
694
695#[repr(transparent)]
696struct StrsDST([StrsBuf]);