Skip to main content

duat_core/text/strs/
mod.rs

1//! The equivalent of [`str`] for Duat.
2//!
3//! This module defines the [`Strs`] struct, which is a type that,
4//! much like `str`, only shows up as `&Strs`. This is the only avenue
5//! of direct interaction that the end user will get in regards to the
6//! bytes of the [`Text`].
7//!
8//! The `Strs` are backed by an [`StrsBuf`] struct, which owns the
9//! allocation of the [`GapBuffer`] of the utf-8 text.
10//!
11//! [`Text`]: super::Text
12//! [`StrsBuf`]: buf::StrsBuf
13//! [`GapBuffer`]: gap_buf::GapBuffer
14use std::{ops::Range, sync::LazyLock};
15
16pub use buf::StrsBuf;
17
18use crate::{
19    opts::PrintOpts,
20    text::{Point, TextIndex, TextRange, strs::buf::assert_utf8_boundary},
21};
22
23mod buf;
24mod line_ranges;
25
26/// The [`str`] equivalent for Duat.
27///
28/// Due to Duat's use of a [gap buffer] to represent the [`Text`],
29/// this struct doesn't represent just one slice like a `&str` does.
30/// Instead, it represents two slices (either being possibly empty).
31///
32/// This is done by making use of a custom dynamically sized type,
33/// allowing for near identical utilization.
34///
35/// [gap buffer]: https://en.wikipedia.org/wiki/Gap_buffer
36/// [`Text`]: super::Text
37// I need it to be like this so it is a wide pointer, which can be
38// turned into a reference that can be used for indexing and std::ops::Deref.
39#[repr(transparent)]
40pub struct Strs(StrsDST);
41
42impl Strs {
43    /// Returns a new `Strs`.
44    pub(super) fn new(bytes: &StrsBuf, start: u32, len: u32) -> &Self {
45        // Safety: Since the ordering of fields shouldn't change, this should
46        // be fine.
47        let start_and_len = unsafe { std::mem::transmute::<[u32; 2], usize>([start, len]) };
48        let ptr = std::ptr::slice_from_raw_parts(bytes as *const StrsBuf, start_and_len);
49
50        // Safety: Since the address is valid, this should be converted to a
51        // quasi wide pointer, where the metadata is size-like, but encodes
52        // two values instead of one.
53        unsafe { &*(ptr as *const Self) }
54    }
55
56    ////////// Querying functions
57
58    /// The [`Point`] at the end of the `Strs`.
59    ///
60    /// This is the equivalent of `strs.range().start`.
61    pub fn start_point(&self) -> Point {
62        let formed = FormedStrs::new(self);
63        if formed.start == 0 {
64            Point::default()
65        } else {
66            let slices = unsafe {
67                let (s0, s1) = formed.bytes.buf.as_slices();
68                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
69            };
70            formed
71                .bytes
72                .line_ranges
73                .point_by_key(formed.start as usize, |[b, _]| b, slices)
74                .unwrap_or_else(|| formed.bytes.line_ranges.max(slices))
75        }
76    }
77
78    /// The [`Point`] at the end of the `Strs`.
79    ///
80    /// This is the equivalent of `strs.range().end`.
81    pub fn end_point(&self) -> Point {
82        let formed = FormedStrs::new(self);
83
84        let slices = unsafe {
85            let (s0, s1) = formed.bytes.buf.as_slices();
86            [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
87        };
88
89        let byte = formed.start as usize + formed.len as usize;
90        if byte == formed.bytes.buf.len() {
91            formed.bytes.line_ranges.max(slices)
92        } else {
93            formed
94                .bytes
95                .line_ranges
96                .point_by_key(byte, |[b, _]| b, slices)
97                .unwrap_or_else(|| formed.bytes.line_ranges.max(slices))
98        }
99    }
100
101    /// The `char` at a given position.
102    ///
103    /// This position can either be a [`Point`] or a byte index. Will
104    /// return [`None`] if the position is greater than or equal to
105    /// `self.len()` or if it is not located in a utf8 character
106    /// boundary.
107    #[track_caller]
108    pub fn char_at(&self, p: impl TextIndex) -> Option<char> {
109        let formed = FormedStrs::new(self);
110        let range = formed
111            .bytes
112            .buf
113            .range(formed.start as usize..formed.start as usize + formed.len as usize);
114
115        if range
116            .get(p.to_byte_index())
117            .is_none_or(|b| utf8_char_width(*b) == 0)
118        {
119            return None;
120        }
121
122        let [s0, s1] = self.to_array();
123        Some(if p.to_byte_index() < s0.len() {
124            s0[p.to_byte_index()..].chars().next().unwrap()
125        } else {
126            s1[p.to_byte_index() - s0.len()..]
127                .chars()
128                .next()
129                .unwrap_or_else(|| panic!("{self:#?}"))
130        })
131    }
132
133    /// The [`Point`] corresponding to the byte position, 0 indexed.
134    ///
135    /// If the byte position would fall in between two characters
136    /// (because the first one comprises more than one byte), the
137    /// first character is chosen as the [`Point`] where the byte is
138    /// located.
139    ///
140    /// # Panics
141    ///
142    /// Will panic if `b` is greater than the length of the text.
143    #[inline(always)]
144    #[track_caller]
145    pub fn point_at_byte(&self, byte: usize) -> Point {
146        assert!(
147            byte <= self.len(),
148            "byte out of bounds: the len is {}, but the byte is {byte}",
149            self.len()
150        );
151
152        let formed = FormedStrs::new(self);
153
154        if byte == self.len() {
155            self.end_point()
156        } else {
157            let slices = unsafe {
158                let (s0, s1) = formed.bytes.buf.as_slices();
159                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
160            };
161            formed
162                .bytes
163                .line_ranges
164                .point_by_key(byte + formed.start as usize, |[b, _]| b, slices)
165                .unwrap()
166        }
167    }
168
169    /// The [`Point`] associated with the `c`th char.
170    ///
171    /// # Panics
172    ///
173    /// Will panic if `c` is greater than the number of chars in the
174    /// text.
175    #[inline(always)]
176    #[track_caller]
177    pub fn point_at_char(&self, char: usize) -> Point {
178        let end_point = self.end_point();
179        assert!(
180            char <= end_point.char(),
181            "char out of bounds: the len is {}, but the char is {char}",
182            end_point.char()
183        );
184
185        let formed = FormedStrs::new(self);
186
187        if char == end_point.char() {
188            end_point
189        } else {
190            let slices = unsafe {
191                let (s0, s1) = formed.bytes.buf.as_slices();
192                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
193            };
194
195            let start = formed
196                .bytes
197                .line_ranges
198                .point_by_key(formed.start as usize, |[b, _]| b, slices)
199                .unwrap();
200
201            formed
202                .bytes
203                .line_ranges
204                .point_by_key(start.char() + char, |[_, c]| c, slices)
205                .unwrap()
206        }
207    }
208
209    /// The [`Point`] where the `l`th line starts, 0 indexed.
210    ///
211    /// If `l == number_of_lines`, returns the last point of the
212    /// `Strs`.
213    ///
214    /// # Panics
215    ///
216    /// Will panic if the number `l` is greater than the number of
217    /// lines on the text.
218    #[inline(always)]
219    #[track_caller]
220    pub fn point_at_coords(&self, line: usize, column: usize) -> Point {
221        let end_point = self.end_point();
222        assert!(
223            line <= end_point.line(),
224            "line out of bounds: the len is {}, but the line is {line}",
225            end_point.line()
226        );
227
228        let formed = FormedStrs::new(self);
229
230        if line == 0 {
231            self.point_at_byte(formed.start as usize)
232        } else if line == end_point.line() {
233            end_point
234        } else {
235            let slices = unsafe {
236                let (s0, s1) = formed.bytes.buf.as_slices();
237                [str::from_utf8_unchecked(s0), str::from_utf8_unchecked(s1)]
238            };
239            let line = {
240                let start = self.point_at_byte(formed.start as usize);
241                start.line() + line
242            };
243
244            let point = formed
245                .bytes
246                .line_ranges
247                .point_at_coords(line, column, slices);
248
249            if let Some(point) = point {
250                point
251            } else {
252                let next_line_start = if line + 1 == end_point.line() {
253                    end_point
254                } else {
255                    formed
256                        .bytes
257                        .line_ranges
258                        .point_at_coords(line + 1, 0, slices)
259                        .unwrap()
260                };
261                next_line_start.rev('\n')
262            }
263        }
264    }
265
266    /// The `Strs` for the `n`th line.
267    ///
268    /// # Panics
269    ///
270    /// Panics if `n >= self.len().line()`
271    #[track_caller]
272    pub fn line(&self, n: usize) -> &Strs {
273        let end_point = self.end_point();
274        assert!(
275            n < end_point.line(),
276            "line out of bounds: the len is {}, but the line is {n}",
277            end_point.line()
278        );
279
280        let start = self.point_at_coords(n, 0);
281        let end = if n + 1 == end_point.line() {
282            end_point
283        } else {
284            self.point_at_coords(n + 1, 0)
285        };
286
287        &self[start..end]
288    }
289
290    /// Returns an `Strs` for the whole [`Text`] that this one belongs
291    /// to.
292    ///
293    /// You should use this function if you want to create an api that
294    /// requires the whole `Strs` to be used as an argument. You can
295    /// accept any `Strs`, then just transform it to the full one
296    /// using this function.
297    ///
298    /// If this `Strs` is from `Strs::empty`, then this will
299    /// return itself.
300    ///
301    /// [`Text`]: super::Text
302    pub fn full(&self) -> &Strs {
303        FormedStrs::new(self).bytes
304    }
305
306    /// The last [`Point`] associated with a `char`
307    ///
308    /// This function takes into account the whole [`Text`], not just
309    /// the parts contained in the `Strs`. And since a `Text` can't be
310    /// empty, it will always return a [`Point`] associated with the
311    /// `\n` character.
312    ///
313    /// [`len`]: Self::len
314    /// [`Text`]: crate::text::Text
315    pub fn last_point(&self) -> Point {
316        let formed = FormedStrs::new(self);
317        formed.bytes.end_point().rev('\n')
318    }
319
320    /// Tries to get a subslice of the `Strs`
321    ///
322    /// It will return [`None`] if the range does not start or end in
323    /// valid utf8 boundaries. If you expect the value to alway be
324    /// `Some`, consider using the index operator (`[]`) instead.
325    ///
326    /// This method is conceptually similar to [`&str::get`], but the
327    /// reference is to an `Strs` struct. This struct points to a
328    /// subslice of the `Strs`s, which is actually two slices,
329    /// given the internal gap buffer representation.
330    ///
331    /// This type is supposed to act nearly identically with the
332    /// [`str`] type, only differing in the fact that its maximum
333    /// lenght is [`u32::MAX`], not [`usize::MAX`].
334    ///
335    /// [`&str`]: str
336    /// [`Text`]: crate::text::Text
337    /// [range]: TextRange
338    /// [`&str::get`]: str::get
339    pub fn get(&self, range: impl TextRange) -> Option<&Strs> {
340        let formed = FormedStrs::new(self);
341        let range = {
342            let range = range.try_to_range(formed.len as usize)?;
343            range.start + formed.start as usize..range.end + formed.start as usize
344        };
345
346        let (s0, s1) = formed.bytes.buf.range(range.clone()).as_slices();
347
348        // Check if the slices match utf8 boundaries.
349        if s0.first().is_some_and(|b| utf8_char_width(*b) == 0)
350            || s1.first().is_some_and(|b| utf8_char_width(*b) == 0)
351            || formed
352                .bytes
353                .buf
354                .get(range.end)
355                .is_some_and(|b| utf8_char_width(*b) == 0)
356        {
357            return None;
358        }
359
360        Some(Strs::new(
361            formed.bytes,
362            range.start as u32,
363            range.len() as u32,
364        ))
365    }
366
367    /// An empty `Strs`, useful in some circumstances.
368    ///
369    /// This is the equivalent of a literal `""`.
370    pub fn empty() -> &'static Self {
371        Strs::new(&EMPTY_BYTES, 0, 0)
372    }
373
374    /// Returns a struct of two `&[u8]` representing a [`TextRange`]
375    /// from the slice.
376    #[track_caller]
377    pub fn slices(&self, range: impl TextRange) -> [&[u8]; 2] {
378        let formed = FormedStrs::new(self);
379        let range = {
380            let range = range.to_range(formed.len as usize);
381            range.start + formed.start as usize..range.end + formed.start as usize
382        };
383
384        let (s0, s1) = formed.bytes.buf.range(range).as_slices();
385        [s0, s1]
386    }
387
388    /// Converts this `Strs` into an array of its two parts.
389    pub fn to_array(&self) -> [&str; 2] {
390        let formed = FormedStrs::new(self);
391        let range = formed.start as usize..(formed.start + formed.len) as usize;
392
393        let (s0, s1) = formed.bytes.buf.range(range).as_slices();
394
395        // Safety: The creation of a &Strs necessitates a valid utf8 range.
396        [unsafe { std::str::from_utf8_unchecked(s0) }, unsafe {
397            std::str::from_utf8_unchecked(s1)
398        }]
399    }
400
401    /// Returns an iterator over the lines in a given range
402    ///
403    /// The lines are inclusive, that is, it will iterate over the
404    /// whole line, not just the parts within the range.
405    ///
406    /// [range]: TextRange
407    pub fn lines(&self) -> Lines<'_> {
408        let formed = FormedStrs::new(self);
409        Lines::new(
410            formed.bytes,
411            formed.start as usize,
412            (formed.start + formed.len) as usize,
413        )
414    }
415
416    /// Returns and [`Iterator`] over the [bytes] of this `Strs`
417    ///
418    /// [bytes]: u8
419    pub fn bytes(&self) -> impl DoubleEndedIterator<Item = u8> {
420        self.slices(..)
421            .into_iter()
422            .flat_map(|slice| slice.iter().copied())
423    }
424
425    /// Returns and [`Iterator`] over the [`char`]s of this `Strs`
426    pub fn chars(&self) -> impl DoubleEndedIterator<Item = char> {
427        self.to_array().into_iter().flat_map(str::chars)
428    }
429
430    /// Returns an [`Iterator`] over the [`char`]s and their indices.
431    pub fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> {
432        let [s0, s1] = self.to_array();
433        s0.char_indices()
434            .chain(s1.char_indices().map(move |(b, c)| (b + s0.len(), c)))
435    }
436
437    /// A [`Range<usize>`] of the bytes on this `Strs`.
438    pub fn byte_range(&self) -> Range<usize> {
439        let formed = FormedStrs::new(self);
440        formed.start as usize..(formed.start + formed.len) as usize
441    }
442
443    /// A [`Range<Point>`] of the bytes on this `Strs`.
444    ///
445    /// If you just care about the byte indices (most likely), check
446    /// out [`Strs::byte_range`] isntead.
447    pub fn range(&self) -> Range<Point> {
448        let formed = FormedStrs::new(self);
449        let range = self.byte_range();
450
451        formed.bytes.point_at_byte(range.start)..formed.bytes.point_at_byte(range.end)
452    }
453
454    /// Gets the indentation level of this `Strs`
455    ///
456    /// This assumes that it is a line in the `Strs`, ending with
457    /// `\n` or `\r\n`.
458    ///
459    /// This is the total "amount of spaces", that is, how many `' '`
460    /// character equivalents are here. This depends on your
461    /// [`PrintOpts`] because of the `tabstop` field.
462    #[track_caller]
463    pub fn indent(&self, opts: PrintOpts) -> usize {
464        self.chars()
465            .take_while(|&char| char == ' ' || char == '\t')
466            .fold(0, |sum, char| {
467                if char == ' ' {
468                    sum + 1
469                } else {
470                    sum + opts.tabstop as usize - (sum % opts.tabstop as usize)
471                }
472            })
473    }
474
475    /// The lenght of this `Strs`, in bytes.
476    pub fn len(&self) -> usize {
477        FormedStrs::new(self).len as usize
478    }
479
480    /// Wether the len of this `Strs` is equal to 0.
481    #[must_use]
482    pub fn is_empty(&self) -> bool {
483        self.len() == 0
484    }
485
486    /// Wether this is an empty line or not
487    ///
488    /// An empty line is either `\n` or `\r\n`. If you want to check
489    /// wether a line is just whitespace, you can do this:
490    ///
491    /// ```
492    /// # duat_core::doc_duat!(duat);
493    /// use duat::prelude::*;
494    ///
495    /// fn is_whitespace(line: &Strs) -> bool {
496    ///     line.chars().all(|char| char.is_whitespace())
497    /// }
498    /// ```
499    pub fn is_empty_line(&self) -> bool {
500        self == "\n" || self == "\r\n"
501    }
502}
503
504impl<Idx: TextRange> std::ops::Index<Idx> for Strs {
505    type Output = Self;
506
507    fn index(&self, index: Idx) -> &Self::Output {
508        let formed = FormedStrs::new(self);
509        let range = index.to_range(formed.len as usize);
510
511        assert_utf8_boundary(formed.bytes, range.start);
512        assert_utf8_boundary(formed.bytes, range.end);
513
514        Self::new(
515            formed.bytes,
516            range.start as u32 + formed.start,
517            range.len() as u32,
518        )
519    }
520}
521
522impl std::fmt::Display for Strs {
523    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
524        let [s0, s1] = self.to_array();
525        write!(f, "{s0}{s1}")
526    }
527}
528
529impl std::fmt::Debug for Strs {
530    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
531        let [s0, s1] = self.to_array();
532        write!(f, "{:?}", format!("{s0}{s1}").as_str())
533    }
534}
535
536static EMPTY_BYTES: LazyLock<StrsBuf> = LazyLock::new(StrsBuf::default);
537
538/// Given a first byte, determines how many bytes are in this UTF-8
539/// character.
540#[must_use]
541#[inline]
542pub const fn utf8_char_width(b: u8) -> usize {
543    // https://tools.ietf.org/html/rfc3629
544    const UTF8_CHAR_WIDTH: &[u8; 256] = &[
545        // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
546        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
547        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
548        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
549        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
550        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
551        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
552        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
553        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
554        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
555        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
556        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
557        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
558        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
559        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
560        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
561        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
562    ];
563    UTF8_CHAR_WIDTH[b as usize] as usize
564}
565
566/// An [`Iterator`] over the lines on an [`Strs`]
567pub struct Lines<'b> {
568    bytes: &'b StrsBuf,
569    start: usize,
570    end: usize,
571    finger_back: usize,
572}
573
574impl<'b> Lines<'b> {
575    fn new(bytes: &'b StrsBuf, start: usize, end: usize) -> Self {
576        Self { bytes, start, end, finger_back: end }
577    }
578
579    fn next_match_back(&mut self) -> Option<usize> {
580        let range = self.start..self.finger_back;
581        let (s0, s1) = self.bytes.buf.range(range).as_slices();
582
583        let pos = s0.iter().chain(s1.iter()).rev().position(|b| *b == b'\n');
584        match pos {
585            Some(pos) => {
586                self.finger_back -= pos + 1;
587                Some(self.finger_back)
588            }
589            None => {
590                self.finger_back = self.start;
591                None
592            }
593        }
594    }
595}
596
597impl<'b> Iterator for Lines<'b> {
598    type Item = &'b Strs;
599
600    fn next(&mut self) -> Option<Self::Item> {
601        if self.start == self.end {
602            return None;
603        }
604
605        let range = self.start..self.finger_back;
606        let (s0, s1) = self.bytes.buf.range(range.clone()).as_slices();
607
608        Some(match s0.iter().chain(s1.iter()).position(|b| *b == b'\n') {
609            Some(pos) => {
610                let line = Strs::new(self.bytes, self.start as u32, pos as u32 + 1);
611                self.start += pos + 1;
612                line
613            }
614            None => {
615                let len = self.end - self.start;
616                let line = Strs::new(self.bytes, self.start as u32, len as u32);
617                self.start = self.end;
618                line
619            }
620        })
621    }
622
623    fn last(mut self) -> Option<Self::Item> {
624        self.next_back()
625    }
626}
627
628impl DoubleEndedIterator for Lines<'_> {
629    fn next_back(&mut self) -> Option<Self::Item> {
630        if self.start == self.end {
631            return None;
632        }
633
634        Some(match self.next_match_back() {
635            Some(start) => {
636                if start + 1 == self.end {
637                    let start = match self.next_match_back() {
638                        Some(start) => start + 1,
639                        None => self.start,
640                    };
641                    let len = self.end - start;
642                    let line = Strs::new(self.bytes, start as u32, len as u32);
643                    self.end = start;
644                    line
645                } else {
646                    let len = self.end - (start + 1);
647                    let line = Strs::new(self.bytes, start as u32 + 1, len as u32);
648                    self.end = start + 1;
649                    line
650                }
651            }
652            None => {
653                let len = self.end - self.start;
654                self.end = self.start;
655                Strs::new(self.bytes, self.start as u32, len as u32)
656            }
657        })
658    }
659}
660
661impl std::iter::FusedIterator for Lines<'_> {}
662
663/// A deconstructed internal representation of an [`Strs`],
664/// useful for not doing a bunch of unsafe operations.
665struct FormedStrs<'b> {
666    bytes: &'b StrsBuf,
667    start: u32,
668    len: u32,
669}
670
671impl<'b> FormedStrs<'b> {
672    fn new(strs: &'b Strs) -> Self {
673        let ptr = strs as *const Strs as *const [StrsBuf];
674        // Safety: Since the len was created by the inverted operation, this
675        // should be fine.
676        let [start, len] = unsafe { std::mem::transmute::<usize, [u32; 2]>(ptr.len()) };
677
678        Self {
679            // Safety: When creating the Strs, a valid StrsBuf was at the address.
680            bytes: unsafe { &*(ptr as *const StrsBuf) },
681            start,
682            len,
683        }
684    }
685}
686
687#[repr(transparent)]
688struct StrsDST([StrsBuf]);