nickel_lang_core/term/
string.rs

1use std::ops::{Deref, DerefMut};
2
3use serde::{Deserialize, Serialize};
4use unicode_segmentation::UnicodeSegmentation;
5
6use super::{array::Array, CompiledRegex, Number, Term};
7use crate::identifier::{Ident, LocIdent};
8
9/// A Nickel string is really just a Rust `String`, overlayed with some
10/// methods implementing custom logic (in particular, functions which
11/// avoid ever breaking up Unicode extended grapheme clusters.)
12#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
13pub struct NickelString(String);
14
15// Conversions to & from String-like things.
16
17impl<S> From<S> for NickelString
18where
19    String: From<S>,
20{
21    fn from(inner: S) -> Self {
22        Self(inner.into())
23    }
24}
25
26impl From<&NickelString> for String {
27    fn from(s: &NickelString) -> Self {
28        s.clone().into_inner()
29    }
30}
31
32impl From<NickelString> for Ident {
33    fn from(s: NickelString) -> Self {
34        Ident::from(s.0)
35    }
36}
37
38impl From<NickelString> for LocIdent {
39    fn from(s: NickelString) -> Self {
40        LocIdent::from(s.0)
41    }
42}
43
44impl<'a> From<&'a NickelString> for LocIdent {
45    fn from(s: &'a NickelString) -> Self {
46        LocIdent::from(s.0.as_str())
47    }
48}
49
50// The below impls broadly allow `NclString`s to be treated just like
51// Rust `String`s.
52
53impl Deref for NickelString {
54    type Target = String;
55
56    fn deref(&self) -> &String {
57        &self.0
58    }
59}
60
61impl DerefMut for NickelString {
62    fn deref_mut(&mut self) -> &mut String {
63        &mut self.0
64    }
65}
66
67impl std::fmt::Display for NickelString {
68    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
69        write!(f, "{}", self.0)
70    }
71}
72
73impl AsRef<str> for NickelString {
74    fn as_ref(&self) -> &str {
75        self.0.as_str()
76    }
77}
78
79impl NickelString {
80    /// Creates a new empty `NclString`. Which is to say, it creates a new,
81    /// empty `String`.
82    pub fn new() -> NickelString {
83        String::new().into()
84    }
85
86    /// The number of Unicode extended grapheme clusters the string contains.
87    ///
88    /// This method has `O(self.len())` time complexity.
89    pub fn length(&self) -> usize {
90        self.grapheme_cluster_count()
91    }
92
93    /// Returns the number of Unicode extended grapheme clusters the string
94    /// contains.
95    ///
96    /// This method has `O(self.len())` time complexity.
97    #[inline]
98    fn grapheme_cluster_count(&self) -> usize {
99        self.graphemes(true).count()
100    }
101
102    /// Returns an [`Array`] of Nickel strings, each one
103    /// containing a single Unicode extended grapheme cluster.
104    ///
105    /// This method has `O(self.len())` time complexity.
106    pub fn characters(&self) -> Array {
107        self.grapheme_clusters()
108    }
109
110    /// Returns an [`Array`] of Nickel strings, each one
111    /// containing a single Unicode extended grapheme cluster.
112    ///
113    /// This method has `O(self.len())` time complexity.
114    #[inline]
115    fn grapheme_clusters(&self) -> Array {
116        self.graphemes(true)
117            .map(|g| Term::Str(g.into()).into())
118            .collect()
119    }
120
121    /// Splits the string on `separator`, returning an [`Array`] of Nickel
122    /// strings.
123    ///
124    /// This method has `O(self.len() * separator.len())` time complexity when
125    /// the separator is non-empty, or `O(self.len())` otherwise.
126    pub fn split(&self, separator: &str) -> Array {
127        if separator.is_empty() {
128            // If there's no separator, we split the string between
129            // each grapheme cluster.
130            self.grapheme_clusters()
131        } else {
132            use grapheme_cluster_preservation::SearchEvent;
133
134            let mut result = Vec::new();
135
136            // We build our result vec by searching through the string for the separator.
137            grapheme_cluster_preservation::search(self, separator).for_each(|e| match e {
138                SearchEvent::Match {
139                    // If we hit a match, then everything we saw between the previous
140                    // match and now is a split...
141                    since_last_match: split,
142                } // ...then we do the same with whatever's left at the end.
143                | SearchEvent::LastNonMatch { non_match: split } => {
144                    result.push(Term::Str(split.into()).into())
145                }
146            });
147
148            Array::from_iter(result)
149        }
150    }
151
152    /// Returns `true` if `needle` is contained in `self`, and `false` otherwise.
153    ///
154    /// Note that in contrast to Rust `String`'s `contains` method, this method
155    /// will not consider a Unicode codepoint to be contained in a string if
156    /// it exists as part of a larger extended grapheme cluster.
157    ///
158    /// The time complexity of this method is `O(self.len() * needle.len())`.
159    pub fn contains(&self, needle: &str) -> bool {
160        if needle.is_empty() {
161            true
162        } else {
163            use grapheme_cluster_preservation::SearchEvent;
164
165            grapheme_cluster_preservation::search(self, needle)
166                .any(|e| matches!(e, SearchEvent::Match { .. }))
167        }
168    }
169
170    /// Returns a new `NclString` replacing every occurence of `from` with `to`.
171    ///
172    /// This method has time complexity `O(self.len() * from.len())`.
173    pub fn replace(&self, from: &str, to: &str) -> NickelString {
174        let mut result = String::with_capacity(self.capacity());
175
176        if from.is_empty() {
177            // If `from` is empty then we:
178            //   1. insert `to` at the beginning.
179            result.push_str(to);
180            //   2. insert `to` after each cluster.
181            self.graphemes(true)
182                .flat_map(|grapheme| [grapheme, to])
183                .for_each(|s| result.push_str(s))
184        } else {
185            use grapheme_cluster_preservation::SearchEvent;
186
187            grapheme_cluster_preservation::search(self, from).for_each(|e| match e {
188                // If we hit a match...
189                SearchEvent::Match { since_last_match } => {
190                    // ...we write everything we've seen since the last match...
191                    result.push_str(since_last_match);
192                    // ...and then we replace the matched `from` with `to`.
193                    result.push_str(to);
194                }
195                // We also need to write anything remaining after the last match.
196                SearchEvent::LastNonMatch { non_match } => result.push_str(non_match),
197            });
198        }
199
200        result.into()
201    }
202
203    /// Returns the substring of `self` between `start` and `end`.
204    ///
205    /// Returns an error if:
206    ///    - either start or end is not a fraction
207    ///    - start < 0
208    ///    - start > end
209    ///
210    /// The time complexity of this method is `O(self.len())`.
211    pub fn substring(&self, start: &Number, end: &Number) -> Result<NickelString, SubstringError> {
212        let Ok(start_usize) = usize::try_from(start) else {
213            return Err(SubstringError::NonIntStart(start.clone()));
214        };
215
216        let Ok(end_usize) = usize::try_from(end) else {
217            return Err(SubstringError::NonIntEnd(end.clone()));
218        };
219
220        let mut from_start = self.graphemes(true).skip(start_usize).peekable();
221
222        // If `from_start` is `None` then we skipped past the end of `self`.
223        if from_start.peek().is_none() {
224            Err(SubstringError::StartOutOfBounds {
225                start: start.clone(),
226                str_len: self.grapheme_cluster_count(),
227            })
228        } else if end_usize < start_usize {
229            Err(SubstringError::EndOutOfBounds {
230                start: start.clone(),
231                end: end.clone(),
232                str_len: self.grapheme_cluster_count(),
233            })
234        } else {
235            let wanted_substr_len = end_usize - start_usize;
236            let substr: String = from_start.take(wanted_substr_len).collect();
237            let substr: NickelString = substr.into();
238            // If `substr.grapheme_cluster_count()` is smaller than we wanted
239            // then end was larger than the length of the string.
240            if substr.grapheme_cluster_count() != wanted_substr_len {
241                Err(SubstringError::EndOutOfBounds {
242                    start: start.clone(),
243                    end: end.clone(),
244                    str_len: self.grapheme_cluster_count(),
245                })
246            } else {
247                Ok(substr)
248            }
249        }
250    }
251
252    /// Returns `true` if `self` matches `regex` and `false` otherwise.
253    ///
254    /// Note that this function returns `false` if a match occurs in the middle
255    /// of a Unicode extended grapheme cluster.
256    ///
257    /// The time complexity of this method is `O(self.len())`.
258    pub fn matches_regex(&self, regex: &CompiledRegex) -> Term {
259        use grapheme_cluster_preservation::regex;
260
261        Term::Bool(regex::find_iter(self, regex).next().is_some())
262    }
263
264    /// Returns a new string in which every occurence of `regex` in `self` is
265    /// replaced by `replacement`.
266    ///
267    /// Note that this function will not replace matches that begin or end
268    /// in the middle of a Unicode extended grapheme cluster.
269    ///
270    /// The time complexity of this method is `O(self.len())`.
271    pub fn replace_regex(&self, regex: &CompiledRegex, replacement: &NickelString) -> NickelString {
272        use grapheme_cluster_preservation::regex;
273
274        let mut result = String::new();
275        let mut prev_match_end = 0;
276        for m in regex::find_iter(self, regex) {
277            // Push everything between the last match and this one
278            result.push_str(&self[prev_match_end..m.start()]);
279            // Push the replacement
280            result.push_str(replacement);
281            // Skip to the end of the match
282            prev_match_end = m.end();
283        }
284        // Push whatever remains between the end of the match & the end of the
285        // string.
286        result.push_str(&self[prev_match_end..]);
287
288        result.into()
289    }
290
291    /// Find the first match in `self` for a given `regex`, and return the
292    /// match itself, the index in `self` where it appears, and any capture
293    /// groups specified.
294    ///
295    /// Note that matches will be ignored if either the match itself or any
296    /// of its capture groups begin or end in the middle of a Unicode extended
297    /// grapheme cluster.
298    ///
299    /// The time complexity of this method is `O(self.len())`.
300    pub fn find_regex(&self, regex: &CompiledRegex) -> Option<RegexFindResult> {
301        self.find_all_regex(regex).next()
302    }
303
304    /// Find all matches in `self` for a given `regex`, returning an iterator
305    /// over the match itself, the index in `self` where it appears, and any
306    /// capture groups specified.
307    ///
308    /// Note that matches will be ignored if either the match itself or any of
309    /// its capture groups begin or end in the middle of a Unicode extended
310    /// grapheme cluster.
311    pub fn find_all_regex<'a>(
312        &'a self,
313        regex: &'a CompiledRegex,
314    ) -> impl Iterator<Item = RegexFindResult> + 'a {
315        use grapheme_cluster_preservation::regex;
316
317        regex::captures_iter(self, regex).map(|capt| {
318            // If we found a capture group, we extract the whole match...
319            let first_match = capt.get(0).unwrap();
320            // and then convert each group into a `NickelString`
321            let groups = capt
322                .iter()
323                .skip(1)
324                .map(|s_opt| s_opt.map(|s| s.as_str().into()))
325                .collect();
326
327            // The indices returned by the `regex` crate are byte offsets into
328            // the string, but we need to return the index into the Nickel string,
329            // i.e., the grapheme cluster index which starts at this byte offset.
330            let adjusted_index = self
331                .grapheme_indices(true)
332                .enumerate()
333                .find_map(|(grapheme_idx, (byte_offset, _))| {
334                    if byte_offset == first_match.start() {
335                        Some(grapheme_idx.into())
336                    } else {
337                        None
338                    }
339                })
340                .expect("We already know that `first_match.start()` occurs on a cluster boundary.");
341
342            RegexFindResult {
343                matched: first_match.as_str().into(),
344                index: adjusted_index,
345                groups,
346            }
347        })
348    }
349
350    /// Consumes `self`, returning the Rust `String`.
351    pub fn into_inner(self) -> String {
352        self.0
353    }
354}
355
356impl Default for NickelString {
357    fn default() -> Self {
358        Self::new()
359    }
360}
361
362pub struct RegexFindResult {
363    pub matched: NickelString,
364    pub index: Number,
365    /// If a capture group didn't match, we store a `None`. This `None` placeholders
366    /// make the indexing predictable, so it's possible to associate captures with
367    /// parenthesis groupings in the original regex.
368    pub groups: Vec<Option<NickelString>>,
369}
370
371/// Errors returned by `NickelString`'s `substring` method.
372pub enum SubstringError {
373    /// The start index was not an int
374    NonIntStart(Number),
375    /// The end index was not an int
376    NonIntEnd(Number),
377    /// The start index was not within the bounds of the string
378    StartOutOfBounds { start: Number, str_len: usize },
379    EndOutOfBounds {
380        start: Number,
381        end: Number,
382        str_len: usize,
383    },
384}
385
386impl std::fmt::Display for SubstringError {
387    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
388        use SubstringError::*;
389
390        write!(f, "substring: ")?;
391
392        match self {
393            NonIntStart(start) => write!(
394                f,
395                "expected 2nd argument (start) to be a positive integer smaller than {}, \
396                got {start}",
397                usize::MAX
398            ),
399            NonIntEnd(end) => write!(
400                f,
401                "expected 3rd argument (end) to be a positive integer smaller than {}, got {end}",
402                usize::MAX
403            ),
404            StartOutOfBounds { start, str_len } => write!(
405                f,
406                "index out of bounds. \
407                Expected 2nd argument (start) to be between 0 and {str_len}, got {start}"
408            ),
409            EndOutOfBounds {
410                start,
411                end,
412                str_len,
413            } => write!(
414                f,
415                "index out of bounds. \
416                Expected 3rd argument (end) to be between {start} and {str_len}, got {end}"
417            ),
418        }
419    }
420}
421
422/// Types and functions designed to make it as easy as possible not to accidentally
423/// break up Unicode extended grapheme clusters in string operations.
424mod grapheme_cluster_preservation {
425    use unicode_segmentation::GraphemeCursor;
426
427    /// Returns a [`SearchIter`], which is an `Iterator` for iterating through
428    /// `haystack` in order to find `needle`.
429    ///
430    /// [`SearchIter`] produces [`SearchEvent`]s which can be used to
431    /// implement a variety of different behaviours.
432    pub fn search<'a>(haystack: &'a str, needle: &'a str) -> SearchIter<'a> {
433        SearchIter {
434            haystack,
435            needle,
436            cursor: GraphemeCursor::new(0, haystack.len(), true),
437            current_split_start: 0,
438            last_boundary: 0,
439        }
440    }
441
442    /// An event emitted by [`SearchIter`].
443    pub enum SearchEvent<'a> {
444        /// The `needle` was found within `haystack`.
445        /// Since the caller already has a reference to `needle`, we don't
446        /// include it in this event. However, we do include every part of
447        /// `haystack` which was traversed up until the match happened.
448        Match {
449            /// The slice of the string from the end of the last match
450            /// up to this one.
451            since_last_match: &'a str,
452        },
453        /// Iteration through `haystack` has finished, but there was
454        /// an unconsumed (non-matching) slice which the caller may wish
455        /// to do something with.
456        LastNonMatch {
457            /// Any non-matching text from the end of the string.
458            non_match: &'a str,
459        },
460    }
461
462    /// Our base algorithm for implementing a variety of grapheme cluster
463    /// preserving methods on [`NclString`]. As the name suggests, it is an
464    /// `Iterator`. However, rather than iterating through grapheme clusters
465    /// directly, its `Item` is a [`SearchEvent`].
466    pub struct SearchIter<'a> {
467        haystack: &'a str,
468        needle: &'a str,
469        cursor: GraphemeCursor,
470        current_split_start: usize,
471        last_boundary: usize,
472    }
473
474    impl<'a> Iterator for SearchIter<'a> {
475        type Item = SearchEvent<'a>;
476
477        fn next(&mut self) -> Option<Self::Item> {
478            use indoc::indoc;
479
480            let unwrap_explanation = indoc! {"
481                None of the GraphemeIncomplete errors are possible here:
482                    - PreContext and PrevChunk only happen if chunk_start is nonzero.
483                    - NextChunk only happens if the chunk is smaller than the cursor's len parameter
484                      but we pass `haystack` and `hackstack.len()`` respectively.
485                    - InvalidOffset can't happen because we check that `haystack` contains `needle`
486                      in the range (last_boundary, last_boundary + needle.len())
487            "};
488
489            loop {
490                let nxt = self
491                    .cursor
492                    .next_boundary(self.haystack, 0)
493                    .expect(unwrap_explanation);
494                match nxt {
495                    Some(next_boundary) => {
496                        // To check whether a match has occured, we'll first check whether
497                        // the slice of `haystack` starting at the `last_boundary` begins
498                        // with `needle`. If it does, we *also* need to check that this
499                        // instance of `needle` doesn't end in the middle of a cluster.
500                        // This is to avoid the situtaion where we have a string like
501                        // `"a🧑‍🤝‍🧑"` and we get a match by searching for `"a🧑"`.
502                        let does_match_intersect_cluster = || {
503                            let mut tmp_cursor = self.cursor.clone();
504                            tmp_cursor.set_cursor(self.last_boundary + self.needle.len());
505                            !tmp_cursor
506                                .is_boundary(self.haystack, 0)
507                                .expect(unwrap_explanation)
508                        };
509
510                        if self.haystack[self.last_boundary..].starts_with(self.needle)
511                            && !does_match_intersect_cluster()
512                        {
513                            // Yay, we found a match! We start by grabbing the slice of
514                            // the string we traversed while looking for the match.
515                            let since_last_match =
516                                &self.haystack[self.current_split_start..self.last_boundary];
517                            // Now we move the cursor to after the match, to avoid needlessly
518                            // traversing that part of the string again.
519                            // Note that in particular this means that if we seach for "aa" in
520                            // a string like "aaaa" we will get two matches: the first pair,
521                            // and then the second.
522                            let match_end = self.last_boundary + self.needle.len();
523                            self.cursor.set_cursor(match_end);
524                            self.current_split_start = match_end;
525                            self.last_boundary = match_end;
526                            // Finally we return the part of the string we traversed
527                            // to get to this match back to the caller.
528                            break Some(SearchEvent::Match { since_last_match });
529                        } else {
530                            // This is the only codepath which doesn't break
531                            // the loop. This is because we didn't match `needle`
532                            // so we're just going to move on to the next
533                            // cluster boundary.
534                            self.last_boundary = next_boundary;
535                        }
536                    }
537                    None => {
538                        // We finished iterating through the grapheme clusters, but
539                        // there could still be a final chunk of the string that we
540                        // haven't told the caller about yet.
541                        if self.current_split_start < self.haystack.len() {
542                            // We pass that slice back here & advance the `current_split_start`
543                            // pointer to the end of the string to ensure we don't hit this
544                            // condition again & will correctly return None on the next call.
545                            let non_match = &self.haystack[self.current_split_start..];
546                            self.current_split_start = self.haystack.len();
547                            break Some(SearchEvent::LastNonMatch { non_match });
548                        } else {
549                            break None;
550                        }
551                    }
552                }
553            }
554        }
555    }
556
557    /// Functions for working with regex without accidentally breaking up
558    /// Unicode extended grapheme clusters.
559    pub mod regex {
560        use regex::Regex;
561        use unicode_segmentation::GraphemeCursor;
562
563        /// An iterator which finds occurences of a given `Regex` pattern in
564        /// some source `str`, while filtering out any matches which either
565        /// begin or end in the middle of a Unicode extended grapheme cluster.
566        pub fn find_iter<'a>(
567            haystack: &'a str,
568            needle: &'a Regex,
569        ) -> impl Iterator<Item = regex::Match<'a>> {
570            needle
571                .find_iter(haystack)
572                .filter(|m| does_match_start_and_end_on_boundary(haystack, m))
573        }
574
575        /// Find all  `needle` matches in `haystack`, filtering out any match
576        /// where either the match itself, or any of its capture groups, begin
577        /// or end in the middle of a Unicode extended grapheme cluster.
578        pub fn captures_iter<'a>(
579            haystack: &'a str,
580            needle: &'a Regex,
581        ) -> impl Iterator<Item = regex::Captures<'a>> {
582            needle.captures_iter(haystack).filter(|c| {
583                c.iter().all(|maybe_match| {
584                    maybe_match.is_none_or(|m| does_match_start_and_end_on_boundary(haystack, &m))
585                })
586            })
587        }
588
589        fn does_match_start_and_end_on_boundary(haystack: &str, m: &regex::Match<'_>) -> bool {
590            let mut cursor = GraphemeCursor::new(0, haystack.len(), true);
591            cursor.set_cursor(m.start());
592            let starts_on_boundary = cursor.is_boundary(haystack, 0).expect("bad start");
593            cursor.set_cursor(m.end());
594            let ends_on_boundary = cursor.is_boundary(haystack, 0).expect("bad end");
595
596            starts_on_boundary && ends_on_boundary
597        }
598    }
599}