Skip to main content

nickel_lang_core/term/
string.rs

1use std::ops::{Deref, DerefMut};
2
3use serde::{Deserialize, Serialize};
4use unicode_segmentation::UnicodeSegmentation;
5
6use super::{CompiledRegex, Number};
7use crate::{
8    eval::value::{Array, NickelValue},
9    identifier::{Ident, LocIdent},
10};
11
12/// A Nickel string is really just a Rust `String`, overlayed with some
13/// methods implementing custom logic (in particular, functions which
14/// avoid ever breaking up Unicode extended grapheme clusters.)
15#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
16pub struct NickelString(String);
17
18// Conversions to & from String-like things.
19
20impl<S> From<S> for NickelString
21where
22    String: From<S>,
23{
24    fn from(inner: S) -> Self {
25        Self(inner.into())
26    }
27}
28
29impl From<&NickelString> for String {
30    fn from(s: &NickelString) -> Self {
31        s.clone().into_inner()
32    }
33}
34
35impl From<NickelString> for Ident {
36    fn from(s: NickelString) -> Self {
37        Ident::from(s.0)
38    }
39}
40
41impl From<NickelString> for LocIdent {
42    fn from(s: NickelString) -> Self {
43        LocIdent::from(s.0)
44    }
45}
46
47impl<'a> From<&'a NickelString> for LocIdent {
48    fn from(s: &'a NickelString) -> Self {
49        LocIdent::from(s.0.as_str())
50    }
51}
52
53// The below impls broadly allow `NickelString`s to be treated just like Rust `String`s.
54
55impl Deref for NickelString {
56    type Target = String;
57
58    fn deref(&self) -> &String {
59        &self.0
60    }
61}
62
63impl DerefMut for NickelString {
64    fn deref_mut(&mut self) -> &mut String {
65        &mut self.0
66    }
67}
68
69impl std::fmt::Display for NickelString {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        write!(f, "{}", self.0)
72    }
73}
74
75impl AsRef<str> for NickelString {
76    fn as_ref(&self) -> &str {
77        self.0.as_str()
78    }
79}
80
81impl NickelString {
82    /// Creates a new empty `NclString`. Which is to say, it creates a new,
83    /// empty `String`.
84    pub fn new() -> NickelString {
85        String::new().into()
86    }
87
88    /// The number of Unicode extended grapheme clusters the string contains.
89    ///
90    /// This method has `O(self.len())` time complexity.
91    pub fn length(&self) -> usize {
92        self.grapheme_cluster_count()
93    }
94
95    /// Returns the number of Unicode extended grapheme clusters the string
96    /// contains.
97    ///
98    /// This method has `O(self.len())` time complexity.
99    #[inline]
100    fn grapheme_cluster_count(&self) -> usize {
101        self.graphemes(true).count()
102    }
103
104    /// Returns an [`Array`] of Nickel strings, each one
105    /// containing a single Unicode extended grapheme cluster.
106    ///
107    /// This method has `O(self.len())` time complexity.
108    pub fn characters(&self) -> Array {
109        self.grapheme_clusters()
110    }
111
112    /// Returns an [`Array`] of Nickel strings, each one
113    /// containing a single Unicode extended grapheme cluster.
114    ///
115    /// This method has `O(self.len())` time complexity.
116    #[inline]
117    fn grapheme_clusters(&self) -> Array {
118        self.graphemes(true)
119            .map(NickelValue::string_posless)
120            .collect()
121    }
122
123    /// Splits the string on `separator`, returning an [`Array`] of Nickel
124    /// strings.
125    ///
126    /// This method has `O(self.len() * separator.len())` time complexity when
127    /// the separator is non-empty, or `O(self.len())` otherwise.
128    pub fn split(&self, separator: &str) -> Array {
129        if separator.is_empty() {
130            // If there's no separator, we split the string between
131            // each grapheme cluster.
132            self.grapheme_clusters()
133        } else {
134            use grapheme_cluster_preservation::SearchEvent;
135
136            let mut result = Vec::new();
137
138            // We build our result vec by searching through the string for the separator.
139            grapheme_cluster_preservation::search(self, separator).for_each(|e| match e {
140                SearchEvent::Match {
141                    // If we hit a match, then everything we saw between the previous
142                    // match and now is a split...
143                    since_last_match: split,
144                } // ...then we do the same with whatever's left at the end.
145                | SearchEvent::LastNonMatch { non_match: split } => {
146                    result.push(NickelValue::string_posless(split))
147                }
148            });
149
150            Array::from_iter(result)
151        }
152    }
153
154    /// Returns `true` if `needle` is contained in `self`, and `false` otherwise.
155    ///
156    /// Note that in contrast to Rust `String`'s `contains` method, this method
157    /// will not consider a Unicode codepoint to be contained in a string if
158    /// it exists as part of a larger extended grapheme cluster.
159    ///
160    /// The time complexity of this method is `O(self.len() * needle.len())`.
161    pub fn contains(&self, needle: &str) -> bool {
162        if needle.is_empty() {
163            true
164        } else {
165            use grapheme_cluster_preservation::SearchEvent;
166
167            grapheme_cluster_preservation::search(self, needle)
168                .any(|e| matches!(e, SearchEvent::Match { .. }))
169        }
170    }
171
172    /// Returns a new `NclString` replacing every occurence of `from` with `to`.
173    ///
174    /// This method has time complexity `O(self.len() * from.len())`.
175    pub fn replace(&self, from: &str, to: &str) -> NickelString {
176        let mut result = String::with_capacity(self.capacity());
177
178        if from.is_empty() {
179            // If `from` is empty then we:
180            //   1. insert `to` at the beginning.
181            result.push_str(to);
182            //   2. insert `to` after each cluster.
183            self.graphemes(true)
184                .flat_map(|grapheme| [grapheme, to])
185                .for_each(|s| result.push_str(s))
186        } else {
187            use grapheme_cluster_preservation::SearchEvent;
188
189            grapheme_cluster_preservation::search(self, from).for_each(|e| match e {
190                // If we hit a match...
191                SearchEvent::Match { since_last_match } => {
192                    // ...we write everything we've seen since the last match...
193                    result.push_str(since_last_match);
194                    // ...and then we replace the matched `from` with `to`.
195                    result.push_str(to);
196                }
197                // We also need to write anything remaining after the last match.
198                SearchEvent::LastNonMatch { non_match } => result.push_str(non_match),
199            });
200        }
201
202        result.into()
203    }
204
205    /// Returns the substring of `self` between `start` and `end`.
206    ///
207    /// Returns an error if:
208    ///    - either start or end is not a fraction
209    ///    - start < 0
210    ///    - start > end
211    ///
212    /// The time complexity of this method is `O(self.len())`.
213    pub fn substring(&self, start: &Number, end: &Number) -> Result<NickelString, SubstringError> {
214        let Ok(start_usize) = usize::try_from(start) else {
215            return Err(SubstringError::NonIntStart(start.clone()));
216        };
217
218        let Ok(end_usize) = usize::try_from(end) else {
219            return Err(SubstringError::NonIntEnd(end.clone()));
220        };
221
222        let mut from_start = self.graphemes(true).skip(start_usize).peekable();
223
224        // If `from_start` is `None` then we skipped past the end of `self`.
225        if from_start.peek().is_none() {
226            Err(SubstringError::StartOutOfBounds {
227                start: start.clone(),
228                str_len: self.grapheme_cluster_count(),
229            })
230        } else if end_usize < start_usize {
231            Err(SubstringError::EndOutOfBounds {
232                start: start.clone(),
233                end: end.clone(),
234                str_len: self.grapheme_cluster_count(),
235            })
236        } else {
237            let wanted_substr_len = end_usize - start_usize;
238            let substr: String = from_start.take(wanted_substr_len).collect();
239            let substr: NickelString = substr.into();
240            // If `substr.grapheme_cluster_count()` is smaller than we wanted
241            // then end was larger than the length of the string.
242            if substr.grapheme_cluster_count() != wanted_substr_len {
243                Err(SubstringError::EndOutOfBounds {
244                    start: start.clone(),
245                    end: end.clone(),
246                    str_len: self.grapheme_cluster_count(),
247                })
248            } else {
249                Ok(substr)
250            }
251        }
252    }
253
254    /// Returns `true` if `self` matches `regex` and `false` otherwise.
255    ///
256    /// Note that this function returns `false` if a match occurs in the middle
257    /// of a Unicode extended grapheme cluster.
258    ///
259    /// The time complexity of this method is `O(self.len())`.
260    pub fn matches_regex(&self, regex: &CompiledRegex) -> NickelValue {
261        use grapheme_cluster_preservation::regex;
262
263        NickelValue::bool_value_posless(regex::find_iter(self, regex).next().is_some())
264    }
265
266    /// Returns a new string in which every occurence of `regex` in `self` is
267    /// replaced by `replacement`.
268    ///
269    /// Note that this function will not replace matches that begin or end
270    /// in the middle of a Unicode extended grapheme cluster.
271    ///
272    /// The time complexity of this method is `O(self.len())`.
273    pub fn replace_regex(&self, regex: &CompiledRegex, replacement: &NickelString) -> NickelString {
274        use grapheme_cluster_preservation::regex;
275
276        let mut result = String::new();
277        let mut prev_match_end = 0;
278        for m in regex::find_iter(self, regex) {
279            // Push everything between the last match and this one
280            result.push_str(&self[prev_match_end..m.start()]);
281            // Push the replacement
282            result.push_str(replacement);
283            // Skip to the end of the match
284            prev_match_end = m.end();
285        }
286        // Push whatever remains between the end of the match & the end of the
287        // string.
288        result.push_str(&self[prev_match_end..]);
289
290        result.into()
291    }
292
293    /// Find the first match in `self` for a given `regex`, and return the
294    /// match itself, the index in `self` where it appears, and any capture
295    /// groups specified.
296    ///
297    /// Note that matches will be ignored if either the match itself or any
298    /// of its capture groups begin or end in the middle of a Unicode extended
299    /// grapheme cluster.
300    ///
301    /// The time complexity of this method is `O(self.len())`.
302    pub fn find_regex(&self, regex: &CompiledRegex) -> Option<RegexFindResult> {
303        self.find_all_regex(regex).next()
304    }
305
306    /// Find all matches in `self` for a given `regex`, returning an iterator
307    /// over the match itself, the index in `self` where it appears, and any
308    /// capture groups specified.
309    ///
310    /// Note that matches will be ignored if either the match itself or any of
311    /// its capture groups begin or end in the middle of a Unicode extended
312    /// grapheme cluster.
313    pub fn find_all_regex<'a>(
314        &'a self,
315        regex: &'a CompiledRegex,
316    ) -> impl Iterator<Item = RegexFindResult> + 'a {
317        use grapheme_cluster_preservation::regex;
318
319        regex::captures_iter(self, regex).map(|capt| {
320            // If we found a capture group, we extract the whole match...
321            let first_match = capt.get(0).unwrap();
322            // and then convert each group into a `NickelString`
323            let groups = capt
324                .iter()
325                .skip(1)
326                .map(|s_opt| s_opt.map(|s| s.as_str().into()))
327                .collect();
328
329            // The indices returned by the `regex` crate are byte offsets into
330            // the string, but we need to return the index into the Nickel string,
331            // i.e., the grapheme cluster index which starts at this byte offset.
332            let adjusted_index = self
333                .grapheme_indices(true)
334                .enumerate()
335                .find_map(|(grapheme_idx, (byte_offset, _))| {
336                    if byte_offset == first_match.start() {
337                        Some(grapheme_idx.into())
338                    } else {
339                        None
340                    }
341                })
342                .expect("We already know that `first_match.start()` occurs on a cluster boundary.");
343
344            RegexFindResult {
345                matched: first_match.as_str().into(),
346                index: adjusted_index,
347                groups,
348            }
349        })
350    }
351
352    /// Consumes `self`, returning the Rust `String`.
353    pub fn into_inner(self) -> String {
354        self.0
355    }
356}
357
358impl Default for NickelString {
359    fn default() -> Self {
360        Self::new()
361    }
362}
363
364pub struct RegexFindResult {
365    pub matched: NickelString,
366    pub index: Number,
367    /// If a capture group didn't match, we store a `None`. This `None` placeholders
368    /// make the indexing predictable, so it's possible to associate captures with
369    /// parenthesis groupings in the original regex.
370    pub groups: Vec<Option<NickelString>>,
371}
372
373/// Errors returned by `NickelString`'s `substring` method.
374pub enum SubstringError {
375    /// The start index was not an int
376    NonIntStart(Number),
377    /// The end index was not an int
378    NonIntEnd(Number),
379    /// The start index was not within the bounds of the string
380    StartOutOfBounds { start: Number, str_len: usize },
381    EndOutOfBounds {
382        start: Number,
383        end: Number,
384        str_len: usize,
385    },
386}
387
388impl std::fmt::Display for SubstringError {
389    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
390        use SubstringError::*;
391
392        write!(f, "substring: ")?;
393
394        match self {
395            NonIntStart(start) => write!(
396                f,
397                "expected 2nd argument (start) to be a positive integer smaller than {}, \
398                got {start}",
399                usize::MAX
400            ),
401            NonIntEnd(end) => write!(
402                f,
403                "expected 3rd argument (end) to be a positive integer smaller than {}, got {end}",
404                usize::MAX
405            ),
406            StartOutOfBounds { start, str_len } => write!(
407                f,
408                "index out of bounds. \
409                Expected 2nd argument (start) to be between 0 and {str_len}, got {start}"
410            ),
411            EndOutOfBounds {
412                start,
413                end,
414                str_len,
415            } => write!(
416                f,
417                "index out of bounds. \
418                Expected 3rd argument (end) to be between {start} and {str_len}, got {end}"
419            ),
420        }
421    }
422}
423
424/// Types and functions designed to make it as easy as possible not to accidentally
425/// break up Unicode extended grapheme clusters in string operations.
426mod grapheme_cluster_preservation {
427    use unicode_segmentation::GraphemeCursor;
428
429    /// Returns a [`SearchIter`], which is an `Iterator` for iterating through
430    /// `haystack` in order to find `needle`.
431    ///
432    /// [`SearchIter`] produces [`SearchEvent`]s which can be used to
433    /// implement a variety of different behaviours.
434    pub fn search<'a>(haystack: &'a str, needle: &'a str) -> SearchIter<'a> {
435        SearchIter {
436            haystack,
437            needle,
438            cursor: GraphemeCursor::new(0, haystack.len(), true),
439            current_split_start: 0,
440            last_boundary: 0,
441        }
442    }
443
444    /// An event emitted by [`SearchIter`].
445    pub enum SearchEvent<'a> {
446        /// The `needle` was found within `haystack`.
447        /// Since the caller already has a reference to `needle`, we don't
448        /// include it in this event. However, we do include every part of
449        /// `haystack` which was traversed up until the match happened.
450        Match {
451            /// The slice of the string from the end of the last match
452            /// up to this one.
453            since_last_match: &'a str,
454        },
455        /// Iteration through `haystack` has finished, but there was
456        /// an unconsumed (non-matching) slice which the caller may wish
457        /// to do something with.
458        LastNonMatch {
459            /// Any non-matching text from the end of the string.
460            non_match: &'a str,
461        },
462    }
463
464    /// Our base algorithm for implementing a variety of grapheme cluster
465    /// preserving methods on [`NclString`]. As the name suggests, it is an
466    /// `Iterator`. However, rather than iterating through grapheme clusters
467    /// directly, its `Item` is a [`SearchEvent`].
468    pub struct SearchIter<'a> {
469        haystack: &'a str,
470        needle: &'a str,
471        cursor: GraphemeCursor,
472        current_split_start: usize,
473        last_boundary: usize,
474    }
475
476    impl<'a> Iterator for SearchIter<'a> {
477        type Item = SearchEvent<'a>;
478
479        fn next(&mut self) -> Option<Self::Item> {
480            use indoc::indoc;
481
482            let unwrap_explanation = indoc! {"
483                None of the GraphemeIncomplete errors are possible here:
484                    - PreContext and PrevChunk only happen if chunk_start is nonzero.
485                    - NextChunk only happens if the chunk is smaller than the cursor's len parameter
486                      but we pass `haystack` and `hackstack.len()`` respectively.
487                    - InvalidOffset can't happen because we check that `haystack` contains `needle`
488                      in the range (last_boundary, last_boundary + needle.len())
489            "};
490
491            loop {
492                let nxt = self
493                    .cursor
494                    .next_boundary(self.haystack, 0)
495                    .expect(unwrap_explanation);
496                match nxt {
497                    Some(next_boundary) => {
498                        // To check whether a match has occured, we'll first check whether
499                        // the slice of `haystack` starting at the `last_boundary` begins
500                        // with `needle`. If it does, we *also* need to check that this
501                        // instance of `needle` doesn't end in the middle of a cluster.
502                        // This is to avoid the situtaion where we have a string like
503                        // `"a🧑‍🤝‍🧑"` and we get a match by searching for `"a🧑"`.
504                        let does_match_intersect_cluster = || {
505                            let mut tmp_cursor = self.cursor.clone();
506                            tmp_cursor.set_cursor(self.last_boundary + self.needle.len());
507                            !tmp_cursor
508                                .is_boundary(self.haystack, 0)
509                                .expect(unwrap_explanation)
510                        };
511
512                        if self.haystack[self.last_boundary..].starts_with(self.needle)
513                            && !does_match_intersect_cluster()
514                        {
515                            // Yay, we found a match! We start by grabbing the slice of
516                            // the string we traversed while looking for the match.
517                            let since_last_match =
518                                &self.haystack[self.current_split_start..self.last_boundary];
519                            // Now we move the cursor to after the match, to avoid needlessly
520                            // traversing that part of the string again.
521                            // Note that in particular this means that if we seach for "aa" in
522                            // a string like "aaaa" we will get two matches: the first pair,
523                            // and then the second.
524                            let match_end = self.last_boundary + self.needle.len();
525                            self.cursor.set_cursor(match_end);
526                            self.current_split_start = match_end;
527                            self.last_boundary = match_end;
528                            // Finally we return the part of the string we traversed
529                            // to get to this match back to the caller.
530                            break Some(SearchEvent::Match { since_last_match });
531                        } else {
532                            // This is the only codepath which doesn't break
533                            // the loop. This is because we didn't match `needle`
534                            // so we're just going to move on to the next
535                            // cluster boundary.
536                            self.last_boundary = next_boundary;
537                        }
538                    }
539                    None => {
540                        // We finished iterating through the grapheme clusters, but
541                        // there could still be a final chunk of the string that we
542                        // haven't told the caller about yet.
543                        if self.current_split_start < self.haystack.len() {
544                            // We pass that slice back here & advance the `current_split_start`
545                            // pointer to the end of the string to ensure we don't hit this
546                            // condition again & will correctly return None on the next call.
547                            let non_match = &self.haystack[self.current_split_start..];
548                            self.current_split_start = self.haystack.len();
549                            break Some(SearchEvent::LastNonMatch { non_match });
550                        } else {
551                            break None;
552                        }
553                    }
554                }
555            }
556        }
557    }
558
559    /// Functions for working with regex without accidentally breaking up
560    /// Unicode extended grapheme clusters.
561    pub mod regex {
562        use regex::Regex;
563        use unicode_segmentation::GraphemeCursor;
564
565        /// An iterator which finds occurences of a given `Regex` pattern in
566        /// some source `str`, while filtering out any matches which either
567        /// begin or end in the middle of a Unicode extended grapheme cluster.
568        pub fn find_iter<'a>(
569            haystack: &'a str,
570            needle: &'a Regex,
571        ) -> impl Iterator<Item = regex::Match<'a>> {
572            needle
573                .find_iter(haystack)
574                .filter(|m| does_match_start_and_end_on_boundary(haystack, m))
575        }
576
577        /// Find all  `needle` matches in `haystack`, filtering out any match
578        /// where either the match itself, or any of its capture groups, begin
579        /// or end in the middle of a Unicode extended grapheme cluster.
580        pub fn captures_iter<'a>(
581            haystack: &'a str,
582            needle: &'a Regex,
583        ) -> impl Iterator<Item = regex::Captures<'a>> {
584            needle.captures_iter(haystack).filter(|c| {
585                c.iter().all(|maybe_match| {
586                    maybe_match.is_none_or(|m| does_match_start_and_end_on_boundary(haystack, &m))
587                })
588            })
589        }
590
591        fn does_match_start_and_end_on_boundary(haystack: &str, m: &regex::Match<'_>) -> bool {
592            let mut cursor = GraphemeCursor::new(0, haystack.len(), true);
593            cursor.set_cursor(m.start());
594            let starts_on_boundary = cursor.is_boundary(haystack, 0).expect("bad start");
595            cursor.set_cursor(m.end());
596            let ends_on_boundary = cursor.is_boundary(haystack, 0).expect("bad end");
597
598            starts_on_boundary && ends_on_boundary
599        }
600    }
601}