nickel_lang_core/term/string.rs
1use std::ops::{Deref, DerefMut};
2
3use serde::{Deserialize, Serialize};
4use unicode_segmentation::UnicodeSegmentation;
5
6use super::{array::Array, CompiledRegex, Number, Term};
7use crate::identifier::{Ident, LocIdent};
8
9/// A Nickel string is really just a Rust `String`, overlayed with some
10/// methods implementing custom logic (in particular, functions which
11/// avoid ever breaking up Unicode extended grapheme clusters.)
12#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
13pub struct NickelString(String);
14
15// Conversions to & from String-like things.
16
17impl<S> From<S> for NickelString
18where
19 String: From<S>,
20{
21 fn from(inner: S) -> Self {
22 Self(inner.into())
23 }
24}
25
26impl From<&NickelString> for String {
27 fn from(s: &NickelString) -> Self {
28 s.clone().into_inner()
29 }
30}
31
32impl From<NickelString> for Ident {
33 fn from(s: NickelString) -> Self {
34 Ident::from(s.0)
35 }
36}
37
38impl From<NickelString> for LocIdent {
39 fn from(s: NickelString) -> Self {
40 LocIdent::from(s.0)
41 }
42}
43
44impl<'a> From<&'a NickelString> for LocIdent {
45 fn from(s: &'a NickelString) -> Self {
46 LocIdent::from(s.0.as_str())
47 }
48}
49
50// The below impls broadly allow `NclString`s to be treated just like
51// Rust `String`s.
52
53impl Deref for NickelString {
54 type Target = String;
55
56 fn deref(&self) -> &String {
57 &self.0
58 }
59}
60
61impl DerefMut for NickelString {
62 fn deref_mut(&mut self) -> &mut String {
63 &mut self.0
64 }
65}
66
67impl std::fmt::Display for NickelString {
68 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
69 write!(f, "{}", self.0)
70 }
71}
72
73impl AsRef<str> for NickelString {
74 fn as_ref(&self) -> &str {
75 self.0.as_str()
76 }
77}
78
79impl NickelString {
80 /// Creates a new empty `NclString`. Which is to say, it creates a new,
81 /// empty `String`.
82 pub fn new() -> NickelString {
83 String::new().into()
84 }
85
86 /// The number of Unicode extended grapheme clusters the string contains.
87 ///
88 /// This method has `O(self.len())` time complexity.
89 pub fn length(&self) -> usize {
90 self.grapheme_cluster_count()
91 }
92
93 /// Returns the number of Unicode extended grapheme clusters the string
94 /// contains.
95 ///
96 /// This method has `O(self.len())` time complexity.
97 #[inline]
98 fn grapheme_cluster_count(&self) -> usize {
99 self.graphemes(true).count()
100 }
101
102 /// Returns an [`Array`] of Nickel strings, each one
103 /// containing a single Unicode extended grapheme cluster.
104 ///
105 /// This method has `O(self.len())` time complexity.
106 pub fn characters(&self) -> Array {
107 self.grapheme_clusters()
108 }
109
110 /// Returns an [`Array`] of Nickel strings, each one
111 /// containing a single Unicode extended grapheme cluster.
112 ///
113 /// This method has `O(self.len())` time complexity.
114 #[inline]
115 fn grapheme_clusters(&self) -> Array {
116 self.graphemes(true)
117 .map(|g| Term::Str(g.into()).into())
118 .collect()
119 }
120
121 /// Splits the string on `separator`, returning an [`Array`] of Nickel
122 /// strings.
123 ///
124 /// This method has `O(self.len() * separator.len())` time complexity when
125 /// the separator is non-empty, or `O(self.len())` otherwise.
126 pub fn split(&self, separator: &str) -> Array {
127 if separator.is_empty() {
128 // If there's no separator, we split the string between
129 // each grapheme cluster.
130 self.grapheme_clusters()
131 } else {
132 use grapheme_cluster_preservation::SearchEvent;
133
134 let mut result = Vec::new();
135
136 // We build our result vec by searching through the string for the separator.
137 grapheme_cluster_preservation::search(self, separator).for_each(|e| match e {
138 SearchEvent::Match {
139 // If we hit a match, then everything we saw between the previous
140 // match and now is a split...
141 since_last_match: split,
142 } // ...then we do the same with whatever's left at the end.
143 | SearchEvent::LastNonMatch { non_match: split } => {
144 result.push(Term::Str(split.into()).into())
145 }
146 });
147
148 Array::from_iter(result)
149 }
150 }
151
152 /// Returns `true` if `needle` is contained in `self`, and `false` otherwise.
153 ///
154 /// Note that in contrast to Rust `String`'s `contains` method, this method
155 /// will not consider a Unicode codepoint to be contained in a string if
156 /// it exists as part of a larger extended grapheme cluster.
157 ///
158 /// The time complexity of this method is `O(self.len() * needle.len())`.
159 pub fn contains(&self, needle: &str) -> bool {
160 if needle.is_empty() {
161 true
162 } else {
163 use grapheme_cluster_preservation::SearchEvent;
164
165 grapheme_cluster_preservation::search(self, needle)
166 .any(|e| matches!(e, SearchEvent::Match { .. }))
167 }
168 }
169
170 /// Returns a new `NclString` replacing every occurence of `from` with `to`.
171 ///
172 /// This method has time complexity `O(self.len() * from.len())`.
173 pub fn replace(&self, from: &str, to: &str) -> NickelString {
174 let mut result = String::with_capacity(self.capacity());
175
176 if from.is_empty() {
177 // If `from` is empty then we:
178 // 1. insert `to` at the beginning.
179 result.push_str(to);
180 // 2. insert `to` after each cluster.
181 self.graphemes(true)
182 .flat_map(|grapheme| [grapheme, to])
183 .for_each(|s| result.push_str(s))
184 } else {
185 use grapheme_cluster_preservation::SearchEvent;
186
187 grapheme_cluster_preservation::search(self, from).for_each(|e| match e {
188 // If we hit a match...
189 SearchEvent::Match { since_last_match } => {
190 // ...we write everything we've seen since the last match...
191 result.push_str(since_last_match);
192 // ...and then we replace the matched `from` with `to`.
193 result.push_str(to);
194 }
195 // We also need to write anything remaining after the last match.
196 SearchEvent::LastNonMatch { non_match } => result.push_str(non_match),
197 });
198 }
199
200 result.into()
201 }
202
203 /// Returns the substring of `self` between `start` and `end`.
204 ///
205 /// Returns an error if:
206 /// - either start or end is not a fraction
207 /// - start < 0
208 /// - start > end
209 ///
210 /// The time complexity of this method is `O(self.len())`.
211 pub fn substring(&self, start: &Number, end: &Number) -> Result<NickelString, SubstringError> {
212 let Ok(start_usize) = usize::try_from(start) else {
213 return Err(SubstringError::NonIntStart(start.clone()));
214 };
215
216 let Ok(end_usize) = usize::try_from(end) else {
217 return Err(SubstringError::NonIntEnd(end.clone()));
218 };
219
220 let mut from_start = self.graphemes(true).skip(start_usize).peekable();
221
222 // If `from_start` is `None` then we skipped past the end of `self`.
223 if from_start.peek().is_none() {
224 Err(SubstringError::StartOutOfBounds {
225 start: start.clone(),
226 str_len: self.grapheme_cluster_count(),
227 })
228 } else if end_usize < start_usize {
229 Err(SubstringError::EndOutOfBounds {
230 start: start.clone(),
231 end: end.clone(),
232 str_len: self.grapheme_cluster_count(),
233 })
234 } else {
235 let wanted_substr_len = end_usize - start_usize;
236 let substr: String = from_start.take(wanted_substr_len).collect();
237 let substr: NickelString = substr.into();
238 // If `substr.grapheme_cluster_count()` is smaller than we wanted
239 // then end was larger than the length of the string.
240 if substr.grapheme_cluster_count() != wanted_substr_len {
241 Err(SubstringError::EndOutOfBounds {
242 start: start.clone(),
243 end: end.clone(),
244 str_len: self.grapheme_cluster_count(),
245 })
246 } else {
247 Ok(substr)
248 }
249 }
250 }
251
252 /// Returns `true` if `self` matches `regex` and `false` otherwise.
253 ///
254 /// Note that this function returns `false` if a match occurs in the middle
255 /// of a Unicode extended grapheme cluster.
256 ///
257 /// The time complexity of this method is `O(self.len())`.
258 pub fn matches_regex(&self, regex: &CompiledRegex) -> Term {
259 use grapheme_cluster_preservation::regex;
260
261 Term::Bool(regex::find_iter(self, regex).next().is_some())
262 }
263
264 /// Returns a new string in which every occurence of `regex` in `self` is
265 /// replaced by `replacement`.
266 ///
267 /// Note that this function will not replace matches that begin or end
268 /// in the middle of a Unicode extended grapheme cluster.
269 ///
270 /// The time complexity of this method is `O(self.len())`.
271 pub fn replace_regex(&self, regex: &CompiledRegex, replacement: &NickelString) -> NickelString {
272 use grapheme_cluster_preservation::regex;
273
274 let mut result = String::new();
275 let mut prev_match_end = 0;
276 for m in regex::find_iter(self, regex) {
277 // Push everything between the last match and this one
278 result.push_str(&self[prev_match_end..m.start()]);
279 // Push the replacement
280 result.push_str(replacement);
281 // Skip to the end of the match
282 prev_match_end = m.end();
283 }
284 // Push whatever remains between the end of the match & the end of the
285 // string.
286 result.push_str(&self[prev_match_end..]);
287
288 result.into()
289 }
290
291 /// Find the first match in `self` for a given `regex`, and return the
292 /// match itself, the index in `self` where it appears, and any capture
293 /// groups specified.
294 ///
295 /// Note that matches will be ignored if either the match itself or any
296 /// of its capture groups begin or end in the middle of a Unicode extended
297 /// grapheme cluster.
298 ///
299 /// The time complexity of this method is `O(self.len())`.
300 pub fn find_regex(&self, regex: &CompiledRegex) -> Option<RegexFindResult> {
301 self.find_all_regex(regex).next()
302 }
303
304 /// Find all matches in `self` for a given `regex`, returning an iterator
305 /// over the match itself, the index in `self` where it appears, and any
306 /// capture groups specified.
307 ///
308 /// Note that matches will be ignored if either the match itself or any of
309 /// its capture groups begin or end in the middle of a Unicode extended
310 /// grapheme cluster.
311 pub fn find_all_regex<'a>(
312 &'a self,
313 regex: &'a CompiledRegex,
314 ) -> impl Iterator<Item = RegexFindResult> + 'a {
315 use grapheme_cluster_preservation::regex;
316
317 regex::captures_iter(self, regex).map(|capt| {
318 // If we found a capture group, we extract the whole match...
319 let first_match = capt.get(0).unwrap();
320 // and then convert each group into a `NickelString`
321 let groups = capt
322 .iter()
323 .skip(1)
324 .map(|s_opt| s_opt.map(|s| s.as_str().into()))
325 .collect();
326
327 // The indices returned by the `regex` crate are byte offsets into
328 // the string, but we need to return the index into the Nickel string,
329 // i.e., the grapheme cluster index which starts at this byte offset.
330 let adjusted_index = self
331 .grapheme_indices(true)
332 .enumerate()
333 .find_map(|(grapheme_idx, (byte_offset, _))| {
334 if byte_offset == first_match.start() {
335 Some(grapheme_idx.into())
336 } else {
337 None
338 }
339 })
340 .expect("We already know that `first_match.start()` occurs on a cluster boundary.");
341
342 RegexFindResult {
343 matched: first_match.as_str().into(),
344 index: adjusted_index,
345 groups,
346 }
347 })
348 }
349
350 /// Consumes `self`, returning the Rust `String`.
351 pub fn into_inner(self) -> String {
352 self.0
353 }
354}
355
356impl Default for NickelString {
357 fn default() -> Self {
358 Self::new()
359 }
360}
361
362pub struct RegexFindResult {
363 pub matched: NickelString,
364 pub index: Number,
365 /// If a capture group didn't match, we store a `None`. This `None` placeholders
366 /// make the indexing predictable, so it's possible to associate captures with
367 /// parenthesis groupings in the original regex.
368 pub groups: Vec<Option<NickelString>>,
369}
370
371/// Errors returned by `NickelString`'s `substring` method.
372pub enum SubstringError {
373 /// The start index was not an int
374 NonIntStart(Number),
375 /// The end index was not an int
376 NonIntEnd(Number),
377 /// The start index was not within the bounds of the string
378 StartOutOfBounds { start: Number, str_len: usize },
379 EndOutOfBounds {
380 start: Number,
381 end: Number,
382 str_len: usize,
383 },
384}
385
386impl std::fmt::Display for SubstringError {
387 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
388 use SubstringError::*;
389
390 write!(f, "substring: ")?;
391
392 match self {
393 NonIntStart(start) => write!(
394 f,
395 "expected 2nd argument (start) to be a positive integer smaller than {}, \
396 got {start}",
397 usize::MAX
398 ),
399 NonIntEnd(end) => write!(
400 f,
401 "expected 3rd argument (end) to be a positive integer smaller than {}, got {end}",
402 usize::MAX
403 ),
404 StartOutOfBounds { start, str_len } => write!(
405 f,
406 "index out of bounds. \
407 Expected 2nd argument (start) to be between 0 and {str_len}, got {start}"
408 ),
409 EndOutOfBounds {
410 start,
411 end,
412 str_len,
413 } => write!(
414 f,
415 "index out of bounds. \
416 Expected 3rd argument (end) to be between {start} and {str_len}, got {end}"
417 ),
418 }
419 }
420}
421
422/// Types and functions designed to make it as easy as possible not to accidentally
423/// break up Unicode extended grapheme clusters in string operations.
424mod grapheme_cluster_preservation {
425 use unicode_segmentation::GraphemeCursor;
426
427 /// Returns a [`SearchIter`], which is an `Iterator` for iterating through
428 /// `haystack` in order to find `needle`.
429 ///
430 /// [`SearchIter`] produces [`SearchEvent`]s which can be used to
431 /// implement a variety of different behaviours.
432 pub fn search<'a>(haystack: &'a str, needle: &'a str) -> SearchIter<'a> {
433 SearchIter {
434 haystack,
435 needle,
436 cursor: GraphemeCursor::new(0, haystack.len(), true),
437 current_split_start: 0,
438 last_boundary: 0,
439 }
440 }
441
442 /// An event emitted by [`SearchIter`].
443 pub enum SearchEvent<'a> {
444 /// The `needle` was found within `haystack`.
445 /// Since the caller already has a reference to `needle`, we don't
446 /// include it in this event. However, we do include every part of
447 /// `haystack` which was traversed up until the match happened.
448 Match {
449 /// The slice of the string from the end of the last match
450 /// up to this one.
451 since_last_match: &'a str,
452 },
453 /// Iteration through `haystack` has finished, but there was
454 /// an unconsumed (non-matching) slice which the caller may wish
455 /// to do something with.
456 LastNonMatch {
457 /// Any non-matching text from the end of the string.
458 non_match: &'a str,
459 },
460 }
461
462 /// Our base algorithm for implementing a variety of grapheme cluster
463 /// preserving methods on [`NclString`]. As the name suggests, it is an
464 /// `Iterator`. However, rather than iterating through grapheme clusters
465 /// directly, its `Item` is a [`SearchEvent`].
466 pub struct SearchIter<'a> {
467 haystack: &'a str,
468 needle: &'a str,
469 cursor: GraphemeCursor,
470 current_split_start: usize,
471 last_boundary: usize,
472 }
473
474 impl<'a> Iterator for SearchIter<'a> {
475 type Item = SearchEvent<'a>;
476
477 fn next(&mut self) -> Option<Self::Item> {
478 use indoc::indoc;
479
480 let unwrap_explanation = indoc! {"
481 None of the GraphemeIncomplete errors are possible here:
482 - PreContext and PrevChunk only happen if chunk_start is nonzero.
483 - NextChunk only happens if the chunk is smaller than the cursor's len parameter
484 but we pass `haystack` and `hackstack.len()`` respectively.
485 - InvalidOffset can't happen because we check that `haystack` contains `needle`
486 in the range (last_boundary, last_boundary + needle.len())
487 "};
488
489 loop {
490 let nxt = self
491 .cursor
492 .next_boundary(self.haystack, 0)
493 .expect(unwrap_explanation);
494 match nxt {
495 Some(next_boundary) => {
496 // To check whether a match has occured, we'll first check whether
497 // the slice of `haystack` starting at the `last_boundary` begins
498 // with `needle`. If it does, we *also* need to check that this
499 // instance of `needle` doesn't end in the middle of a cluster.
500 // This is to avoid the situtaion where we have a string like
501 // `"a🧑🤝🧑"` and we get a match by searching for `"a🧑"`.
502 let does_match_intersect_cluster = || {
503 let mut tmp_cursor = self.cursor.clone();
504 tmp_cursor.set_cursor(self.last_boundary + self.needle.len());
505 !tmp_cursor
506 .is_boundary(self.haystack, 0)
507 .expect(unwrap_explanation)
508 };
509
510 if self.haystack[self.last_boundary..].starts_with(self.needle)
511 && !does_match_intersect_cluster()
512 {
513 // Yay, we found a match! We start by grabbing the slice of
514 // the string we traversed while looking for the match.
515 let since_last_match =
516 &self.haystack[self.current_split_start..self.last_boundary];
517 // Now we move the cursor to after the match, to avoid needlessly
518 // traversing that part of the string again.
519 // Note that in particular this means that if we seach for "aa" in
520 // a string like "aaaa" we will get two matches: the first pair,
521 // and then the second.
522 let match_end = self.last_boundary + self.needle.len();
523 self.cursor.set_cursor(match_end);
524 self.current_split_start = match_end;
525 self.last_boundary = match_end;
526 // Finally we return the part of the string we traversed
527 // to get to this match back to the caller.
528 break Some(SearchEvent::Match { since_last_match });
529 } else {
530 // This is the only codepath which doesn't break
531 // the loop. This is because we didn't match `needle`
532 // so we're just going to move on to the next
533 // cluster boundary.
534 self.last_boundary = next_boundary;
535 }
536 }
537 None => {
538 // We finished iterating through the grapheme clusters, but
539 // there could still be a final chunk of the string that we
540 // haven't told the caller about yet.
541 if self.current_split_start < self.haystack.len() {
542 // We pass that slice back here & advance the `current_split_start`
543 // pointer to the end of the string to ensure we don't hit this
544 // condition again & will correctly return None on the next call.
545 let non_match = &self.haystack[self.current_split_start..];
546 self.current_split_start = self.haystack.len();
547 break Some(SearchEvent::LastNonMatch { non_match });
548 } else {
549 break None;
550 }
551 }
552 }
553 }
554 }
555 }
556
557 /// Functions for working with regex without accidentally breaking up
558 /// Unicode extended grapheme clusters.
559 pub mod regex {
560 use regex::Regex;
561 use unicode_segmentation::GraphemeCursor;
562
563 /// An iterator which finds occurences of a given `Regex` pattern in
564 /// some source `str`, while filtering out any matches which either
565 /// begin or end in the middle of a Unicode extended grapheme cluster.
566 pub fn find_iter<'a>(
567 haystack: &'a str,
568 needle: &'a Regex,
569 ) -> impl Iterator<Item = regex::Match<'a>> {
570 needle
571 .find_iter(haystack)
572 .filter(|m| does_match_start_and_end_on_boundary(haystack, m))
573 }
574
575 /// Find all `needle` matches in `haystack`, filtering out any match
576 /// where either the match itself, or any of its capture groups, begin
577 /// or end in the middle of a Unicode extended grapheme cluster.
578 pub fn captures_iter<'a>(
579 haystack: &'a str,
580 needle: &'a Regex,
581 ) -> impl Iterator<Item = regex::Captures<'a>> {
582 needle.captures_iter(haystack).filter(|c| {
583 c.iter().all(|maybe_match| {
584 maybe_match.is_none_or(|m| does_match_start_and_end_on_boundary(haystack, &m))
585 })
586 })
587 }
588
589 fn does_match_start_and_end_on_boundary(haystack: &str, m: ®ex::Match<'_>) -> bool {
590 let mut cursor = GraphemeCursor::new(0, haystack.len(), true);
591 cursor.set_cursor(m.start());
592 let starts_on_boundary = cursor.is_boundary(haystack, 0).expect("bad start");
593 cursor.set_cursor(m.end());
594 let ends_on_boundary = cursor.is_boundary(haystack, 0).expect("bad end");
595
596 starts_on_boundary && ends_on_boundary
597 }
598 }
599}