nickel_lang_core/term/string.rs
1use std::ops::{Deref, DerefMut};
2
3use serde::{Deserialize, Serialize};
4use unicode_segmentation::UnicodeSegmentation;
5
6use super::{CompiledRegex, Number};
7use crate::{
8 eval::value::{Array, NickelValue},
9 identifier::{Ident, LocIdent},
10};
11
12/// A Nickel string is really just a Rust `String`, overlayed with some
13/// methods implementing custom logic (in particular, functions which
14/// avoid ever breaking up Unicode extended grapheme clusters.)
15#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
16pub struct NickelString(String);
17
18// Conversions to & from String-like things.
19
20impl<S> From<S> for NickelString
21where
22 String: From<S>,
23{
24 fn from(inner: S) -> Self {
25 Self(inner.into())
26 }
27}
28
29impl From<&NickelString> for String {
30 fn from(s: &NickelString) -> Self {
31 s.clone().into_inner()
32 }
33}
34
35impl From<NickelString> for Ident {
36 fn from(s: NickelString) -> Self {
37 Ident::from(s.0)
38 }
39}
40
41impl From<NickelString> for LocIdent {
42 fn from(s: NickelString) -> Self {
43 LocIdent::from(s.0)
44 }
45}
46
47impl<'a> From<&'a NickelString> for LocIdent {
48 fn from(s: &'a NickelString) -> Self {
49 LocIdent::from(s.0.as_str())
50 }
51}
52
53// The below impls broadly allow `NickelString`s to be treated just like Rust `String`s.
54
55impl Deref for NickelString {
56 type Target = String;
57
58 fn deref(&self) -> &String {
59 &self.0
60 }
61}
62
63impl DerefMut for NickelString {
64 fn deref_mut(&mut self) -> &mut String {
65 &mut self.0
66 }
67}
68
69impl std::fmt::Display for NickelString {
70 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71 write!(f, "{}", self.0)
72 }
73}
74
75impl AsRef<str> for NickelString {
76 fn as_ref(&self) -> &str {
77 self.0.as_str()
78 }
79}
80
81impl NickelString {
82 /// Creates a new empty `NclString`. Which is to say, it creates a new,
83 /// empty `String`.
84 pub fn new() -> NickelString {
85 String::new().into()
86 }
87
88 /// The number of Unicode extended grapheme clusters the string contains.
89 ///
90 /// This method has `O(self.len())` time complexity.
91 pub fn length(&self) -> usize {
92 self.grapheme_cluster_count()
93 }
94
95 /// Returns the number of Unicode extended grapheme clusters the string
96 /// contains.
97 ///
98 /// This method has `O(self.len())` time complexity.
99 #[inline]
100 fn grapheme_cluster_count(&self) -> usize {
101 self.graphemes(true).count()
102 }
103
104 /// Returns an [`Array`] of Nickel strings, each one
105 /// containing a single Unicode extended grapheme cluster.
106 ///
107 /// This method has `O(self.len())` time complexity.
108 pub fn characters(&self) -> Array {
109 self.grapheme_clusters()
110 }
111
112 /// Returns an [`Array`] of Nickel strings, each one
113 /// containing a single Unicode extended grapheme cluster.
114 ///
115 /// This method has `O(self.len())` time complexity.
116 #[inline]
117 fn grapheme_clusters(&self) -> Array {
118 self.graphemes(true)
119 .map(NickelValue::string_posless)
120 .collect()
121 }
122
123 /// Splits the string on `separator`, returning an [`Array`] of Nickel
124 /// strings.
125 ///
126 /// This method has `O(self.len() * separator.len())` time complexity when
127 /// the separator is non-empty, or `O(self.len())` otherwise.
128 pub fn split(&self, separator: &str) -> Array {
129 if separator.is_empty() {
130 // If there's no separator, we split the string between
131 // each grapheme cluster.
132 self.grapheme_clusters()
133 } else {
134 use grapheme_cluster_preservation::SearchEvent;
135
136 let mut result = Vec::new();
137
138 // We build our result vec by searching through the string for the separator.
139 grapheme_cluster_preservation::search(self, separator).for_each(|e| match e {
140 SearchEvent::Match {
141 // If we hit a match, then everything we saw between the previous
142 // match and now is a split...
143 since_last_match: split,
144 } // ...then we do the same with whatever's left at the end.
145 | SearchEvent::LastNonMatch { non_match: split } => {
146 result.push(NickelValue::string_posless(split))
147 }
148 });
149
150 Array::from_iter(result)
151 }
152 }
153
154 /// Returns `true` if `needle` is contained in `self`, and `false` otherwise.
155 ///
156 /// Note that in contrast to Rust `String`'s `contains` method, this method
157 /// will not consider a Unicode codepoint to be contained in a string if
158 /// it exists as part of a larger extended grapheme cluster.
159 ///
160 /// The time complexity of this method is `O(self.len() * needle.len())`.
161 pub fn contains(&self, needle: &str) -> bool {
162 if needle.is_empty() {
163 true
164 } else {
165 use grapheme_cluster_preservation::SearchEvent;
166
167 grapheme_cluster_preservation::search(self, needle)
168 .any(|e| matches!(e, SearchEvent::Match { .. }))
169 }
170 }
171
172 /// Returns a new `NclString` replacing every occurence of `from` with `to`.
173 ///
174 /// This method has time complexity `O(self.len() * from.len())`.
175 pub fn replace(&self, from: &str, to: &str) -> NickelString {
176 let mut result = String::with_capacity(self.capacity());
177
178 if from.is_empty() {
179 // If `from` is empty then we:
180 // 1. insert `to` at the beginning.
181 result.push_str(to);
182 // 2. insert `to` after each cluster.
183 self.graphemes(true)
184 .flat_map(|grapheme| [grapheme, to])
185 .for_each(|s| result.push_str(s))
186 } else {
187 use grapheme_cluster_preservation::SearchEvent;
188
189 grapheme_cluster_preservation::search(self, from).for_each(|e| match e {
190 // If we hit a match...
191 SearchEvent::Match { since_last_match } => {
192 // ...we write everything we've seen since the last match...
193 result.push_str(since_last_match);
194 // ...and then we replace the matched `from` with `to`.
195 result.push_str(to);
196 }
197 // We also need to write anything remaining after the last match.
198 SearchEvent::LastNonMatch { non_match } => result.push_str(non_match),
199 });
200 }
201
202 result.into()
203 }
204
205 /// Returns the substring of `self` between `start` and `end`.
206 ///
207 /// Returns an error if:
208 /// - either start or end is not a fraction
209 /// - start < 0
210 /// - start > end
211 ///
212 /// The time complexity of this method is `O(self.len())`.
213 pub fn substring(&self, start: &Number, end: &Number) -> Result<NickelString, SubstringError> {
214 let Ok(start_usize) = usize::try_from(start) else {
215 return Err(SubstringError::NonIntStart(start.clone()));
216 };
217
218 let Ok(end_usize) = usize::try_from(end) else {
219 return Err(SubstringError::NonIntEnd(end.clone()));
220 };
221
222 let mut from_start = self.graphemes(true).skip(start_usize).peekable();
223
224 // If `from_start` is `None` then we skipped past the end of `self`.
225 if from_start.peek().is_none() {
226 Err(SubstringError::StartOutOfBounds {
227 start: start.clone(),
228 str_len: self.grapheme_cluster_count(),
229 })
230 } else if end_usize < start_usize {
231 Err(SubstringError::EndOutOfBounds {
232 start: start.clone(),
233 end: end.clone(),
234 str_len: self.grapheme_cluster_count(),
235 })
236 } else {
237 let wanted_substr_len = end_usize - start_usize;
238 let substr: String = from_start.take(wanted_substr_len).collect();
239 let substr: NickelString = substr.into();
240 // If `substr.grapheme_cluster_count()` is smaller than we wanted
241 // then end was larger than the length of the string.
242 if substr.grapheme_cluster_count() != wanted_substr_len {
243 Err(SubstringError::EndOutOfBounds {
244 start: start.clone(),
245 end: end.clone(),
246 str_len: self.grapheme_cluster_count(),
247 })
248 } else {
249 Ok(substr)
250 }
251 }
252 }
253
254 /// Returns `true` if `self` matches `regex` and `false` otherwise.
255 ///
256 /// Note that this function returns `false` if a match occurs in the middle
257 /// of a Unicode extended grapheme cluster.
258 ///
259 /// The time complexity of this method is `O(self.len())`.
260 pub fn matches_regex(&self, regex: &CompiledRegex) -> NickelValue {
261 use grapheme_cluster_preservation::regex;
262
263 NickelValue::bool_value_posless(regex::find_iter(self, regex).next().is_some())
264 }
265
266 /// Returns a new string in which every occurence of `regex` in `self` is
267 /// replaced by `replacement`.
268 ///
269 /// Note that this function will not replace matches that begin or end
270 /// in the middle of a Unicode extended grapheme cluster.
271 ///
272 /// The time complexity of this method is `O(self.len())`.
273 pub fn replace_regex(&self, regex: &CompiledRegex, replacement: &NickelString) -> NickelString {
274 use grapheme_cluster_preservation::regex;
275
276 let mut result = String::new();
277 let mut prev_match_end = 0;
278 for m in regex::find_iter(self, regex) {
279 // Push everything between the last match and this one
280 result.push_str(&self[prev_match_end..m.start()]);
281 // Push the replacement
282 result.push_str(replacement);
283 // Skip to the end of the match
284 prev_match_end = m.end();
285 }
286 // Push whatever remains between the end of the match & the end of the
287 // string.
288 result.push_str(&self[prev_match_end..]);
289
290 result.into()
291 }
292
293 /// Find the first match in `self` for a given `regex`, and return the
294 /// match itself, the index in `self` where it appears, and any capture
295 /// groups specified.
296 ///
297 /// Note that matches will be ignored if either the match itself or any
298 /// of its capture groups begin or end in the middle of a Unicode extended
299 /// grapheme cluster.
300 ///
301 /// The time complexity of this method is `O(self.len())`.
302 pub fn find_regex(&self, regex: &CompiledRegex) -> Option<RegexFindResult> {
303 self.find_all_regex(regex).next()
304 }
305
306 /// Find all matches in `self` for a given `regex`, returning an iterator
307 /// over the match itself, the index in `self` where it appears, and any
308 /// capture groups specified.
309 ///
310 /// Note that matches will be ignored if either the match itself or any of
311 /// its capture groups begin or end in the middle of a Unicode extended
312 /// grapheme cluster.
313 pub fn find_all_regex<'a>(
314 &'a self,
315 regex: &'a CompiledRegex,
316 ) -> impl Iterator<Item = RegexFindResult> + 'a {
317 use grapheme_cluster_preservation::regex;
318
319 regex::captures_iter(self, regex).map(|capt| {
320 // If we found a capture group, we extract the whole match...
321 let first_match = capt.get(0).unwrap();
322 // and then convert each group into a `NickelString`
323 let groups = capt
324 .iter()
325 .skip(1)
326 .map(|s_opt| s_opt.map(|s| s.as_str().into()))
327 .collect();
328
329 // The indices returned by the `regex` crate are byte offsets into
330 // the string, but we need to return the index into the Nickel string,
331 // i.e., the grapheme cluster index which starts at this byte offset.
332 let adjusted_index = self
333 .grapheme_indices(true)
334 .enumerate()
335 .find_map(|(grapheme_idx, (byte_offset, _))| {
336 if byte_offset == first_match.start() {
337 Some(grapheme_idx.into())
338 } else {
339 None
340 }
341 })
342 .expect("We already know that `first_match.start()` occurs on a cluster boundary.");
343
344 RegexFindResult {
345 matched: first_match.as_str().into(),
346 index: adjusted_index,
347 groups,
348 }
349 })
350 }
351
352 /// Consumes `self`, returning the Rust `String`.
353 pub fn into_inner(self) -> String {
354 self.0
355 }
356}
357
358impl Default for NickelString {
359 fn default() -> Self {
360 Self::new()
361 }
362}
363
364pub struct RegexFindResult {
365 pub matched: NickelString,
366 pub index: Number,
367 /// If a capture group didn't match, we store a `None`. This `None` placeholders
368 /// make the indexing predictable, so it's possible to associate captures with
369 /// parenthesis groupings in the original regex.
370 pub groups: Vec<Option<NickelString>>,
371}
372
373/// Errors returned by `NickelString`'s `substring` method.
374pub enum SubstringError {
375 /// The start index was not an int
376 NonIntStart(Number),
377 /// The end index was not an int
378 NonIntEnd(Number),
379 /// The start index was not within the bounds of the string
380 StartOutOfBounds { start: Number, str_len: usize },
381 EndOutOfBounds {
382 start: Number,
383 end: Number,
384 str_len: usize,
385 },
386}
387
388impl std::fmt::Display for SubstringError {
389 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
390 use SubstringError::*;
391
392 write!(f, "substring: ")?;
393
394 match self {
395 NonIntStart(start) => write!(
396 f,
397 "expected 2nd argument (start) to be a positive integer smaller than {}, \
398 got {start}",
399 usize::MAX
400 ),
401 NonIntEnd(end) => write!(
402 f,
403 "expected 3rd argument (end) to be a positive integer smaller than {}, got {end}",
404 usize::MAX
405 ),
406 StartOutOfBounds { start, str_len } => write!(
407 f,
408 "index out of bounds. \
409 Expected 2nd argument (start) to be between 0 and {str_len}, got {start}"
410 ),
411 EndOutOfBounds {
412 start,
413 end,
414 str_len,
415 } => write!(
416 f,
417 "index out of bounds. \
418 Expected 3rd argument (end) to be between {start} and {str_len}, got {end}"
419 ),
420 }
421 }
422}
423
424/// Types and functions designed to make it as easy as possible not to accidentally
425/// break up Unicode extended grapheme clusters in string operations.
426mod grapheme_cluster_preservation {
427 use unicode_segmentation::GraphemeCursor;
428
429 /// Returns a [`SearchIter`], which is an `Iterator` for iterating through
430 /// `haystack` in order to find `needle`.
431 ///
432 /// [`SearchIter`] produces [`SearchEvent`]s which can be used to
433 /// implement a variety of different behaviours.
434 pub fn search<'a>(haystack: &'a str, needle: &'a str) -> SearchIter<'a> {
435 SearchIter {
436 haystack,
437 needle,
438 cursor: GraphemeCursor::new(0, haystack.len(), true),
439 current_split_start: 0,
440 last_boundary: 0,
441 }
442 }
443
444 /// An event emitted by [`SearchIter`].
445 pub enum SearchEvent<'a> {
446 /// The `needle` was found within `haystack`.
447 /// Since the caller already has a reference to `needle`, we don't
448 /// include it in this event. However, we do include every part of
449 /// `haystack` which was traversed up until the match happened.
450 Match {
451 /// The slice of the string from the end of the last match
452 /// up to this one.
453 since_last_match: &'a str,
454 },
455 /// Iteration through `haystack` has finished, but there was
456 /// an unconsumed (non-matching) slice which the caller may wish
457 /// to do something with.
458 LastNonMatch {
459 /// Any non-matching text from the end of the string.
460 non_match: &'a str,
461 },
462 }
463
464 /// Our base algorithm for implementing a variety of grapheme cluster
465 /// preserving methods on [`NclString`]. As the name suggests, it is an
466 /// `Iterator`. However, rather than iterating through grapheme clusters
467 /// directly, its `Item` is a [`SearchEvent`].
468 pub struct SearchIter<'a> {
469 haystack: &'a str,
470 needle: &'a str,
471 cursor: GraphemeCursor,
472 current_split_start: usize,
473 last_boundary: usize,
474 }
475
476 impl<'a> Iterator for SearchIter<'a> {
477 type Item = SearchEvent<'a>;
478
479 fn next(&mut self) -> Option<Self::Item> {
480 use indoc::indoc;
481
482 let unwrap_explanation = indoc! {"
483 None of the GraphemeIncomplete errors are possible here:
484 - PreContext and PrevChunk only happen if chunk_start is nonzero.
485 - NextChunk only happens if the chunk is smaller than the cursor's len parameter
486 but we pass `haystack` and `hackstack.len()`` respectively.
487 - InvalidOffset can't happen because we check that `haystack` contains `needle`
488 in the range (last_boundary, last_boundary + needle.len())
489 "};
490
491 loop {
492 let nxt = self
493 .cursor
494 .next_boundary(self.haystack, 0)
495 .expect(unwrap_explanation);
496 match nxt {
497 Some(next_boundary) => {
498 // To check whether a match has occured, we'll first check whether
499 // the slice of `haystack` starting at the `last_boundary` begins
500 // with `needle`. If it does, we *also* need to check that this
501 // instance of `needle` doesn't end in the middle of a cluster.
502 // This is to avoid the situtaion where we have a string like
503 // `"a🧑🤝🧑"` and we get a match by searching for `"a🧑"`.
504 let does_match_intersect_cluster = || {
505 let mut tmp_cursor = self.cursor.clone();
506 tmp_cursor.set_cursor(self.last_boundary + self.needle.len());
507 !tmp_cursor
508 .is_boundary(self.haystack, 0)
509 .expect(unwrap_explanation)
510 };
511
512 if self.haystack[self.last_boundary..].starts_with(self.needle)
513 && !does_match_intersect_cluster()
514 {
515 // Yay, we found a match! We start by grabbing the slice of
516 // the string we traversed while looking for the match.
517 let since_last_match =
518 &self.haystack[self.current_split_start..self.last_boundary];
519 // Now we move the cursor to after the match, to avoid needlessly
520 // traversing that part of the string again.
521 // Note that in particular this means that if we seach for "aa" in
522 // a string like "aaaa" we will get two matches: the first pair,
523 // and then the second.
524 let match_end = self.last_boundary + self.needle.len();
525 self.cursor.set_cursor(match_end);
526 self.current_split_start = match_end;
527 self.last_boundary = match_end;
528 // Finally we return the part of the string we traversed
529 // to get to this match back to the caller.
530 break Some(SearchEvent::Match { since_last_match });
531 } else {
532 // This is the only codepath which doesn't break
533 // the loop. This is because we didn't match `needle`
534 // so we're just going to move on to the next
535 // cluster boundary.
536 self.last_boundary = next_boundary;
537 }
538 }
539 None => {
540 // We finished iterating through the grapheme clusters, but
541 // there could still be a final chunk of the string that we
542 // haven't told the caller about yet.
543 if self.current_split_start < self.haystack.len() {
544 // We pass that slice back here & advance the `current_split_start`
545 // pointer to the end of the string to ensure we don't hit this
546 // condition again & will correctly return None on the next call.
547 let non_match = &self.haystack[self.current_split_start..];
548 self.current_split_start = self.haystack.len();
549 break Some(SearchEvent::LastNonMatch { non_match });
550 } else {
551 break None;
552 }
553 }
554 }
555 }
556 }
557 }
558
559 /// Functions for working with regex without accidentally breaking up
560 /// Unicode extended grapheme clusters.
561 pub mod regex {
562 use regex::Regex;
563 use unicode_segmentation::GraphemeCursor;
564
565 /// An iterator which finds occurences of a given `Regex` pattern in
566 /// some source `str`, while filtering out any matches which either
567 /// begin or end in the middle of a Unicode extended grapheme cluster.
568 pub fn find_iter<'a>(
569 haystack: &'a str,
570 needle: &'a Regex,
571 ) -> impl Iterator<Item = regex::Match<'a>> {
572 needle
573 .find_iter(haystack)
574 .filter(|m| does_match_start_and_end_on_boundary(haystack, m))
575 }
576
577 /// Find all `needle` matches in `haystack`, filtering out any match
578 /// where either the match itself, or any of its capture groups, begin
579 /// or end in the middle of a Unicode extended grapheme cluster.
580 pub fn captures_iter<'a>(
581 haystack: &'a str,
582 needle: &'a Regex,
583 ) -> impl Iterator<Item = regex::Captures<'a>> {
584 needle.captures_iter(haystack).filter(|c| {
585 c.iter().all(|maybe_match| {
586 maybe_match.is_none_or(|m| does_match_start_and_end_on_boundary(haystack, &m))
587 })
588 })
589 }
590
591 fn does_match_start_and_end_on_boundary(haystack: &str, m: ®ex::Match<'_>) -> bool {
592 let mut cursor = GraphemeCursor::new(0, haystack.len(), true);
593 cursor.set_cursor(m.start());
594 let starts_on_boundary = cursor.is_boundary(haystack, 0).expect("bad start");
595 cursor.set_cursor(m.end());
596 let ends_on_boundary = cursor.is_boundary(haystack, 0).expect("bad end");
597
598 starts_on_boundary && ends_on_boundary
599 }
600 }
601}