safe_string/
lib.rs

1//! This crate provides replacement types for [`String`] and [`&str`](`str`) that allow for safe
2//! indexing by character to avoid panics and the usual pitfalls of working with multi-byte
3//! UTF-8 characters, namely the scenario where the _byte length_ of a string and the
4//! _character length_ of that same string are not the same.
5//!
6//! Specifically, [`IndexedString`] (replaces [`String`]) and [`IndexedSlice`] (replaces
7//! [`&str`](`str`)) allow for O(1) slicing and indexing by character, and they will never panic
8//! when indexing or slicing.
9//!
10//! This is accomplished by storing the character offsets of each character in the string,
11//! along with the original [`String`], and using this information to calculate the byte
12//! offsets of each character on the fly. Thus [`IndexedString`] uses ~2x the memory of a
13//! normal [`String`], but [`IndexedSlice`] and other types implementing [`IndexedStr`] have
14//! only one [`usize`] extra in overhead over that of a regular [`&str`](`str`) slice / fat
15//! pointer. In theory this could be reduced down to the same size as a fat pointer using
16//! unsafe rust, but this way we get to have completely safe code and the difference is
17//! negligible.
18//! # Examples
19//!
20//! ```
21//! use safe_string::{IndexedString, IndexedStr, IndexedSlice};
22//!
23//! let message = IndexedString::from("Hello, 世界! 👋😊");
24//! assert_eq!(message.as_str(), "Hello, 世界! 👋😊");
25//! assert_eq!(message, "Hello, 世界! 👋😊"); // handy PartialEq impls
26//!
27//! // Access characters by index
28//! assert_eq!(message.char_at(7), Some('世'));
29//! assert_eq!(message.char_at(100), None); // Out of bounds access returns None
30//!
31//! // Slice the IndexedString
32//! let slice = message.slice(7..9);
33//! assert_eq!(slice.as_str(), "世界");
34//!
35//! // Convert slice back to IndexedString
36//! let sliced_message = slice.to_indexed_string();
37//! assert_eq!(sliced_message.as_str(), "世界");
38//!
39//! // Nested slicing
40//! let slice = message.slice(0..10);
41//! let nested_slice = slice.slice(3..6);
42//! assert_eq!(nested_slice.as_str(), "lo,");
43//!
44//! // Display byte length and character length
45//! assert_eq!(IndexedString::from_str("世界").byte_len(), 6); // "世界" is 6 bytes in UTF-8
46//! assert_eq!(IndexedString::from_str("世界").len(), 2); // "世界" has 2 characters
47//!
48//! // Demonstrate clamped slicing (no panic)
49//! let clamped_slice = message.slice(20..30);
50//! assert_eq!(clamped_slice.as_str(), "");
51//!
52//! // Using `as_str` to interface with standard Rust string handling
53//! let slice = message.slice(0..5);
54//! let standard_str_slice = slice.as_str();
55//! assert_eq!(standard_str_slice, "Hello");
56//! ```
57
58#![deny(missing_docs)]
59
60use core::fmt::{Debug, Display};
61use core::ops::{Bound, RangeBounds};
62use core::str::FromStr;
63
64/// A trait that facilitates safe interaction with strings that contain multi-byte characters.
65///
66/// [`IndexedString`] replaces [`String`], whereas [`IndexedSlice`] replaces [`&str`](`str`).
67///
68/// Both of these types as well as anything that implements [`IndexedStr`] are characterized by
69/// the fact that their `len()` and indexing methods operate on characters, not bytes, and
70/// enough information is stored to allow for O(1) slicing and indexing on a character _and_
71/// byte basis as needed, but the default interface is character-centric.
72///
73/// This all comes at the cost of increased memory usage and some performance overhead when a
74/// [`IndexedString`] is created, but the overhead is minimal when using [`IndexedSlice`] or
75/// any other type implementing [`IndexedStr`].
76///
77/// It is also worth noting that all of these types will never panic when indexing or slicing,
78/// unlike [`&str`](`str`) and [`String`], and clamped bounds are used instead.
79pub trait IndexedStr:
80    Display + Debug + PartialEq<IndexedString> + for<'a> PartialEq<IndexedSlice<'a>>
81{
82    /// Returns a [`&str`](`str`) representation of this [`IndexedStr`].
83    ///
84    /// WARNING: Once you cast to a [`&str`](`str`), you are leaving the safety provided by
85    /// [`IndexedStr`]. Only use this method when you need to interface with code that requires
86    /// a [`&str`](`str`).
87    fn as_str(&self) -> &str;
88
89    /// Returns a [`IndexedSlice`] that represents the entire contents of this [`IndexedStr`].
90    fn as_slice(&self) -> IndexedSlice;
91
92    /// Returns the length of this [`IndexedStr`] in characters, NOT bytes.
93    fn len(&self) -> usize;
94
95    /// Returns the byte length of this [`IndexedStr`]. This will be longer than the
96    /// [`len`](`IndexedStr::len`) if the string contains multi-byte characters.
97    fn byte_len(&self) -> usize;
98
99    /// Returns `true` if this [`IndexedStr`] is empty (of length 0).
100    fn is_empty(&self) -> bool {
101        self.len() == 0
102    }
103
104    /// Returns the character at the given index, if it exists.
105    fn char_at(&self, index: usize) -> Option<char>;
106
107    /// Returns a sub-slice of this [`IndexedStr`] based on the given range in terms of the
108    /// _characters_ in the string, not bytes.
109    ///
110    /// The range is automatically clamped to the bounds of the [`IndexedStr`].
111    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice;
112
113    /// Returns a slice containing all characters of this [`IndexedStr`] in order.
114    fn chars(&self) -> &[char];
115
116    /// Converts this [`IndexedStr`] into an owned, dynamically allocated [`IndexedString`].
117    fn to_indexed_string(&self) -> IndexedString;
118
119    /// Returns a new [`IndexedStr`] that is the lowercase version of this [`IndexedStr`].
120    fn to_lowercase(&self) -> IndexedString {
121        self.as_str().to_lowercase().into()
122    }
123
124    /// Returns a new [`IndexedStr`] that is the uppercase version of this [`IndexedStr`].
125    fn to_uppercase(&self) -> IndexedString {
126        self.as_str().to_uppercase().into()
127    }
128
129    /// Returns `true` if this [`IndexedStr`] starts with the given string.
130    fn starts_with<S: AsRef<str>>(&self, s: S) -> bool {
131        self.as_str().starts_with(s.as_ref())
132    }
133
134    /// Returns `true` if this [`IndexedStr`] ends with the given string.
135    fn ends_with<S: AsRef<str>>(&self, s: S) -> bool {
136        self.as_str().ends_with(s.as_ref())
137    }
138
139    /// Parses this [`IndexedStr`] into a value of type `F` using the [`FromStr`] trait.
140    fn parse<F>(&self) -> Result<F, <F as FromStr>::Err>
141    where
142        F: FromStr,
143    {
144        self.as_str().parse()
145    }
146
147    /// Returns an iterator over the lines of this [`IndexedStr`].
148    fn lines(&self) -> IndexedLines;
149}
150
151/// A [`String`] replacement that allows for safe indexing and slicing of multi-byte characters.
152///
153/// This is the owned counterpart to [`IndexedSlice`].
154#[derive(Clone, Debug, Eq, Hash)]
155pub struct IndexedString {
156    chars: Vec<char>,
157    offsets: Vec<usize>,
158    string: String,
159}
160
161impl IndexedStr for IndexedString {
162    fn as_str(&self) -> &str {
163        &self.string
164    }
165
166    fn char_at(&self, index: usize) -> Option<char> {
167        self.chars.get(index).copied()
168    }
169
170    fn chars(&self) -> &[char] {
171        &self.chars[..]
172    }
173
174    fn len(&self) -> usize {
175        self.chars.len()
176    }
177
178    fn byte_len(&self) -> usize {
179        self.string.len()
180    }
181
182    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice {
183        let start = match range.start_bound() {
184            Bound::Included(&start) => start,
185            Bound::Excluded(&start) => start + 1,
186            Bound::Unbounded => 0,
187        };
188        let end = match range.end_bound() {
189            Bound::Included(&end) => end + 1,
190            Bound::Excluded(&end) => end,
191            Bound::Unbounded => self.chars.len(),
192        };
193        let start = if start > self.chars.len() {
194            self.chars.len()
195        } else {
196            start
197        };
198        let end = if end > self.chars.len() {
199            self.chars.len()
200        } else {
201            end
202        };
203
204        IndexedSlice {
205            source: self,
206            start,
207            end,
208        }
209    }
210
211    fn to_indexed_string(&self) -> IndexedString {
212        self.clone()
213    }
214
215    fn as_slice(&self) -> IndexedSlice {
216        IndexedSlice {
217            source: self,
218            start: 0,
219            end: self.chars.len(),
220        }
221    }
222
223    fn lines(&self) -> IndexedLines {
224        IndexedLines {
225            source: self,
226            start: 0,
227        }
228    }
229}
230
231impl IndexedString {
232    /// Creates a new [`IndexedString`] from a `&str` or anything that implements [`Display`].
233    pub fn from_str(s: impl Display) -> Self {
234        IndexedString::from_string(s.to_string())
235    }
236
237    /// Creates a new [`IndexedString`] from a [`String`], avoiding the need to clone the
238    /// string by taking ownership of it.
239    pub fn from_string(s: String) -> Self {
240        IndexedString {
241            chars: s.chars().collect(),
242            offsets: s.char_indices().map(|(i, _)| i).collect(),
243            string: s,
244        }
245    }
246
247    /// Creates a new [`IndexedString`] from an iterator of [`char`]s.
248    pub fn from_chars(chars: impl Iterator<Item = char>) -> Self {
249        let chars: Vec<char> = chars.collect();
250        let offsets: Vec<usize> = chars
251            .iter()
252            .scan(0, |acc, &c| {
253                let offset = *acc;
254                *acc += c.len_utf8();
255                Some(offset)
256            })
257            .collect();
258        let string: String = chars.iter().collect();
259        IndexedString {
260            chars,
261            offsets,
262            string,
263        }
264    }
265}
266
267impl AsRef<str> for IndexedString {
268    fn as_ref(&self) -> &str {
269        &self.string
270    }
271}
272
273impl Display for IndexedString {
274    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
275        write!(f, "{}", self.string)
276    }
277}
278
279impl<S: AsRef<str>> PartialEq<S> for IndexedString {
280    fn eq(&self, other: &S) -> bool {
281        self.string == other.as_ref()
282    }
283}
284
285/// A [`&str`](`str`) replacement that allows for safe indexing and slicing of multi-byte characters.
286///
287/// This is the borrowed counterpart to [`IndexedString`].
288#[derive(Eq, Debug, Clone)]
289pub struct IndexedSlice<'a> {
290    source: &'a IndexedString,
291    start: usize,
292    end: usize,
293}
294
295impl IndexedStr for IndexedSlice<'_> {
296    fn as_str(&self) -> &str {
297        if self.start >= self.source.offsets.len()
298            || self.end > self.source.offsets.len()
299            || self.start > self.end
300        {
301            return "";
302        }
303
304        let start_byte = self.source.offsets[self.start];
305        let end_byte = if self.end == self.source.offsets.len() {
306            self.source.string.len()
307        } else {
308            self.source.offsets[self.end]
309        };
310
311        &self.source.string[start_byte..end_byte]
312    }
313
314    fn len(&self) -> usize {
315        self.end - self.start
316    }
317
318    fn byte_len(&self) -> usize {
319        self.source.offsets[self.end] - self.source.offsets[self.start]
320    }
321
322    fn char_at(&self, index: usize) -> Option<char> {
323        self.source.char_at(self.start + index)
324    }
325
326    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice {
327        let start = match range.start_bound() {
328            Bound::Included(&start) => start,
329            Bound::Excluded(&start) => start + 1,
330            Bound::Unbounded => 0,
331        };
332        let end = match range.end_bound() {
333            Bound::Included(&end) => end + 1,
334            Bound::Excluded(&end) => end,
335            Bound::Unbounded => self.len(),
336        };
337        let start = if start > self.len() {
338            self.len()
339        } else {
340            start
341        };
342        let end = if end > self.len() { self.len() } else { end };
343
344        IndexedSlice {
345            source: self.source,
346            start: self.start + start,
347            end: self.start + end,
348        }
349    }
350
351    fn chars(&self) -> &[char] {
352        &self.source.chars[self.start..self.end]
353    }
354
355    fn to_indexed_string(&self) -> IndexedString {
356        IndexedString::from_chars(self.chars().iter().copied())
357    }
358
359    fn as_slice(&self) -> IndexedSlice {
360        self.clone()
361    }
362
363    fn lines(&self) -> IndexedLines {
364        IndexedLines {
365            source: self.source,
366            start: self.start,
367        }
368    }
369}
370
371impl<S: AsRef<str>> PartialEq<S> for IndexedSlice<'_> {
372    fn eq(&self, other: &S) -> bool {
373        self.as_str() == other.as_ref()
374    }
375}
376
377impl AsRef<str> for IndexedSlice<'_> {
378    fn as_ref(&self) -> &str {
379        self.as_str()
380    }
381}
382
383impl<'a> From<&'a IndexedString> for IndexedSlice<'a> {
384    fn from(s: &'a IndexedString) -> Self {
385        IndexedSlice {
386            source: s,
387            start: 0,
388            end: s.chars.len(),
389        }
390    }
391}
392
393impl<'a> From<IndexedSlice<'a>> for IndexedString {
394    fn from(s: IndexedSlice<'a>) -> Self {
395        s.to_indexed_string()
396    }
397}
398
399impl From<String> for IndexedString {
400    fn from(s: String) -> Self {
401        IndexedString::from_string(s)
402    }
403}
404
405impl From<&str> for IndexedString {
406    fn from(s: &str) -> Self {
407        IndexedString::from_str(s)
408    }
409}
410
411impl From<&String> for IndexedString {
412    fn from(s: &String) -> Self {
413        IndexedString::from_string(s.clone())
414    }
415}
416
417impl Display for IndexedSlice<'_> {
418    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
419        write!(f, "{}", self.as_str())
420    }
421}
422
423impl IndexedStr for &IndexedString {
424    fn as_str(&self) -> &str {
425        (*self).as_str()
426    }
427
428    fn as_slice(&self) -> IndexedSlice {
429        (*self).as_slice()
430    }
431
432    fn len(&self) -> usize {
433        (*self).len()
434    }
435
436    fn byte_len(&self) -> usize {
437        (*self).byte_len()
438    }
439
440    fn char_at(&self, index: usize) -> Option<char> {
441        (*self).char_at(index)
442    }
443
444    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice {
445        (*self).slice(range)
446    }
447
448    fn chars(&self) -> &[char] {
449        (*self).chars()
450    }
451
452    fn to_indexed_string(&self) -> IndexedString {
453        (*self).to_indexed_string()
454    }
455
456    fn lines(&self) -> IndexedLines {
457        (*self).lines()
458    }
459}
460
461impl PartialEq<IndexedString> for &IndexedString {
462    fn eq(&self, other: &IndexedString) -> bool {
463        self.as_str() == other.as_str()
464    }
465}
466
467impl PartialEq<IndexedSlice<'_>> for &IndexedString {
468    fn eq(&self, other: &IndexedSlice) -> bool {
469        self.as_str() == other.as_str()
470    }
471}
472
473impl IndexedStr for &IndexedSlice<'_> {
474    fn as_str(&self) -> &str {
475        (*self).as_str()
476    }
477
478    fn as_slice(&self) -> IndexedSlice {
479        (*self).as_slice()
480    }
481
482    fn len(&self) -> usize {
483        (*self).len()
484    }
485
486    fn byte_len(&self) -> usize {
487        (*self).byte_len()
488    }
489
490    fn char_at(&self, index: usize) -> Option<char> {
491        (*self).char_at(index)
492    }
493
494    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice {
495        (*self).slice(range)
496    }
497
498    fn chars(&self) -> &[char] {
499        (*self).chars()
500    }
501
502    fn to_indexed_string(&self) -> IndexedString {
503        (*self).to_indexed_string()
504    }
505
506    fn lines(&self) -> IndexedLines {
507        (*self).lines()
508    }
509}
510
511impl PartialEq<IndexedString> for &IndexedSlice<'_> {
512    fn eq(&self, other: &IndexedString) -> bool {
513        self.as_str() == other.as_str()
514    }
515}
516
517impl PartialEq<IndexedSlice<'_>> for &IndexedSlice<'_> {
518    fn eq(&self, other: &IndexedSlice) -> bool {
519        self.as_str() == other.as_str()
520    }
521}
522
523impl PartialEq<IndexedSlice<'_>> for &str {
524    fn eq(&self, other: &IndexedSlice) -> bool {
525        other.as_str() == *self
526    }
527}
528
529impl PartialEq<IndexedSlice<'_>> for String {
530    fn eq(&self, other: &IndexedSlice) -> bool {
531        other.as_str() == *self
532    }
533}
534
535impl PartialEq<IndexedString> for &str {
536    fn eq(&self, other: &IndexedString) -> bool {
537        other.as_str() == *self
538    }
539}
540
541impl PartialEq<IndexedString> for String {
542    fn eq(&self, other: &IndexedString) -> bool {
543        other.as_str() == *self
544    }
545}
546
547/// An iterator over the lines of an [`IndexedStr`].
548pub struct IndexedLines<'a> {
549    source: &'a IndexedString,
550    start: usize,
551}
552
553impl<'a> Iterator for IndexedLines<'a> {
554    type Item = IndexedSlice<'a>;
555
556    fn next(&mut self) -> Option<Self::Item> {
557        if self.start > self.source.chars.len() {
558            return None;
559        }
560
561        if self.start == self.source.chars.len() {
562            self.start += 1; // Mark as finished
563            return Some(self.source.slice(self.start - 1..self.start - 1));
564        }
565
566        let mut end = self.start;
567        while end < self.source.chars.len() {
568            if self.source.chars[end] == '\n' {
569                let line = self.source.slice(self.start..end);
570                self.start = end + 1; // Skip the newline character
571                return Some(line);
572            }
573            end += 1;
574        }
575
576        if self.start <= self.source.chars.len() {
577            let line = self.source.slice(self.start..self.source.chars.len());
578            self.start = self.source.chars.len() + 1; // Mark as finished
579            return Some(line);
580        }
581
582        None
583    }
584}