safe_string/
lib.rs

1#![cfg_attr(not(feature = "std"), no_std)]
2#![allow(clippy::should_implement_trait)]
3#![deny(missing_docs)]
4//! This crate provides replacement types for [`String`] and [`&str`](`str`) that allow for safe
5//! indexing by character to avoid panics and the usual pitfalls of working with multi-byte
6//! UTF-8 characters, namely the scenario where the _byte length_ of a string and the
7//! _character length_ of that same string are not the same.
8//!
9//! Specifically, [`IndexedString`] (replaces [`String`]) and [`IndexedSlice`] (replaces
10//! [`&str`](`str`)) allow for O(1) slicing and indexing by character, and they will never panic
11//! when indexing or slicing.
12//!
13//! This is accomplished by storing the character offsets of each character in the string,
14//! along with the original [`String`], and using this information to calculate the byte
15//! offsets of each character on the fly. Thus [`IndexedString`] uses ~2x the memory of a
16//! normal [`String`], but [`IndexedSlice`] and other types implementing [`IndexedStr`] have
17//! only one [`usize`] extra in overhead over that of a regular [`&str`](`str`) slice / fat
18//! pointer. In theory this could be reduced down to the same size as a fat pointer using
19//! unsafe rust, but this way we get to have completely safe code and the difference is
20//! negligible.
21//! # Examples
22//!
23//! ```
24//! use safe_string::{IndexedString, IndexedStr, IndexedSlice};
25//!
//! let message = IndexedString::from("Hello, 世界! 👋😊");
//! assert_eq!(message.as_str(), "Hello, 世界! 👋😊");
//! assert_eq!(message, "Hello, 世界! 👋😊"); // handy PartialEq impls
//!
//! // Access characters by index
//! assert_eq!(message.char_at(7), Some('世'));
//! assert_eq!(message.char_at(100), None); // Out of bounds access returns None
//!
//! // Slice the IndexedString
//! let slice = message.slice(7..9);
//! assert_eq!(slice.as_str(), "世界");
//!
//! // Convert slice back to IndexedString
//! let sliced_message = slice.to_indexed_string();
//! assert_eq!(sliced_message.as_str(), "世界");
//!
//! // Nested slicing
//! let slice = message.slice(0..10);
//! let nested_slice = slice.slice(3..6);
//! assert_eq!(nested_slice.as_str(), "lo,");
//!
//! // Display byte length and character length
//! assert_eq!(IndexedString::from_str("世界").byte_len(), 6); // "世界" is 6 bytes in UTF-8
//! assert_eq!(IndexedString::from_str("世界").len(), 2); // "世界" has 2 characters
50//!
51//! // Demonstrate clamped slicing (no panic)
52//! let clamped_slice = message.slice(20..30);
53//! assert_eq!(clamped_slice.as_str(), "");
54//!
55//! // Using `as_str` to interface with standard Rust string handling
56//! let slice = message.slice(0..5);
57//! let standard_str_slice = slice.as_str();
58//! assert_eq!(standard_str_slice, "Hello");
59//! ```
60
61#[cfg(not(feature = "std"))]
62extern crate alloc;
63
64#[cfg(not(feature = "std"))]
65use alloc::string::{String, ToString};
66#[cfg(not(feature = "std"))]
67use alloc::vec::Vec;
68use core::fmt::{Debug, Display};
69use core::ops::{Bound, RangeBounds};
70use core::str::FromStr;
71#[cfg(feature = "std")]
72use std::string::{String, ToString};
73#[cfg(feature = "std")]
74use std::vec::Vec;
75
76/// A trait that facilitates safe interaction with strings that contain multi-byte characters.
77///
78/// [`IndexedString`] replaces [`String`], whereas [`IndexedSlice`] replaces [`&str`](`str`).
79///
80/// Both of these types as well as anything that implements [`IndexedStr`] are characterized by
81/// the fact that their `len()` and indexing methods operate on characters, not bytes, and
82/// enough information is stored to allow for O(1) slicing and indexing on a character _and_
83/// byte basis as needed, but the default interface is character-centric.
84///
85/// This all comes at the cost of increased memory usage and some performance overhead when a
86/// [`IndexedString`] is created, but the overhead is minimal when using [`IndexedSlice`] or
87/// any other type implementing [`IndexedStr`].
88///
89/// It is also worth noting that all of these types will never panic when indexing or slicing,
90/// unlike [`&str`](`str`) and [`String`], and clamped bounds are used instead.
91pub trait IndexedStr:
92    Display + Debug + PartialEq<IndexedString> + for<'a> PartialEq<IndexedSlice<'a>>
93{
94    /// Returns a [`&str`](`str`) representation of this [`IndexedStr`].
95    ///
96    /// WARNING: Once you cast to a [`&str`](`str`), you are leaving the safety provided by
97    /// [`IndexedStr`]. Only use this method when you need to interface with code that requires
98    /// a [`&str`](`str`).
99    fn as_str(&self) -> &str;
100
101    /// Returns a [`IndexedSlice`] that represents the entire contents of this [`IndexedStr`].
102    fn as_slice(&self) -> IndexedSlice<'_>;
103
104    /// Returns the length of this [`IndexedStr`] in characters, NOT bytes.
105    fn len(&self) -> usize;
106
107    /// Returns the byte length of this [`IndexedStr`]. This will be longer than the
108    /// [`len`](`IndexedStr::len`) if the string contains multi-byte characters.
109    fn byte_len(&self) -> usize;
110
111    /// Returns `true` if this [`IndexedStr`] is empty (of length 0).
112    fn is_empty(&self) -> bool {
113        self.len() == 0
114    }
115
116    /// Returns the character at the given index, if it exists.
117    fn char_at(&self, index: usize) -> Option<char>;
118
119    /// Returns a sub-slice of this [`IndexedStr`] based on the given range in terms of the
120    /// _characters_ in the string, not bytes.
121    ///
122    /// The range is automatically clamped to the bounds of the [`IndexedStr`].
123    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice<'_>;
124
125    /// Returns a slice containing all characters of this [`IndexedStr`] in order.
126    fn chars(&self) -> &[char];
127
128    /// Converts this [`IndexedStr`] into an owned, dynamically allocated [`IndexedString`].
129    fn to_indexed_string(&self) -> IndexedString;
130
131    /// Returns a new [`IndexedStr`] that is the lowercase version of this [`IndexedStr`].
132    fn to_lowercase(&self) -> IndexedString {
133        self.as_str().to_lowercase().into()
134    }
135
136    /// Returns a new [`IndexedStr`] that is the uppercase version of this [`IndexedStr`].
137    fn to_uppercase(&self) -> IndexedString {
138        self.as_str().to_uppercase().into()
139    }
140
141    /// Returns `true` if this [`IndexedStr`] starts with the given string.
142    fn starts_with<S: AsRef<str>>(&self, s: S) -> bool {
143        self.as_str().starts_with(s.as_ref())
144    }
145
146    /// Returns `true` if this [`IndexedStr`] ends with the given string.
147    fn ends_with<S: AsRef<str>>(&self, s: S) -> bool {
148        self.as_str().ends_with(s.as_ref())
149    }
150
151    /// Parses this [`IndexedStr`] into a value of type `F` using the [`FromStr`] trait.
152    fn parse<F>(&self) -> Result<F, <F as FromStr>::Err>
153    where
154        F: FromStr,
155    {
156        self.as_str().parse()
157    }
158
159    /// Returns an iterator over the lines of this [`IndexedStr`].
160    fn lines(&self) -> IndexedLines<'_>;
161}
162
163/// A [`String`] replacement that allows for safe indexing and slicing of multi-byte characters.
164///
165/// This is the owned counterpart to [`IndexedSlice`].
166#[derive(Clone, Debug, Eq, Hash)]
167pub struct IndexedString {
168    chars: Vec<char>,
169    offsets: Vec<usize>,
170    string: String,
171}
172
173impl IndexedStr for IndexedString {
174    fn as_str(&self) -> &str {
175        &self.string
176    }
177
178    fn char_at(&self, index: usize) -> Option<char> {
179        self.chars.get(index).copied()
180    }
181
182    fn chars(&self) -> &[char] {
183        &self.chars[..]
184    }
185
186    fn len(&self) -> usize {
187        self.chars.len()
188    }
189
190    fn byte_len(&self) -> usize {
191        self.string.len()
192    }
193
194    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice<'_> {
195        let start = match range.start_bound() {
196            Bound::Included(&start) => start,
197            Bound::Excluded(&start) => start + 1,
198            Bound::Unbounded => 0,
199        };
200        let end = match range.end_bound() {
201            Bound::Included(&end) => end + 1,
202            Bound::Excluded(&end) => end,
203            Bound::Unbounded => self.chars.len(),
204        };
205        let start = if start > self.chars.len() {
206            self.chars.len()
207        } else {
208            start
209        };
210        let end = if end > self.chars.len() {
211            self.chars.len()
212        } else {
213            end
214        };
215
216        IndexedSlice {
217            source: self,
218            start,
219            end,
220        }
221    }
222
223    fn to_indexed_string(&self) -> IndexedString {
224        self.clone()
225    }
226
227    fn as_slice(&self) -> IndexedSlice<'_> {
228        IndexedSlice {
229            source: self,
230            start: 0,
231            end: self.chars.len(),
232        }
233    }
234
235    fn lines(&self) -> IndexedLines<'_> {
236        IndexedLines {
237            source: self,
238            start: 0,
239        }
240    }
241}
242
243impl IndexedString {
244    /// Creates a new [`IndexedString`] from a `&str` or anything that implements [`Display`].
245    pub fn from_str(s: impl Display) -> Self {
246        IndexedString::from_string(s.to_string())
247    }
248
249    /// Creates a new [`IndexedString`] from a [`String`], avoiding the need to clone the
250    /// string by taking ownership of it.
251    pub fn from_string(s: String) -> Self {
252        IndexedString {
253            chars: s.chars().collect(),
254            offsets: s.char_indices().map(|(i, _)| i).collect(),
255            string: s,
256        }
257    }
258
259    /// Creates a new [`IndexedString`] from an iterator of [`char`]s.
260    pub fn from_chars(chars: impl Iterator<Item = char>) -> Self {
261        let chars: Vec<char> = chars.collect();
262        let offsets: Vec<usize> = chars
263            .iter()
264            .scan(0, |acc, &c| {
265                let offset = *acc;
266                *acc += c.len_utf8();
267                Some(offset)
268            })
269            .collect();
270        let string: String = chars.iter().collect();
271        IndexedString {
272            chars,
273            offsets,
274            string,
275        }
276    }
277}
278
279impl AsRef<str> for IndexedString {
280    fn as_ref(&self) -> &str {
281        &self.string
282    }
283}
284
285impl Display for IndexedString {
286    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
287        write!(f, "{}", self.string)
288    }
289}
290
291impl<S: AsRef<str>> PartialEq<S> for IndexedString {
292    fn eq(&self, other: &S) -> bool {
293        self.string == other.as_ref()
294    }
295}
296
297/// A [`&str`](`str`) replacement that allows for safe indexing and slicing of multi-byte characters.
298///
299/// This is the borrowed counterpart to [`IndexedString`].
300#[derive(Eq, Debug, Clone)]
301pub struct IndexedSlice<'a> {
302    source: &'a IndexedString,
303    start: usize,
304    end: usize,
305}
306
307impl IndexedStr for IndexedSlice<'_> {
308    fn as_str(&self) -> &str {
309        if self.start >= self.source.offsets.len()
310            || self.end > self.source.offsets.len()
311            || self.start > self.end
312        {
313            return "";
314        }
315
316        let start_byte = self.source.offsets[self.start];
317        let end_byte = if self.end == self.source.offsets.len() {
318            self.source.string.len()
319        } else {
320            self.source.offsets[self.end]
321        };
322
323        &self.source.string[start_byte..end_byte]
324    }
325
326    fn len(&self) -> usize {
327        self.end - self.start
328    }
329
330    fn byte_len(&self) -> usize {
331        self.source.offsets[self.end] - self.source.offsets[self.start]
332    }
333
334    fn char_at(&self, index: usize) -> Option<char> {
335        self.source.char_at(self.start + index)
336    }
337
338    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice<'_> {
339        let start = match range.start_bound() {
340            Bound::Included(&start) => start,
341            Bound::Excluded(&start) => start + 1,
342            Bound::Unbounded => 0,
343        };
344        let end = match range.end_bound() {
345            Bound::Included(&end) => end + 1,
346            Bound::Excluded(&end) => end,
347            Bound::Unbounded => self.len(),
348        };
349        let start = if start > self.len() {
350            self.len()
351        } else {
352            start
353        };
354        let end = if end > self.len() { self.len() } else { end };
355
356        IndexedSlice {
357            source: self.source,
358            start: self.start + start,
359            end: self.start + end,
360        }
361    }
362
363    fn chars(&self) -> &[char] {
364        &self.source.chars[self.start..self.end]
365    }
366
367    fn to_indexed_string(&self) -> IndexedString {
368        IndexedString::from_chars(self.chars().iter().copied())
369    }
370
371    fn as_slice(&self) -> IndexedSlice<'_> {
372        self.clone()
373    }
374
375    fn lines(&self) -> IndexedLines<'_> {
376        IndexedLines {
377            source: self.source,
378            start: self.start,
379        }
380    }
381}
382
383impl<S: AsRef<str>> PartialEq<S> for IndexedSlice<'_> {
384    fn eq(&self, other: &S) -> bool {
385        self.as_str() == other.as_ref()
386    }
387}
388
389impl AsRef<str> for IndexedSlice<'_> {
390    fn as_ref(&self) -> &str {
391        self.as_str()
392    }
393}
394
395impl<'a> From<&'a IndexedString> for IndexedSlice<'a> {
396    fn from(s: &'a IndexedString) -> Self {
397        IndexedSlice {
398            source: s,
399            start: 0,
400            end: s.chars.len(),
401        }
402    }
403}
404
405impl<'a> From<IndexedSlice<'a>> for IndexedString {
406    fn from(s: IndexedSlice<'a>) -> Self {
407        s.to_indexed_string()
408    }
409}
410
411impl From<String> for IndexedString {
412    fn from(s: String) -> Self {
413        IndexedString::from_string(s)
414    }
415}
416
417impl From<&str> for IndexedString {
418    fn from(s: &str) -> Self {
419        IndexedString::from_str(s)
420    }
421}
422
423impl From<&String> for IndexedString {
424    fn from(s: &String) -> Self {
425        IndexedString::from_string(s.clone())
426    }
427}
428
429impl Display for IndexedSlice<'_> {
430    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
431        write!(f, "{}", self.as_str())
432    }
433}
434
435impl IndexedStr for &IndexedString {
436    fn as_str(&self) -> &str {
437        (*self).as_str()
438    }
439
440    fn as_slice(&self) -> IndexedSlice<'_> {
441        (*self).as_slice()
442    }
443
444    fn len(&self) -> usize {
445        (*self).len()
446    }
447
448    fn byte_len(&self) -> usize {
449        (*self).byte_len()
450    }
451
452    fn char_at(&self, index: usize) -> Option<char> {
453        (*self).char_at(index)
454    }
455
456    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice<'_> {
457        (*self).slice(range)
458    }
459
460    fn chars(&self) -> &[char] {
461        (*self).chars()
462    }
463
464    fn to_indexed_string(&self) -> IndexedString {
465        (*self).to_indexed_string()
466    }
467
468    fn lines(&self) -> IndexedLines<'_> {
469        (*self).lines()
470    }
471}
472
473impl PartialEq<IndexedString> for &IndexedString {
474    fn eq(&self, other: &IndexedString) -> bool {
475        self.as_str() == other.as_str()
476    }
477}
478
479impl PartialEq<IndexedSlice<'_>> for &IndexedString {
480    fn eq(&self, other: &IndexedSlice) -> bool {
481        self.as_str() == other.as_str()
482    }
483}
484
485impl IndexedStr for &IndexedSlice<'_> {
486    fn as_str(&self) -> &str {
487        (*self).as_str()
488    }
489
490    fn as_slice(&self) -> IndexedSlice<'_> {
491        (*self).as_slice()
492    }
493
494    fn len(&self) -> usize {
495        (*self).len()
496    }
497
498    fn byte_len(&self) -> usize {
499        (*self).byte_len()
500    }
501
502    fn char_at(&self, index: usize) -> Option<char> {
503        (*self).char_at(index)
504    }
505
506    fn slice<R: RangeBounds<usize>>(&self, range: R) -> IndexedSlice<'_> {
507        (*self).slice(range)
508    }
509
510    fn chars(&self) -> &[char] {
511        (*self).chars()
512    }
513
514    fn to_indexed_string(&self) -> IndexedString {
515        (*self).to_indexed_string()
516    }
517
518    fn lines(&self) -> IndexedLines<'_> {
519        (*self).lines()
520    }
521}
522
523impl PartialEq<IndexedString> for &IndexedSlice<'_> {
524    fn eq(&self, other: &IndexedString) -> bool {
525        self.as_str() == other.as_str()
526    }
527}
528
529impl PartialEq<IndexedSlice<'_>> for &IndexedSlice<'_> {
530    fn eq(&self, other: &IndexedSlice) -> bool {
531        self.as_str() == other.as_str()
532    }
533}
534
535impl PartialEq<IndexedSlice<'_>> for &str {
536    fn eq(&self, other: &IndexedSlice) -> bool {
537        other.as_str() == *self
538    }
539}
540
541impl PartialEq<IndexedSlice<'_>> for String {
542    fn eq(&self, other: &IndexedSlice) -> bool {
543        other.as_str() == *self
544    }
545}
546
547impl PartialEq<IndexedString> for &str {
548    fn eq(&self, other: &IndexedString) -> bool {
549        other.as_str() == *self
550    }
551}
552
553impl PartialEq<IndexedString> for String {
554    fn eq(&self, other: &IndexedString) -> bool {
555        other.as_str() == *self
556    }
557}
558
559/// An iterator over the lines of an [`IndexedStr`].
560pub struct IndexedLines<'a> {
561    source: &'a IndexedString,
562    start: usize,
563}
564
565impl<'a> Iterator for IndexedLines<'a> {
566    type Item = IndexedSlice<'a>;
567
568    fn next(&mut self) -> Option<Self::Item> {
569        if self.start > self.source.chars.len() {
570            return None;
571        }
572
573        if self.start == self.source.chars.len() {
574            self.start += 1; // Mark as finished
575            return Some(self.source.slice(self.start - 1..self.start - 1));
576        }
577
578        let mut end = self.start;
579        while end < self.source.chars.len() {
580            if self.source.chars[end] == '\n' {
581                let line = self.source.slice(self.start..end);
582                self.start = end + 1; // Skip the newline character
583                return Some(line);
584            }
585            end += 1;
586        }
587
588        if self.start <= self.source.chars.len() {
589            let line = self.source.slice(self.start..self.source.chars.len());
590            self.start = self.source.chars.len() + 1; // Mark as finished
591            return Some(line);
592        }
593
594        None
595    }
596}