1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
//! Substring method for string types.
//!
//! This crate provides a [`substring()`] method on Rust string types. The method takes a start and
//! end character index and returns a string slice of the characters within that range.
//!
//! The method is provided via the [`Substring`] trait which is implemented on the [`str`]
//! primitive.
//!
//! # Example
//! ```
//! use substring::Substring;
//!
//! // Works on a string slice.
//! assert_eq!("foobar".substring(2,5), "oba");
//!
//! // Also works on a String.
//! assert_eq!("foobar".to_string().substring(1,6), "oobar");
//! ```
//!
//! As Rust strings are UTF-8 encoded, the algorithm for finding a character substring is `O(n)`,
//! where `n` is the byte length of the string. This is due to characters not being of predictible
//! byte lengths.
//!
//! # Note
//! The indexing of substrings is based on [*Unicode Scalar Value*]. As such, substrings may not
//! always match your intuition:
//!
//! ```
//! use substring::Substring;
//!
//! assert_eq!("ã".substring(0, 1), "a");  // As opposed to "ã".
//! assert_eq!("ã".substring(1, 2), "\u{0303}")
//! ```
//!
//! The above example occurs because "ã" is technically made up of two UTF-8 scalar values.
//!
//! [`str`]: https://doc.rust-lang.org/std/primitive.str.html
//! [`Substring`]: trait.Substring.html
//! [`substring()`]: trait.Substring.html#tymethod.substring
//!
//! [*Unicode Scalar Value*]: http://www.unicode.org/glossary/#unicode_scalar_value

#![deny(missing_docs)]
// Since the MSRV is 1.0.0, allowing usage of deprecated items is ok, as the replacements are likely
// not available in early versions.
#![allow(deprecated)]
#![cfg_attr(rustc_1_6, no_std)]

#[cfg(not(rustc_1_6))]
extern crate std as core;

/// Provides a [`substring()`] method.
///
/// The [`substring()`] method obtains a string slice of characters within the range specified by
/// `start_index` and `end_index`.
///
/// [`substring()`]: trait.Substring.html#tymethod.substring
pub trait Substring {
    /// Obtains a string slice containing the characters within the range specified by
    /// `start_index` and `end_index`.
    ///
    /// The range specified is a character range, not a byte range.
    fn substring(&self, start_index: usize, end_index: usize) -> &str;
}

/// Implements a [`substring()`] method for [`str`].
///
/// Note that structs which implement `Deref<Target=str>` (such as [`String`]) will also have
/// access to this implementation.
///
/// [`str`]: https://doc.rust-lang.org/std/primitive.str.html
/// [`String`]: https://doc.rust-lang.org/std/string/struct.String.html
/// [`substring()`]: trait.Substring.html#method.substring
impl Substring for str {
    /// Obtain a slice of the characters within the range of `start_index` and `end_index`.
    ///
    /// As this is by character index, rather than byte index, the temporal complexity of finding a
    /// substring is *O(n)*, where *n* is the byte length of the string.
    ///
    /// Example:
    /// ```
    /// use substring::Substring;
    ///
    /// assert_eq!("foobar".substring(2,5), "oba");
    /// ```
    #[must_use]
    fn substring(&self, start_index: usize, end_index: usize) -> &str {
        if end_index <= start_index {
            return "";
        }

        let mut indices = self.char_indices();

        let obtain_index = |(index, _char)| index;
        let str_len = self.len();

        unsafe {
            // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
            // that the indices obtained from it will always be within the bounds of `self` and they
            // will always lie on UTF-8 sequence boundaries.
            self.slice_unchecked(
                indices.nth(start_index).map_or(str_len, &obtain_index),
                indices
                    .nth(end_index - start_index - 1)
                    .map_or(str_len, &obtain_index),
            )
        }
    }
}

#[cfg(test)]
mod tests {
    use Substring;

    #[test]
    fn test_substring() {
        assert_eq!("foobar".substring(0, 3), "foo");
    }

    #[test]
    fn test_out_of_bounds() {
        assert_eq!("foobar".substring(0, 10), "foobar");
        assert_eq!("foobar".substring(6, 10), "");
    }

    #[test]
    fn test_start_less_than_end() {
        assert_eq!("foobar".substring(3, 2), "");
    }

    #[test]
    fn test_start_and_end_equal() {
        assert_eq!("foobar".substring(3, 3), "");
    }

    #[test]
    fn test_multiple_byte_characters() {
        assert_eq!("fõøbα®".substring(2, 5), "øbα");
    }
}