1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#![no_std]

//! The [`reverse_grapheme_clusters_in_place`][0] function reverses a string slice in-place without
//! allocating any memory on the heap.  It correctly handles multi-byte UTF-8 sequences and
//! grapheme clusters, including combining marks and astral characters such as Emoji.
//!
//! ## Example
//!
//! ```rust
//! use unicode_reverse::reverse_grapheme_clusters_in_place;
//!
//! let mut x = "man\u{0303}ana".to_string();
//! println!("{}", x); // prints "mañana"
//!
//! reverse_grapheme_clusters_in_place(&mut x);
//! println!("{}", x); // prints "anañam"
//! ```
//!
//! ## Background
//!
//! As described in [this article by Mathias Bynens][1], naively reversing a Unicode string can go
//! wrong in several ways. For example, merely reversing the `chars` (Unicode Scalar Values) in a
//! string can cause combining marks to become attached to the wrong characters:
//!
//! ```rust
//! let x = "man\u{0303}ana";
//! println!("{}", x); // prints "mañana"
//!
//! let y: String = x.chars().rev().collect();
//! println!("{}", y); // prints "anãnam": Oops! The '~' is now applied to the 'a'.
//! ```
//!
//! Reversing the [grapheme clusters][2] of the string fixes this problem:
//!
//! ```rust
//! extern crate unicode_segmentation;
//! use unicode_segmentation::UnicodeSegmentation;
//!
//! # fn main() {
//! let x = "man\u{0303}ana";
//! let y: String = x.graphemes(true).rev().collect();
//! println!("{}", y); // prints "anañam"
//! # }
//! ```
//!
//! The `reverse_grapheme_clusters_in_place` function from this crate performs this same operation,
//! but performs the reversal in-place rather than allocating a new string.
//!
//! # Algorithm
//!
//! The implementation is very simple. It makes two passes over the string's contents:
//!
//! 1. For each grapheme cluster, reverse the bytes within the grapheme cluster in-place.
//! 2. Reverse the bytes of the entire string in-place.
//!
//! After the second pass, each grapheme cluster has been reversed twice, so its bytes are now back
//! in their original order, but the clusters are now in the opposite order within the string.
//!
//! # no_std
//!
//! This crate does not depend on libstd, so it can be used in [`no_std` projects][3].
//!
//! [0]: fn.reverse_grapheme_clusters_in_place.html
//! [1]: https://mathiasbynens.be/notes/javascript-unicode
//! [2]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
//! [3]: https://doc.rust-lang.org/book/no-stdlib.html

extern crate unicode_segmentation;

use core::slice;
use core::str;
use unicode_segmentation::UnicodeSegmentation;

/// Reverse a Unicode string in-place without allocating.
///
/// This function reverses a string slice in-place without allocating any memory on the heap.  It
/// correctly handles multi-byte UTF-8 sequences and grapheme clusters, including combining marks
/// and astral characters such as Emoji.
///
/// See the [crate-level documentation](index.html) for more details.
///
/// ## Example
///
/// ```rust
/// extern crate unicode_reverse;
/// use unicode_reverse::reverse_grapheme_clusters_in_place;
///
/// fn main() {
///     let mut x = "man\u{0303}ana".to_string();
///     println!("{}", x); // prints "mañana"
///
///     reverse_grapheme_clusters_in_place(&mut x);
///     println!("{}", x); // prints "anañam"
/// }
/// ```
pub fn reverse_grapheme_clusters_in_place(s: &mut str) {
    // Part 1: Reverse the bytes within each grapheme cluster.
    // This does not preserve UTF-8 validity. We must guarantee this `reverse` is
    // undone before the data is accessed as `str` again.
    {
        let mut tail = &mut s[..];
        loop {
            // Advance to the next grapheme cluster:
            let len = match tail.graphemes(true).next() {
                Some(grapheme) => grapheme.len(),
                None => break
            };
            let (head, new_tail) = {tail}.split_at_mut(len);
            tail = new_tail;

            // Reverse the bytes within this grapheme cluster.
            let bytes = unsafe {
                let head = head;
                // This is safe because `head` is &mut str so guaranteed not to be aliased.
                slice::from_raw_parts_mut(head.as_ptr() as *mut u8, head.len())
            };
            bytes.reverse();
        }
    }

    // Part 2: Reverse all the bytes.
    // This un-reverses all of the reversals from Part 1.
    let bytes = unsafe {
        let s = s;
        // This is safe because `s` is &mut str so guaranteed not to be aliased.
        slice::from_raw_parts_mut(s.as_ptr() as *mut u8, s.len())
    };
    bytes.reverse();

    // Each UTF-8 sequence is now in the right order.
    debug_assert!(str::from_utf8(bytes).is_ok());
}

#[cfg(test)]
mod tests {
    use super::reverse_grapheme_clusters_in_place;

    extern crate std;
    use self::std::string::ToString;

    fn test_rev(a: &str, b: &str) {
        let mut a = a.to_string();
        reverse_grapheme_clusters_in_place(&mut a);
        assert_eq!(a, b);
    }

    #[test]
    fn test_ascii() {
        test_rev("Hello", "olleH");
    }

    #[test]
    fn test_utf8() {
        test_rev("¡Hola!", "!aloH¡");
    }

    #[test]
    fn test_emoji() {
        test_rev("\u{1F36D}\u{1F36E}", "\u{1F36E}\u{1F36D}");
    }

    #[test]
    fn test_combining_mark() {
        test_rev("man\u{0303}ana", "anan\u{0303}am");
    }
}