char_circle/lib.rs
1//! A circular buffer for strings and traits for in-place string transforms.
2//!
3//! This crate provides two key types: the [`CharCircle`] struct and the
4//! [`StringTransform`] trait. The `CharCircle` is a circular buffer
5//! specialized for UTF-8 strings, and the `StringTransform` trait builds upon
6//! it to provide a character-oriented API for in-place string transformations.
7//! In short, `StringTransform` allows you to implement transformations as
8//! iterator adaptors, with copy-on-write optimizations in mind.
9//!
10//! The `CharCircle` uses internal mutability. This enables its contents to be
11//! consumed by an external iterator, [`Chars`]. As a consequence, the
12//! `CharCircle` is not `Sync`. It is implemented as a `RefCell` around the
13//! [`RawCircle`], which has a nearly identical API, uses external mutability,
14//! is thread-safe, and does not provide a consuming iterator.
15//!
16//! The `StringTransform` trait is implemented by factories of iterator
17//! adaptors. For simple cases, the [`SimpleTransform`] trait provides an
18//! alternative that is implemented directly by the adaptor.
19//!
20//!
21//! Example: To Uppercase
22//! ---------------------------------------------------------------------------
23//!
24//! Transforms which don't require configuration are most easily implemented
25//! with `SimpleTransform`.
26//!
27//! Here we implement an uppercase transform:
28//!
29//! ```
30//! use char_circle::{SimpleTransform, Chars};
31//!
32//! // Step 1: Define the transform as an iterator adaptor.
33//! struct ToUpper<I>(I);
34//!
35//! impl<I> Iterator for ToUpper<I> where I: Iterator<Item=char> {
36//! type Item = char;
37//! fn next(&mut self) -> Option<char> {
38//! self.0.next().map(|ch| ch.to_ascii_uppercase())
39//! }
40//! }
41//!
42//! // Step 2: Define a constructor for the adaptor with `SimpleTransform`.
43//! impl<'a> SimpleTransform<'a> for ToUpper<Chars<'a>> {
44//! fn transform_chars(chars: Chars<'a>) -> Self {
45//! ToUpper(chars)
46//! }
47//! }
48//!
49//! // Step 3: Profit!
50//! let s = "can you hear me in the back?";
51//! let s = ToUpper::transform(s);
52//! assert_eq!(&s, "CAN YOU HEAR ME IN THE BACK?");
53//! ```
54//!
55//!
56//! Example: Caesar Cipher
57//! ---------------------------------------------------------------------------
58//!
59//! Transforms that need to be configured should define a factory which
60//! implements `StringTransform`.
61//!
62//! Here we implement a Caesar cipher configured with its key:
63//!
64//! ```
65//! use char_circle::{StringTransform, Chars};
66//!
67//! // Step 1: Define the transform as an iterator adaptor.
68//! struct CaesarCipherIter<I> {
69//! inner: I,
70//! key: i32,
71//! }
72//!
73//! impl<I> Iterator for CaesarCipherIter<I> where I: Iterator<Item=char> {
74//! type Item = char;
75//! fn next(&mut self) -> Option<char> {
76//! let plaintext = self.inner.next()?;
77//! let ciphertext = plaintext as i32 + self.key;
78//! let ciphertext = std::char::from_u32(ciphertext as u32).unwrap();
79//! Some(ciphertext)
80//! }
81//! }
82//!
83//! // Step 2: Define a factory for the adaptor with `StringTransform`.
84//! struct CaesarCipher(i32);
85//!
86//! impl<'a> StringTransform<'a> for CaesarCipher {
87//! type Iter = CaesarCipherIter<Chars<'a>>;
88//! fn transform_chars(&self, chars: Chars<'a>) -> Self::Iter {
89//! CaesarCipherIter { inner: chars, key: self.0 }
90//! }
91//! }
92//!
93//! // Step 3: Profit!
94//! let encoder = CaesarCipher(8);
95//! let decoder = CaesarCipher(-8);
96//! let plaintext = "Veni, vidi, vici";
97//! let ciphertext = encoder.transform(plaintext);
98//! assert_eq!(&ciphertext, "^mvq4(~qlq4(~qkq");
99//! let plaintext = decoder.transform(ciphertext);
100//! assert_eq!(&plaintext, "Veni, vidi, vici");
101//! ```
102
103use std::borrow::Cow;
104use std::cell::RefCell;
105use std::char;
106use std::cmp;
107use std::io::{self, Read, Write};
108use std::marker::PhantomData;
109use std::mem;
110use std::ptr;
111
112
113// RawCircle
114// ---------------------------------------------------------------------------
115
116/// A thread-safe version of [`CharCircle`].
117///
118/// The API of this buffer is almost identical to `CharCircle` except that it
119/// uses external mutability, and it does not provide a means to consume its
120/// characters from an external iterator.
121#[derive(Debug, Default, Clone)]
122pub struct RawCircle {
123 buf: Vec<u8>, // The backing storage.
124 len: usize, // The number of used bytes in the buffer.
125 n_chars: usize, // The number of characters in the buffer.
126 read: usize, // Index of the read head. May equal the capacity.
127 write: usize, // Index of the write head. May equal the capacity.
128}
129
130impl RawCircle {
131 /// Construct a new `RawCircle` using a string as the initial buffer.
132 pub fn new(s: String) -> RawCircle {
133 let n_chars = s.chars().count();
134 let buf = s.into_bytes();
135 let len = buf.len();
136 RawCircle { buf, len, n_chars, read: 0, write: 0 }
137 }
138
139 /// Construct a new, empty `RawCircle`.
140 pub fn empty() -> RawCircle {
141 RawCircle::default()
142 }
143
144 /// The number of UTF-8 bytes in the buffer.
145 pub fn len(&self) -> usize {
146 self.len
147 }
148
149 /// The number of characters in the buffer.
150 pub fn n_chars(&self) -> usize {
151 self.n_chars
152 }
153
154 /// The number of bytes the buffer can hold before reallocating.
155 ///
156 /// This refers to the length of the backing vector. That vector may have
157 /// additional capacity allocated to it that is not reported by this method.
158 pub fn capacity(&self) -> usize {
159 self.buf.len()
160 }
161
162 /// Reallocate the buffer if it is full.
163 fn grow_if_full(&mut self) {
164 // SAFTEY: There are several safety concerns here.
165 //
166 // 1. The read and write heads may equal the capacity when this method is
167 // called (but may not otherwise be out of bounds). We move the read head
168 // to 0 if it is at the capacity, and we prove the write head is in bounds
169 // before we use it.
170 //
171 // 2. We move data around using `ptr::copy`. We must maintain that the affected
172 // regions are in bounds and that the destination is trash bytes.
173 //
174 // 3. We must properly maintain the read and write heads. When we return, the
175 // [BACK] and [FRONT] areas must be valid UTF-8, and the read head may equal
176 // the capacity. The length should never change.
177 //
178 // It is safe to directly set the length of the buffer to the capacity. It is
179 // in-bounds and there are no initialization or drop-safety concerns with
180 // plain-old bytes.
181 unsafe {
182 let old_cap = self.capacity();
183 if self.read == old_cap { self.read = 0 };
184 if self.read == 0 { self.write = self.len };
185
186 if old_cap == 0 {
187 // We have no capacity, so add some.
188 // Only hit the allocator if the backing vector has no extra capacity.
189 debug_assert!(self.read == 0);
190 debug_assert!(self.write == 0);
191 if self.buf.capacity() == 0 { self.buf.reserve(1); }
192 let new_cap = self.buf.capacity();
193 self.buf.set_len(new_cap);
194
195 } else if self.len == old_cap {
196 // If the backing vector has excess capacity, we just use that.
197 // Otherwise we ask the allocator to double our capacity.
198 if old_cap == self.buf.capacity() {
199 self.buf.reserve(old_cap);
200 }
201 let new_cap = self.buf.capacity();
202 self.buf.set_len(new_cap);
203
204 if self.write == self.read {
205 // The memory areas are [BACK, FRONT, TMP].
206 // The read and write heads are between [BACK] and [FRONT].
207 // Rearange: [BACK, FRONT, TMP] -> [BACK, TMP, FRONT].
208 let front_size = old_cap - self.read;
209 let new_read = new_cap - front_size;
210 let src = self.buf.get_unchecked_mut(self.read) as *mut u8;
211 let dest = self.buf.get_unchecked_mut(new_read) as *mut u8;
212 ptr::copy(src, dest, front_size);
213 self.read = new_read;
214 } else {
215 debug_assert!(self.read == 0);
216 debug_assert!(self.write == self.len);
217 }
218 }
219 }
220 }
221
222 /// Return the number of UTF-8 bytes of the next character.
223 fn peek_char_len(&self) -> Option<usize> {
224 // SAFETY: It is safe to use an unchecked get at the read head when the
225 // length is non-zero.
226 if self.len == 0 { return None };
227 let byte = unsafe { self.buf.get_unchecked(self.read) };
228 let reverse = byte ^ 0b11111111;
229 let leading_ones = reverse.leading_zeros();
230 match leading_ones {
231 0 => Some(1),
232 1 => unreachable!("invalid utf-8"),
233 2 => Some(2),
234 3 => Some(3),
235 4 => Some(4),
236 _ => unreachable!("invalid utf-8"),
237 }
238 }
239
240 /// Read a single byte from the buffer.
241 ///
242 /// It is unsafe to partially read a multibyte UTF-8 character.
243 ///
244 /// This method DOES update `len` but DOES NOT update `n_chars`.
245 /// Callers MUST update `n_chars`.
246 unsafe fn read_byte(&mut self) -> Option<u8> {
247 // SAFTEY: The read head may equal the capacity,
248 // but may not otherwise be out of bounds.
249 if self.len == 0 { return None };
250 if self.read == self.capacity() { self.read = 0 };
251 let byte = self.buf.get_unchecked(self.read);
252 self.read += 1;
253 self.len -= 1;
254 Some(*byte)
255 }
256
257 /// Read the next character in the buffer.
258 pub fn read_char(&mut self) -> Option<char> {
259 // SAFETY: We MUST read an entire character, and we MUST update `n_chars`.
260 unsafe {
261 let byte = self.read_byte()?;
262 let reverse = byte ^ 0b11111111;
263 let leading_ones = reverse.leading_zeros();
264 let mask = 0b11111111 >> leading_ones;
265 let mut ch = (byte & mask) as u32;
266 let n_additional_bytes = leading_ones.saturating_sub(1);
267 for _ in 0..n_additional_bytes {
268 let byte = self.read_byte().unwrap();
269 let byte = byte & 0b00111111;
270 ch = (ch << 6) | byte as u32;
271 }
272 self.n_chars -= 1;
273 Some(char::from_u32_unchecked(ch))
274 }
275 }
276
277 /// Read bytes from this circle into a buffer.
278 ///
279 /// This method will only ever read complete UTF-8 characters. It returns the
280 /// number of bytes read; it never returns an error.
281 ///
282 /// This is the implementation of [`std::io::Read`] for `RawCircle`.
283 pub fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
284 let max_len = cmp::min(buf.len(), self.len());
285 let mut len = 0;
286 loop {
287 match self.peek_char_len() {
288 None => return Ok(len),
289 Some(n) => {
290 if len + n <= max_len {
291 // SAFETY: We MUST read an entire character, and we MUST update `n_chars`.
292 for i in 0..n {
293 buf[len+i] = unsafe { self.read_byte().unwrap() };
294 }
295 self.n_chars -= 1;
296 len += n;
297 } else {
298 return Ok(len);
299 }
300 }
301 }
302 }
303 }
304
305 /// Read bytes from this circle into a buffer.
306 ///
307 /// This method is equivalent to [`RawCircle::read`] except the return value
308 /// is the buffer cast to a `&str`.
309 pub fn read_str<'a>(&mut self, buf: &'a mut [u8]) -> &'a str {
310 // SAFETY: It is safe to cast to a string because `self.read` only ever reads
311 // valid UTF-8.
312 let n = self.read(buf).unwrap();
313 let str = &buf[..n] as *const [u8] as *const str;
314 return unsafe { &*str }
315 }
316
317 /// Write a single byte into the buffer.
318 ///
319 /// It is unsafe to write invalid UTF-8.
320 ///
321 /// This method DOES update `len` but DOES NOT update `n_chars`.
322 /// Callers MUST update `n_chars`.
323 unsafe fn write_byte(&mut self, byte: u8) {
324 // SAFTEY: The write head may equal the capacity,
325 // but may not otherwise be out of bounds.
326 self.grow_if_full();
327 if self.write == self.capacity() { self.write = 0 };
328 let byte_ref = self.buf.get_unchecked_mut(self.write);
329 *byte_ref = byte;
330 self.write += 1;
331 self.len += 1;
332 }
333
334 /// Write a character into the buffer.
335 pub fn write_char(&mut self, ch: char) {
336 // SAFETY: We MUST write an entire character, and we MUST update `n_chars`.
337 unsafe {
338 let mut utf8 = [0u8; 4];
339 for byte in ch.encode_utf8(&mut utf8).as_bytes() {
340 self.write_byte(*byte)
341 }
342 self.n_chars += 1;
343 }
344 }
345
346 /// Read bytes from a string into this buffer;
347 ///
348 /// This method will only ever write complete UTF-8 characters. It returns the
349 /// number of bytes written. This method returns an error if the input is not
350 /// valid UTF-8.
351 ///
352 /// This is the implementation of [`std::io::Write`] for `RawCircle`.
353 pub fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
354 let mut len = 0;
355 let mut tmp = [0u8; 4];
356 loop {
357 // SAFETY: We must make sure the read is in-bounds to use `get_unchecked`.
358 if len == buf.len() { return Ok(len) };
359 let first_byte = unsafe { *buf.get_unchecked(len) };
360 let reverse = first_byte ^ 0b11111111;
361 let leading_ones = reverse.leading_zeros();
362 let n = match leading_ones {
363 0 => 1,
364 1 => return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8")),
365 2 => 2,
366 3 => 3,
367 4 => 4,
368 _ => return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8")),
369 };
370 tmp[0] = first_byte;
371 for i in 1..n {
372 // SAFETY: We must make sure the read is in-bounds to use `get_unchecked`.
373 if len + i == buf.len() { return Ok(len) };
374 let byte = unsafe { *buf.get_unchecked(len + i) };
375 let reverse = byte ^ 0b11111111;
376 let leading_ones = reverse.leading_zeros();
377 if leading_ones != 1 {
378 return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid UTF-8"));
379 }
380 tmp[i] = byte;
381 }
382 // SAFETY: We MUST write an entire character, and we MUST update `n_chars`.
383 for i in 0..n {
384 unsafe { self.write_byte(tmp[i]) };
385 }
386 self.n_chars += 1;
387 len += n;
388 }
389 }
390
391 /// Read bytes from a string into this buffer;
392 ///
393 /// This method is equivalent to [`RawCircle::write`] except that it cannot return
394 /// an error because the input is valid UTF-8.
395 pub fn write_str(&mut self, buf: &str) -> usize {
396 self.write(buf.as_bytes()).unwrap()
397 }
398
399 /// Rearange the contents so that the read head is at 0.
400 fn realign(&mut self) {
401 // SAFTEY: There are several safety concerns here.
402 //
403 // 1. The read and write heads may equal the capacity when this method is
404 // called (but may not otherwise be out of bounds). We move the read head
405 // to 0 if it is at the capacity, and we prove the write head is in bounds
406 // before we use it.
407 //
408 // 2. We move data around using `ptr::copy`, `ptr::copy_nonoverlapping`, and
409 // `ptr::swap_nonoverlapping`. We must maintain that all affected regions
410 // are in bounds and non-overlapping when required. We must also maintain
411 // that the destination of copys is trash bytes.
412 //
413 // 3. We must properly maintain the read and write heads. When we return, the
414 // read head should be at 0, and the write head should be at `self.len`.
415 // The range `read..write` must be valid UTF-8, and the write head may
416 // equal the capacity. The length should never change.
417 unsafe {
418 if self.read == self.capacity() { self.read = 0 };
419 if self.read == 0 { self.write = self.len };
420
421 if self.len == 0 || self.read == 0 {
422 // Nothing to do.
423
424 } else if self.read < self.write {
425 // The areas in memory are [LEFT, STR, RIGHT].
426 // [LEFT, STR, RIGHT] -> [STR, LEFT, RIGHT].
427 let src = self.buf.get_unchecked_mut(self.read) as *mut u8;
428 let dest = self.buf.get_unchecked_mut(0) as *mut u8;
429 ptr::copy(src, dest, self.len);
430
431 } else {
432 // SAFTEY: Note that the write head is in bounds in this case.
433 debug_assert!(self.write < self.read);
434
435 // The areas in memory are [BACK, TMP, FRONT]. [TMP] may be empty.
436 // We are trying to get to [FRONT, BACK, TMP] where [FRONT, BACK] is our string.
437 let mut back = (0, self.write);
438 let mut tmp = (self.write, self.read);
439 let mut front = (self.read, self.capacity());
440 let len = |bounds: (usize, usize)| bounds.1 - bounds.0;
441
442 // [BACK, TMP, FRONT] -> [BACK, FRONT, TMP]
443 let src = self.buf.get_unchecked_mut(front.0) as *mut u8;
444 let dest = self.buf.get_unchecked_mut(tmp.0) as *mut u8;
445 let count = len(front);
446 ptr::copy(src, dest, count);
447 front = (tmp.0, tmp.0 + count);
448 tmp = (front.1, self.capacity());
449
450 // Iterativly move parts of [FRONT] to the correct position.
451 // When we're done, we still have three areas [BACK, FRONT, TMP],
452 // but they do not include the bytes that are already in place.
453 while len(tmp) < len(back) {
454 if len(front) < len(back) {
455 // Say [BACK] = [B0, B1] where [B0] has the same size as [FRONT].
456 // [B0, B1, FRONT, TMP] -> [FRONT, B1, B0, TMP]
457 let count = len(front);
458 let b0 = (back.0, back.0 + count);
459 let b1 = (back.0 + count, back.1);
460 let x = self.buf.get_unchecked_mut(b0.0) as *mut u8;
461 let y = self.buf.get_unchecked_mut(front.0) as *mut u8;
462 ptr::swap_nonoverlapping(x, y, count);
463 back = b1;
464 } else {
465 // Say [FRONT] = [F0, F1] where [F0] has the same size as [BACK].
466 // [BACK, F0, F1, TMP] -> [F0, BACK, F1, TMP]
467 let count = len(back);
468 let f0 = (front.0, front.0 + count);
469 let f1 = (front.0 + count, front.1);
470 let x = self.buf.get_unchecked_mut(f0.0) as *mut u8;
471 let y = self.buf.get_unchecked_mut(back.0) as *mut u8;
472 ptr::swap_nonoverlapping(x, y, count);
473 back = f0;
474 front = f1;
475 }
476 }
477
478 if len(tmp) != 0 {
479 // Say [TMP] = [T0, T1] where [T0] has the same size as [BACK].
480 // [BACK, FRONT, T0, T1] -> [T0, FRONT, BACK, T1]
481 let src = self.buf.get_unchecked_mut(back.0) as *mut u8;
482 let dest = self.buf.get_unchecked_mut(tmp.0) as *mut u8;
483 let count = len(back);
484 ptr::copy_nonoverlapping(src, dest, count);
485 let t0 = (back.0, back.0 + count);
486 back = (tmp.0, tmp.0 + count);
487
488 // [T0, FRONT, BACK, T1] -> [FRONT, BACK, T0, T1]
489 let src = self.buf.get_unchecked_mut(front.0) as *mut u8;
490 let dest = self.buf.get_unchecked_mut(t0.0) as *mut u8;
491 let count = len(front) + len(back);
492 ptr::copy(src, dest, count);
493 } else {
494 // [BACK, FRONT, TMP] == [FRONT]
495 debug_assert!(len(back) == 0);
496 }
497 }
498 }
499
500 // Fix the read and write heads.
501 self.read = 0;
502 self.write = self.len;
503 }
504
505 /// Unpack this circular buffer into a byte vector.
506 pub fn into_vec(mut self) -> Vec<u8> {
507 // SAFTEY: It is safe to directly set the length of the buffer to the length
508 // of the content. It is in-bounds and there are no drop-safety concerns with
509 // plain-old bytes. When the buffer has been realigned, the content will be
510 // on the range `0..len`.
511 self.realign();
512 let mut vec = self.buf;
513 unsafe { vec.set_len(self.len) };
514 vec
515 }
516
517 /// Unpack this circular buffer into a string.
518 pub fn into_string(self) -> String {
519 // SAFTEY: The public API only allows valid UTF-8 to be read or written from
520 // the buffer. Thus it is safe to cast to a string.
521 let vec = self.into_vec();
522 unsafe { String::from_utf8_unchecked(vec) }
523 }
524}
525
526impl From<String> for RawCircle {
527 fn from(s: String) -> RawCircle {
528 RawCircle::new(s)
529 }
530}
531
532impl From<RawCircle> for String {
533 fn from(c: RawCircle) -> String {
534 c.into_string()
535 }
536}
537
538impl From<RawCircle> for Vec<u8> {
539 fn from(c: RawCircle) -> Vec<u8> {
540 c.into_vec()
541 }
542}
543
544impl Read for RawCircle {
545 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
546 RawCircle::read(self, buf)
547 }
548}
549
550impl Write for RawCircle {
551 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
552 RawCircle::write(self, buf)
553 }
554
555 fn flush(&mut self) -> io::Result<()> {
556 Ok(())
557 }
558}
559
560
561// CharCircle
562// ---------------------------------------------------------------------------
563
564/// A circular buffer of characters.
565///
566/// The reallocation policy of [`std::collections::VecDeque`] makes it a poor
567/// choice as a circular buffer for modifying strings in place. It reallocates
568/// early (!) and when it does, it places its contents in such a way that
569/// requires shuffling to convert it back into a string in the common case.
570/// This circular buffer provides a better alternative.
571///
572/// This circular buffer will not allocate early and is optimized for the case
573/// when you read exactly the contents that were in the initial buffer. In that
574/// case, the read head is guaranteed to be at index 0 of the underlying
575/// buffer, making the conversion into a string trivial.
576///
577/// Additionally, this circular buffer provides a `char` oriented interface, but
578/// uses UTF-8 internally, allowing it to operate directly on [`String`]s.
579#[derive(Debug, Default, Clone)]
580pub struct CharCircle {
581 raw: RefCell<RawCircle>,
582}
583
584impl CharCircle {
585 /// Construct a new `CharCircle` using a string as the initial buffer.
586 pub fn new(s: String) -> CharCircle {
587 CharCircle { raw: RefCell::new(RawCircle::new(s)) }
588 }
589
590 /// Construct a new, empty `CharCircle`.
591 pub fn empty() -> CharCircle {
592 CharCircle::default()
593 }
594
595 /// The number of UTF-8 bytes in the buffer.
596 pub fn len(&self) -> usize {
597 self.raw.borrow().len()
598 }
599
600 /// The number of characters in the buffer.
601 pub fn n_chars(&self) -> usize {
602 self.raw.borrow().n_chars()
603 }
604
605 /// The number of bytes the buffer can hold before reallocating.
606 ///
607 /// This refers to the length of the backing vector. That vector may have
608 /// additional capacity allocated to it that is not reported by this method.
609 pub fn capacity(&self) -> usize {
610 self.raw.borrow().capacity()
611 }
612
613 /// Read the next character in the buffer.
614 pub fn read_char(&self) -> Option<char> {
615 self.raw.borrow_mut().read_char()
616 }
617
618 /// Read bytes from this circle into a buffer.
619 ///
620 /// This method will only ever read complete UTF-8 characters. It returns the
621 /// number of bytes read; it never returns an error.
622 ///
623 /// This is the implementation of [`std::io::Read`] for `CharCircle`.
624 pub fn read(&self, buf: &mut [u8]) -> io::Result<usize> {
625 self.raw.borrow_mut().read(buf)
626 }
627
628 /// Read bytes from this circle into a buffer.
629 ///
630 /// This method is equivalent to [`RawCircle::read`] except the return value
631 /// is the buffer cast to a `&str`.
632 pub fn read_str<'a>(&self, buf: &'a mut [u8]) -> &'a str {
633 self.raw.borrow_mut().read_str(buf)
634 }
635
636 /// Write a character into the buffer.
637 pub fn write_char(&self, ch: char) {
638 self.raw.borrow_mut().write_char(ch)
639 }
640
641 /// Read bytes from a string into this buffer;
642 ///
643 /// This method will only ever write complete UTF-8 characters. It returns the
644 /// number of bytes written. This method returns an error if the input is not
645 /// valid UTF-8.
646 ///
647 /// This is the implementation of [`std::io::Write`] for `CharCircle`.
648 pub fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
649 self.raw.borrow_mut().write(buf)
650 }
651
652 /// Read bytes from a string into this buffer;
653 ///
654 /// This method is equivalent to [`CharCircle::write`] except that it cannot return
655 /// an error because the input is valid UTF-8.
656 pub fn write_str(&self, buf: &str) -> usize {
657 self.raw.borrow_mut().write_str(buf)
658 }
659
660 /// Unpack this circular buffer into a byte vector.
661 pub fn into_vec(self) -> Vec<u8> {
662 self.raw.into_inner().into_vec()
663 }
664
665 /// Unpack this circular buffer into a string.
666 pub fn into_string(self) -> String {
667 self.raw.into_inner().into_string()
668 }
669
670 /// Read characters from the buffer with an iterator.
671 ///
672 /// The returned iterator will read at most `n` characters and ensures that it
673 /// has been exhausted upon drop.
674 ///
675 /// Calling `next` on the iterator is equivalent to calling `read_char` on
676 /// this buffer.
677 pub fn take_chars(&self, n: usize) -> Chars {
678 Chars::new(&self, n)
679 }
680
681 /// Read the current characters from the buffer with an iterator.
682 ///
683 /// The returned iterator will read at most `n` characters, where `n` is the
684 /// number of characters currently in the buffer, and ensures that it has been
685 /// exhausted upon drop.
686 ///
687 /// Calling `next` on the iterator is equivalent to calling `read_char` on
688 /// this buffer.
689 ///
690 /// This is equivalent to calling [`CharCircle::take_chars`] with the current
691 /// number of characters in the buffer. In particular, interleaving calls to
692 /// `read_char` and `write_char` on the buffer with calls to `next` on the
693 /// iterator may cause the iterator to consume characters that were not in the
694 /// buffer at the time it was created.
695 pub fn take_current_chars(&self) -> Chars {
696 self.take_chars(self.n_chars())
697 }
698}
699
700impl From<String> for CharCircle {
701 fn from(s: String) -> CharCircle {
702 CharCircle::new(s)
703 }
704}
705
706impl From<CharCircle> for String {
707 fn from(c: CharCircle) -> String {
708 c.into_string()
709 }
710}
711
712impl From<CharCircle> for Vec<u8> {
713 fn from(c: CharCircle) -> Vec<u8> {
714 c.into_vec()
715 }
716}
717
718impl Read for CharCircle {
719 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
720 CharCircle::read(self, buf)
721 }
722}
723
724impl Write for CharCircle {
725 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
726 CharCircle::write(self, buf)
727 }
728
729 fn flush(&mut self) -> io::Result<()> {
730 Ok(())
731 }
732}
733
734
735// Chars
736// ---------------------------------------------------------------------------
737
738/// An iterator that consumes characters in a [`CharCircle`].
739///
740/// Calling `next` on this iterator is equivalent to calling
741/// [`CharCircle::read_char`] on the corresponding `CharCircle`.
742///
743/// This iterator ensures that it has been exhausted upon drop.
744///
745/// In the [`StringTransform`] API, this struct is an iterator over the
746/// characters of the string being transformed.
747#[derive(Debug)]
748pub struct Chars<'a> {
749 circle: &'a CharCircle,
750 n_chars: usize,
751}
752
753impl<'a> Chars<'a> {
754 fn new(circle: &'a CharCircle, n_chars: usize) -> Chars<'a> {
755 Chars{ circle, n_chars }
756 }
757}
758
759impl<'a> Iterator for Chars<'a> {
760 type Item = char;
761
762 fn next(&mut self) -> Option<char> {
763 if self.n_chars == 0 {
764 None
765 } else {
766 let ch = self.circle.read_char()?;
767 self.n_chars -= 1;
768 Some(ch)
769 }
770 }
771}
772
773impl<'a> Drop for Chars<'a> {
774 /// Ensure the iterator is exhausted.
775 fn drop(&mut self) {
776 for _ in self { }
777 }
778}
779
780
781// StringTransform
782// ---------------------------------------------------------------------------
783
784/// A factory trait for in-place string transformations.
785///
786/// This trait allows character-iterator adaptors to modify strings in-place.
787/// It is implemented by adaptor factories with a single required method,
788/// `transform_chars`, for constructing the adaptors. In return, this trait
789/// provides a method, `transform`, which can apply the transformation to
790/// both owned strings and string slices, without allocating when possible.
791///
792/// The in-place operation works by treating the underlying string as a
793/// circular buffer. The transform reads characters from the front of the
794/// buffer and writes characters to the back of the buffer. Once the transform
795/// returns `None`, the unread characters are deleted and the circular buffer
796/// is cast back into a string. Note that the transformation cannot always be
797/// applied in place; if the transform ever returns more bytes than it has
798/// read, an allocation is required to grow the buffer.
799///
800/// This trait supports copy-on-write optimizations. Implementors can override
801/// the [`StringTransform::will_modify`] method to short-circuit the transform.
802///
803/// See [`SimpleTransform`] for a variant of this trait that is implemented
804/// directly by iterator adaptors.
805///
806/// The lifetime `'a` refers to the lifetime of the `transform` method.
807pub trait StringTransform<'a> {
808 /// The type after applying this transform to a [`Chars`] iterator.
809 type Iter: Iterator<Item=char>;
810
811 /// Transform the characters of a string.
812 fn transform_chars(&self, chars: Chars<'a>) -> Self::Iter;
813
814 /// A hint to short-circuit string transformations.
815 ///
816 /// If true, the string might be modified by the transform. If false, the
817 /// string will definitely not be modified by the transform.
818 ///
819 /// Implementors may override this function to facilitate copy-on-write
820 /// optimizations. The default implementation always returns true, which
821 /// disables copy-on-write.
822 #[allow(unused_variables)]
823 fn will_modify(&self, val: &str) -> bool {
824 return true
825 }
826
827 /// Transform a string in-place
828 ///
829 /// This method can operate on both `String` and `&str`.
830 ///
831 /// A new string may be allocated if the input is not owned or if the
832 /// transformation needs to buffer more characters than the string has capacity.
833 fn transform<'b, T: Into<Cow<'b, str>>>(&self, s: T) -> Cow<'b, str> {
834 // SAFTEY: The lifetime `'a` refers to the circular buffer, but the borrow
835 // checker isn't smart enough to know that, because `'a` is a type parameter
836 // of the trait. Transumting the iterator to a new lifetime solves the issue.
837 // The safety condition is that we must not let the iterator outlive the
838 // buffer. This means that `old_chars` must be dropped before `circle`.
839 //
840 // Note that it is not safe for the buffer and iterator to be used from
841 // different threads. `CharCircle` is implemented with a `RefCell`, ensuring
842 // that it is not `Send` and that `Chars` is neither `Send` nor `Sync`.
843 let s = s.into();
844 if self.will_modify(&s) {
845 let s = s.into_owned();
846 let circle = CharCircle::new(s);
847 let old_chars = circle.take_current_chars();
848 let old_chars: Chars<'_> = unsafe { mem::transmute(old_chars) };
849 let new_chars = self.transform_chars(old_chars);
850 for ch in new_chars { circle.write_char(ch) }; // `old_chars` is dropped after loop.
851 let t = circle.into_string(); // `circle` is dropped here.
852 Cow::Owned(t)
853 } else {
854 s
855 }
856 }
857}
858
859
860// SimpleTransform
861// ---------------------------------------------------------------------------
862
863/// A simple trait for in-place string transformations.
864///
865/// This trait allows character-iterator adaptors to modify strings in-place.
866/// It is implemented by adaptors with a single required associated function,
867/// `transform_chars`, for constructing the adaptor. In return, this trait
868/// provides an associated function, `transform`, which can apply the
869/// transformation to both owned strings and string slices, without allocating
870/// when possible.
871///
872/// This trait is a simplified version of [`StringTransform`]. See the
873/// documentation of that trait for more details.
874pub trait SimpleTransform<'a>: Iterator<Item=char> + Sized {
875 /// Transform the characters of a string.
876 fn transform_chars(chars: Chars<'a>) -> Self;
877
878 /// A hint to short-circuit string transformations.
879 ///
880 /// If true, the string might be modified by the transform. If false, the
881 /// string will definitely not be modified by the transform.
882 ///
883 /// Implementors may override this function to facilitate copy-on-write
884 /// optimizations. The default implementation always returns true, which
885 /// disables copy-on-write.
886 #[allow(unused_variables)]
887 fn will_modify(val: &str) -> bool {
888 return true
889 }
890
891 /// Transform a string in-place
892 ///
893 /// This function can operate on both `String` and `&str`.
894 ///
895 /// A new string may be allocated if the input is not owned or if the
896 /// transformation needs to buffer more characters than the string has capacity.
897 fn transform<'b, T: Into<Cow<'b, str>>>(s: T) -> Cow<'b, str> {
898 SimpleStringTransform::<Self>::new().transform(s)
899 }
900}
901
902
903// SimpleStringTransform
904// ---------------------------------------------------------------------------
905
906/// Used when you have a [`SimpleTransform`] but need a [`StringTransform`].
907///
908/// ### Example
909///
910/// ```
911/// # use char_circle::*;
912/// // `Identity` is a `SimpleTransform`.
913/// struct Identity<I>(I);
914///
915/// impl<I: Iterator<Item=char>> Iterator for Identity<I> {
916/// type Item = char;
917/// fn next(&mut self) -> Option<char> {
918/// self.0.next()
919/// }
920/// }
921///
922/// impl<'a> SimpleTransform<'a> for Identity<Chars<'a>> {
923/// fn transform_chars(chars: Chars<'a>) -> Self {
924/// Identity(chars)
925/// }
926/// }
927///
928/// // This function takes a `StringTransform`.
929/// fn apply_transform<'a, T: StringTransform<'a>>(t: T, s: String) -> String {
930/// t.transform(s).into_owned()
931/// }
932///
933/// // Use `SimpleStringTransform` to bridge the gap.
934/// let t = SimpleStringTransform::<Identity<Chars>>::new();
935/// let s = apply_transform(t, "Hello World".to_string());
936/// assert_eq!(&s, "Hello World");
937/// ```
938pub struct SimpleStringTransform<T>(pub PhantomData<T>);
939
940impl<'a, T: SimpleTransform<'a>> SimpleStringTransform<T> {
941 /// Create a new `SimpleStringTransform`.
942 pub fn new() -> Self {
943 SimpleStringTransform(PhantomData)
944 }
945}
946
947impl<'a, T: SimpleTransform<'a>> StringTransform<'a> for SimpleStringTransform<T> {
948 type Iter = T;
949
950 fn transform_chars(&self, chars: Chars<'a>) -> Self::Iter {
951 Self::Iter::transform_chars(chars)
952 }
953
954 fn will_modify(&self, val: &str) -> bool {
955 Self::Iter::will_modify(val)
956 }
957}
958
959
960// Unit Tests
961// ---------------------------------------------------------------------------
962
963#[cfg(test)]
964mod tests {
965 use super::*;
966
967 // Identity transform
968 // -------------------------
969
970 /// An identity function as a string transform.
971 struct Identity<I>(I);
972
973 impl<I: Iterator<Item=char>> Iterator for Identity<I> {
974 type Item = char;
975 fn next(&mut self) -> Option<char> {
976 self.0.next()
977 }
978 }
979
980 impl<'a> SimpleTransform<'a> for Identity<Chars<'a>> {
981 fn transform_chars(chars: Chars<'a>) -> Self {
982 Identity(chars)
983 }
984 }
985
986 // Half transform
987 // -------------------------
988
989 /// Deletes every other character.
990 struct Half<I>(I);
991
992 impl<I: Iterator<Item=char>> Iterator for Half<I> {
993 type Item = char;
994 fn next(&mut self) -> Option<char> {
995 self.0.next()?;
996 self.0.next()
997 }
998 }
999
1000 impl<'a> SimpleTransform<'a> for Half<Chars<'a>> {
1001 fn transform_chars(chars: Chars<'a>) -> Self {
1002 Half(chars)
1003 }
1004 }
1005
1006 // Double transform
1007 // -------------------------
1008
1009 /// Duplicates every character.
1010 struct Double<I> {
1011 inner: I,
1012 state: Option<char>,
1013 }
1014
1015 impl<I: Iterator<Item=char>> Iterator for Double<I> {
1016 type Item = char;
1017 fn next(&mut self) -> Option<char> {
1018 match self.state {
1019 Some(_) => self.state.take(),
1020 None => {
1021 let state = self.inner.next();
1022 self.state = state;
1023 state
1024 }
1025 }
1026 }
1027 }
1028
1029 impl<'a> SimpleTransform<'a> for Double<Chars<'a>> {
1030 fn transform_chars(chars: Chars<'a>) -> Self {
1031 Double { inner: chars, state: None }
1032 }
1033 }
1034
1035 // Tests
1036 // -------------------------
1037
1038 #[test]
1039 fn test_transform() {
1040 let s: String = "Hello World".to_string();
1041
1042 // An identity transform does not reallocate.
1043 let ptr = s.as_ptr();
1044 let cap = s.capacity();
1045 let s = Identity::transform(s).into_owned();
1046 assert_eq!(&s, "Hello World");
1047 assert_eq!(s.as_ptr(), ptr);
1048 assert_eq!(s.capacity(), cap);
1049
1050 // A half transform does not reallocate.
1051 let ptr = s.as_ptr();
1052 let cap = s.capacity();
1053 let s = Half::transform(s).into_owned();
1054 assert_eq!(&s, "el ol");
1055 assert_eq!(s.as_ptr(), ptr);
1056 assert_eq!(s.capacity(), cap);
1057
1058 // A first double transform does not reallocate because there is excess capacity.
1059 let ptr = s.as_ptr();
1060 let cap = s.capacity();
1061 let s = Double::transform(s).into_owned();
1062 assert_eq!(&s, "eell ooll");
1063 assert_eq!(s.as_ptr(), ptr);
1064 assert_eq!(s.capacity(), cap);
1065
1066 // A second double transform _does_ reallocate because it needs additional capacity.
1067 let cap = s.capacity();
1068 let s = Double::transform(s).into_owned();
1069 assert_eq!(&s, "eeeellll oooollll");
1070 assert_ne!(s.capacity(), cap);
1071 }
1072
1073 #[test]
1074 fn test_chars() {
1075 let s: String = "Hello World".to_string();
1076 let circle = CharCircle::new(s);
1077
1078 // The iterator consumes characters from the buffer.
1079 let mut chars = circle.take_chars(6);
1080 assert_eq!(chars.next(), Some('H'));
1081 assert_eq!(chars.next(), Some('e'));
1082 assert_eq!(chars.next(), Some('l'));
1083
1084 // The iterator consumes all `n` characters upon drop.
1085 std::mem::drop(chars);
1086 let s = circle.into_string();
1087 assert_eq!(s.as_str(), "World");
1088
1089 // The iterator does not consume more than `n` characters.
1090 let s: String = "Hello World".to_string();
1091 let circle = CharCircle::new(s);
1092 let mut chars = circle.take_chars(6);
1093 assert_eq!(chars.next(), Some('H'));
1094 assert_eq!(chars.next(), Some('e'));
1095 assert_eq!(chars.next(), Some('l'));
1096 assert_eq!(chars.next(), Some('l'));
1097 assert_eq!(chars.next(), Some('o'));
1098 assert_eq!(chars.next(), Some(' '));
1099 assert_eq!(chars.next(), None);
1100 }
1101
1102 #[test]
1103 fn test_circle() {
1104 // `read` and `write` work as expected.
1105 // The `read_str` and `write_str` versions hit that code path.
1106 let circle = CharCircle::empty();
1107 circle.write_str("Hello World!");
1108 assert_eq!(circle.len(), 11);
1109 assert_eq!(circle.n_chars(), 11);
1110 let mut buf = [0u8; 6];
1111 let buf_str = circle.read_str(&mut buf);
1112 assert_eq!(buf_str, "Hello ");
1113 assert_eq!(circle.len(), 5);
1114 assert_eq!(circle.n_chars(), 5);
1115 assert_eq!(&circle.into_string(), "World!");
1116
1117 // A more complicated test that alternates between reading and writing.
1118 let circle = CharCircle::empty();
1119 circle.write_char('F');
1120 circle.write_char('o');
1121 circle.write_char('o');
1122 circle.write_char('B');
1123 circle.write_char('a');
1124 circle.write_char('r');
1125 assert_eq!(circle.read_char(), Some('F'));
1126 assert_eq!(circle.read_char(), Some('o'));
1127 assert_eq!(circle.read_char(), Some('o'));
1128 assert_eq!(circle.read_char(), Some('B'));
1129 assert_eq!(circle.read_char(), Some('a'));
1130 assert_eq!(circle.read_char(), Some('r'));
1131 assert_eq!(circle.read_char(), None);
1132 circle.write_char('H');
1133 circle.write_char('e');
1134 circle.write_char('l');
1135 circle.write_char('l');
1136 circle.write_char('o');
1137 circle.write_char(' ');
1138 circle.write_char('W');
1139 circle.write_char('o');
1140 circle.write_char('r');
1141 circle.write_char('l');
1142 circle.write_char('d');
1143 circle.write_char('!');
1144 assert_eq!(circle.read_char(), Some('H'));
1145 assert_eq!(circle.read_char(), Some('e'));
1146 assert_eq!(circle.read_char(), Some('l'));
1147 assert_eq!(circle.read_char(), Some('l'));
1148 assert_eq!(circle.read_char(), Some('o'));
1149 assert_eq!(circle.read_char(), Some(' '));
1150 circle.write_char('F');
1151 circle.write_char('o');
1152 circle.write_char('o');
1153 circle.write_char('F');
1154 circle.write_char('o');
1155 circle.write_char('o');
1156 assert_eq!(circle.read_char(), Some('W'));
1157 assert_eq!(circle.read_char(), Some('o'));
1158 assert_eq!(circle.read_char(), Some('r'));
1159 assert_eq!(circle.read_char(), Some('l'));
1160 assert_eq!(circle.read_char(), Some('d'));
1161 assert_eq!(circle.read_char(), Some('!'));
1162 assert_eq!(circle.read_char(), Some('F'));
1163 assert_eq!(circle.read_char(), Some('o'));
1164 assert_eq!(circle.read_char(), Some('o'));
1165 circle.write_char('B');
1166 circle.write_char('a');
1167 circle.write_char('r');
1168 assert_eq!(&circle.into_string(), "FooBar");
1169 }
1170}