Skip to main content

editor_core/
storage.rs

1//! Stage 1: Linear Storage Layer
2//!
3//! Implements efficient insertion and deletion operations using Piece Table,
4//! providing O(1) undo capability.
5
6/// Buffer type identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq)]
8pub enum BufferType {
9    /// Read-only original buffer
10    Original,
11    /// Append-only add buffer
12    Add,
13}
14
15/// Piece structure: references a fragment in a buffer
16#[derive(Debug, Clone)]
17pub struct Piece {
18    /// Buffer type
19    pub buffer_type: BufferType,
20    /// Start position in the corresponding buffer (byte offset)
21    pub start: usize,
22    /// Byte length of the fragment
23    pub byte_length: usize,
24    /// Character count of the fragment (handles UTF-8 multi-byte characters)
25    pub char_count: usize,
26}
27
28impl Piece {
29    /// Create a new Piece
30    pub fn new(
31        buffer_type: BufferType,
32        start: usize,
33        byte_length: usize,
34        char_count: usize,
35    ) -> Self {
36        Self {
37            buffer_type,
38            start,
39            byte_length,
40            char_count,
41        }
42    }
43}
44
45/// Piece Table - main storage structure
46pub struct PieceTable {
47    /// Read-only original buffer
48    original_buffer: Vec<u8>,
49    /// Append-only add buffer
50    add_buffer: Vec<u8>,
51    /// List of pieces
52    pieces: Vec<Piece>,
53    /// Operation counter (for triggering GC)
54    operation_count: usize,
55    /// GC threshold (trigger GC after every N operations)
56    gc_threshold: usize,
57}
58
59impl PieceTable {
60    /// Create a new Piece Table from original text
61    pub fn new(text: &str) -> Self {
62        let bytes = text.as_bytes().to_vec();
63        let char_count = text.chars().count();
64        let byte_length = bytes.len();
65
66        let pieces = if byte_length > 0 {
67            vec![Piece::new(BufferType::Original, 0, byte_length, char_count)]
68        } else {
69            Vec::new()
70        };
71
72        Self {
73            original_buffer: bytes,
74            add_buffer: Vec::new(),
75            pieces,
76            operation_count: 0,
77            gc_threshold: 1000, // Trigger GC after every 1000 operations
78        }
79    }
80
81    /// Create an empty Piece Table
82    pub fn empty() -> Self {
83        Self {
84            original_buffer: Vec::new(),
85            add_buffer: Vec::new(),
86            pieces: Vec::new(),
87            operation_count: 0,
88            gc_threshold: 1000,
89        }
90    }
91
92    /// Insert text at the specified character offset
93    pub fn insert(&mut self, offset: usize, text: &str) {
94        if text.is_empty() {
95            return;
96        }
97
98        let text_bytes = text.as_bytes();
99        let text_char_count = text.chars().count();
100        let text_byte_length = text_bytes.len();
101
102        // Add new text to add_buffer
103        let add_start = self.add_buffer.len();
104        self.add_buffer.extend_from_slice(text_bytes);
105
106        // Find the piece at the insertion position
107        let (piece_index, char_offset_in_piece) = self.find_piece_at_offset(offset);
108
109        if let Some(piece_idx) = piece_index {
110            let piece = &self.pieces[piece_idx];
111
112            if char_offset_in_piece == 0 {
113                // Insert at the beginning of the piece
114                let new_piece = Piece::new(
115                    BufferType::Add,
116                    add_start,
117                    text_byte_length,
118                    text_char_count,
119                );
120                self.pieces.insert(piece_idx, new_piece);
121            } else if char_offset_in_piece == piece.char_count {
122                // Insert at the end of the piece
123                let new_piece = Piece::new(
124                    BufferType::Add,
125                    add_start,
126                    text_byte_length,
127                    text_char_count,
128                );
129                self.pieces.insert(piece_idx + 1, new_piece);
130            } else {
131                // Insert in the middle of the piece, need to split
132                let (left_piece, right_piece) = self.split_piece(piece, char_offset_in_piece);
133                let new_piece = Piece::new(
134                    BufferType::Add,
135                    add_start,
136                    text_byte_length,
137                    text_char_count,
138                );
139
140                // Replace the original piece with three new pieces
141                self.pieces.splice(
142                    piece_idx..=piece_idx,
143                    vec![left_piece, new_piece, right_piece],
144                );
145            }
146        } else {
147            // Empty document or insert at the end
148            let new_piece = Piece::new(
149                BufferType::Add,
150                add_start,
151                text_byte_length,
152                text_char_count,
153            );
154            self.pieces.push(new_piece);
155        }
156
157        // Try to merge adjacent pieces
158        self.try_merge_adjacent_pieces();
159
160        // Check if GC needs to be triggered
161        self.check_gc();
162    }
163
164    /// Delete characters in the specified range
165    pub fn delete(&mut self, start_offset: usize, length: usize) {
166        if length == 0 {
167            return;
168        }
169
170        let end_offset = start_offset + length;
171
172        // Find the pieces at the start and end positions
173        let (start_piece_idx, start_char_offset) = self.find_piece_at_offset(start_offset);
174        let (end_piece_idx, end_char_offset) = self.find_piece_at_offset(end_offset);
175
176        match (start_piece_idx, end_piece_idx) {
177            (Some(start_idx), Some(end_idx)) if start_idx == end_idx => {
178                // Delete range is within the same piece
179                let piece = &self.pieces[start_idx];
180
181                if start_char_offset == 0 && end_char_offset == piece.char_count {
182                    // Delete the entire piece
183                    self.pieces.remove(start_idx);
184                } else if start_char_offset == 0 {
185                    // Delete from the beginning
186                    let (_, right) = self.split_piece(piece, end_char_offset);
187                    self.pieces[start_idx] = right;
188                } else if end_char_offset == piece.char_count {
189                    // Delete to the end
190                    let (left, _) = self.split_piece(piece, start_char_offset);
191                    self.pieces[start_idx] = left;
192                } else {
193                    // Delete in the middle
194                    let (left, temp) = self.split_piece(piece, start_char_offset);
195                    let (_, right) = self.split_piece(&temp, end_char_offset - start_char_offset);
196                    self.pieces.splice(start_idx..=start_idx, vec![left, right]);
197                }
198            }
199            (Some(start_idx), Some(end_idx)) => {
200                // Delete range spans multiple pieces
201                let start_piece = &self.pieces[start_idx];
202                let end_piece = &self.pieces[end_idx];
203
204                let mut new_pieces = Vec::new();
205
206                // Handle the starting piece
207                if start_char_offset > 0 {
208                    let (left, _) = self.split_piece(start_piece, start_char_offset);
209                    new_pieces.push(left);
210                }
211
212                // Handle the ending piece
213                if end_char_offset < end_piece.char_count {
214                    let (_, right) = self.split_piece(end_piece, end_char_offset);
215                    new_pieces.push(right);
216                }
217
218                // Replace all pieces in the range
219                self.pieces.splice(start_idx..=end_idx, new_pieces);
220            }
221            (None, None) => {
222                // Empty document, no need to delete
223            }
224            _ => {
225                // Only one position is valid, handle edge cases
226                if let Some(start_idx) = start_piece_idx {
227                    // Delete from start_idx to the end
228                    let start_piece = &self.pieces[start_idx];
229                    if start_char_offset == 0 {
230                        self.pieces.truncate(start_idx);
231                    } else {
232                        let (left, _) = self.split_piece(start_piece, start_char_offset);
233                        self.pieces.truncate(start_idx);
234                        self.pieces.push(left);
235                    }
236                }
237            }
238        }
239
240        // Check if GC needs to be triggered
241        self.check_gc();
242    }
243
244    /// Get the entire document content
245    pub fn get_text(&self) -> String {
246        let mut result = String::new();
247        for piece in &self.pieces {
248            let buffer = match piece.buffer_type {
249                BufferType::Original => &self.original_buffer,
250                BufferType::Add => &self.add_buffer,
251            };
252            let slice = &buffer[piece.start..piece.start + piece.byte_length];
253            result.push_str(std::str::from_utf8(slice).unwrap());
254        }
255        result
256    }
257
258    /// Get text in the specified range
259    pub fn get_range(&self, start_offset: usize, length: usize) -> String {
260        let mut result = String::new();
261        let mut current_offset = 0;
262        let end_offset = start_offset + length;
263
264        for piece in &self.pieces {
265            let piece_end = current_offset + piece.char_count;
266
267            if current_offset >= end_offset {
268                break;
269            }
270
271            if piece_end > start_offset {
272                let buffer = match piece.buffer_type {
273                    BufferType::Original => &self.original_buffer,
274                    BufferType::Add => &self.add_buffer,
275                };
276
277                let piece_text =
278                    std::str::from_utf8(&buffer[piece.start..piece.start + piece.byte_length])
279                        .unwrap();
280
281                let skip_chars = start_offset.saturating_sub(current_offset);
282
283                let take_chars = if piece_end > end_offset {
284                    end_offset - current_offset.max(start_offset)
285                } else {
286                    piece.char_count - skip_chars
287                };
288
289                result.push_str(
290                    &piece_text
291                        .chars()
292                        .skip(skip_chars)
293                        .take(take_chars)
294                        .collect::<String>(),
295                );
296            }
297
298            current_offset = piece_end;
299        }
300
301        result
302    }
303
304    /// Get the total character count of the document
305    pub fn char_count(&self) -> usize {
306        self.pieces.iter().map(|p| p.char_count).sum()
307    }
308
309    /// Get the total byte count of the document
310    pub fn byte_count(&self) -> usize {
311        self.pieces.iter().map(|p| p.byte_length).sum()
312    }
313
314    /// Get the size of add_buffer (for memory testing)
315    pub fn add_buffer_size(&self) -> usize {
316        self.add_buffer.len()
317    }
318
319    /// Find the piece at the specified character offset and the offset within that piece
320    /// Returns (piece_index, char_offset_in_piece)
321    fn find_piece_at_offset(&self, offset: usize) -> (Option<usize>, usize) {
322        let mut current_offset = 0;
323
324        for (idx, piece) in self.pieces.iter().enumerate() {
325            let next_offset = current_offset + piece.char_count;
326            if offset <= next_offset {
327                return (Some(idx), offset - current_offset);
328            }
329            current_offset = next_offset;
330        }
331
332        if self.pieces.is_empty() {
333            (None, 0)
334        } else {
335            (
336                Some(self.pieces.len() - 1),
337                self.pieces.last().unwrap().char_count,
338            )
339        }
340    }
341
342    /// Split a piece at the specified character position
343    /// Returns (left_piece, right_piece)
344    fn split_piece(&self, piece: &Piece, char_offset: usize) -> (Piece, Piece) {
345        let buffer = match piece.buffer_type {
346            BufferType::Original => &self.original_buffer,
347            BufferType::Add => &self.add_buffer,
348        };
349
350        let piece_text =
351            std::str::from_utf8(&buffer[piece.start..piece.start + piece.byte_length]).unwrap();
352
353        // Calculate byte offset (O(n))
354        // `char_offset` is the character offset within this piece; convert it to UTF-8 byte offset to complete the split.
355        let byte_offset = piece_text
356            .char_indices()
357            .nth(char_offset)
358            .map(|(i, _)| i)
359            .unwrap_or(piece.byte_length);
360
361        let left = Piece::new(piece.buffer_type, piece.start, byte_offset, char_offset);
362
363        let right = Piece::new(
364            piece.buffer_type,
365            piece.start + byte_offset,
366            piece.byte_length - byte_offset,
367            piece.char_count - char_offset,
368        );
369
370        (left, right)
371    }
372
373    /// Check if two pieces can be merged (must be from the same buffer and adjacent)
374    fn can_merge(&self, p1: &Piece, p2: &Piece) -> bool {
375        p1.buffer_type == p2.buffer_type &&
376        p1.buffer_type == BufferType::Add && // Only merge pieces in AddBuffer
377        p1.start + p1.byte_length == p2.start
378    }
379
380    /// Merge two adjacent pieces
381    fn merge_pieces(&self, p1: &Piece, p2: &Piece) -> Piece {
382        Piece::new(
383            p1.buffer_type,
384            p1.start,
385            p1.byte_length + p2.byte_length,
386            p1.char_count + p2.char_count,
387        )
388    }
389
390    /// Try to merge adjacent pieces after insertion
391    fn try_merge_adjacent_pieces(&mut self) {
392        let mut i = 0;
393        while i + 1 < self.pieces.len() {
394            if self.can_merge(&self.pieces[i], &self.pieces[i + 1]) {
395                let merged = self.merge_pieces(&self.pieces[i], &self.pieces[i + 1]);
396                self.pieces.splice(i..=i + 1, vec![merged]);
397            } else {
398                i += 1;
399            }
400        }
401    }
402
403    /// Garbage collection: compact add_buffer, remove unreferenced data
404    pub fn gc(&mut self) {
405        // Collect all referenced fragments in AddBuffer
406        let mut referenced_ranges: Vec<(usize, usize)> = self
407            .pieces
408            .iter()
409            .filter(|p| p.buffer_type == BufferType::Add)
410            .map(|p| (p.start, p.start + p.byte_length))
411            .collect();
412
413        if referenced_ranges.is_empty() {
414            // No references, clear add_buffer
415            self.add_buffer.clear();
416            return;
417        }
418
419        // Sort by start position
420        referenced_ranges.sort_by_key(|r| r.0);
421
422        // Merge overlapping ranges
423        let mut merged_ranges = vec![referenced_ranges[0]];
424        for range in referenced_ranges.iter().skip(1) {
425            let last_idx = merged_ranges.len() - 1;
426            if range.0 <= merged_ranges[last_idx].1 {
427                // Overlapping or adjacent, merge
428                merged_ranges[last_idx].1 = merged_ranges[last_idx].1.max(range.1);
429            } else {
430                merged_ranges.push(*range);
431            }
432        }
433
434        // Build new add_buffer and update piece references
435        let mut new_add_buffer = Vec::new();
436        let mut mappings: Vec<(usize, usize, usize)> = Vec::new(); // (old_start, old_end, new_start)
437
438        for (old_start, old_end) in merged_ranges {
439            let new_start = new_add_buffer.len();
440            new_add_buffer.extend_from_slice(&self.add_buffer[old_start..old_end]);
441            mappings.push((old_start, old_end, new_start));
442        }
443
444        // Update offsets of all AddBuffer pieces (allow piece.start to fall within merged ranges)
445        for piece in &mut self.pieces {
446            if piece.buffer_type != BufferType::Add {
447                continue;
448            }
449
450            // Binary search: find the last mapping where old_start <= piece.start
451            let idx = match mappings.binary_search_by_key(&piece.start, |(s, _, _)| *s) {
452                Ok(exact) => exact,
453                Err(insert_pos) => insert_pos.saturating_sub(1),
454            };
455
456            if let Some((old_start, old_end, new_start)) = mappings.get(idx).copied()
457                && piece.start < old_end
458            {
459                piece.start = new_start + (piece.start - old_start);
460            }
461        }
462
463        self.add_buffer = new_add_buffer;
464        self.operation_count = 0; // Reset counter
465    }
466
467    /// Check if GC needs to be triggered
468    fn check_gc(&mut self) {
469        self.operation_count += 1;
470        if self.operation_count >= self.gc_threshold {
471            self.gc();
472        }
473    }
474
475    /// Set GC threshold
476    pub fn set_gc_threshold(&mut self, threshold: usize) {
477        self.gc_threshold = threshold;
478    }
479}
480
481#[cfg(test)]
482mod tests {
483    use super::*;
484
485    #[test]
486    fn test_new_piece_table() {
487        let pt = PieceTable::new("Hello, World!");
488        assert_eq!(pt.get_text(), "Hello, World!");
489        assert_eq!(pt.char_count(), 13);
490    }
491
492    #[test]
493    fn test_empty_piece_table() {
494        let pt = PieceTable::empty();
495        assert_eq!(pt.get_text(), "");
496        assert_eq!(pt.char_count(), 0);
497    }
498
499    #[test]
500    fn test_insert_at_start() {
501        let mut pt = PieceTable::new("World");
502        pt.insert(0, "Hello, ");
503        assert_eq!(pt.get_text(), "Hello, World");
504    }
505
506    #[test]
507    fn test_insert_at_end() {
508        let mut pt = PieceTable::new("Hello");
509        pt.insert(5, ", World");
510        assert_eq!(pt.get_text(), "Hello, World");
511    }
512
513    #[test]
514    fn test_insert_in_middle() {
515        let mut pt = PieceTable::new("Hlo");
516        pt.insert(1, "el");
517        assert_eq!(pt.get_text(), "Hello");
518    }
519
520    #[test]
521    fn test_delete_at_start() {
522        let mut pt = PieceTable::new("Hello, World");
523        pt.delete(0, 7);
524        assert_eq!(pt.get_text(), "World");
525    }
526
527    #[test]
528    fn test_delete_at_end() {
529        let mut pt = PieceTable::new("Hello, World");
530        pt.delete(5, 7);
531        assert_eq!(pt.get_text(), "Hello");
532    }
533
534    #[test]
535    fn test_delete_in_middle() {
536        let mut pt = PieceTable::new("Hello, World");
537        pt.delete(5, 2);
538        assert_eq!(pt.get_text(), "HelloWorld");
539    }
540
541    #[test]
542    fn test_multiple_operations() {
543        let mut pt = PieceTable::new("Hello");
544        pt.insert(5, " World");
545        pt.insert(5, ",");
546        pt.delete(0, 7);
547        pt.insert(0, "Hi, ");
548        assert_eq!(pt.get_text(), "Hi, World");
549    }
550
551    #[test]
552    fn test_utf8_chinese() {
553        let mut pt = PieceTable::new("你好");
554        assert_eq!(pt.char_count(), 2);
555        assert_eq!(pt.byte_count(), 6);
556
557        pt.insert(1, "们");
558        assert_eq!(pt.get_text(), "你们好");
559        assert_eq!(pt.char_count(), 3);
560    }
561
562    #[test]
563    fn test_utf8_emoji() {
564        let mut pt = PieceTable::new("Hello 👋");
565        pt.insert(6, "World ");
566        assert_eq!(pt.get_text(), "Hello World 👋");
567    }
568
569    #[test]
570    fn test_get_range() {
571        let pt = PieceTable::new("Hello, World!");
572        assert_eq!(pt.get_range(0, 5), "Hello");
573        assert_eq!(pt.get_range(7, 5), "World");
574        assert_eq!(pt.get_range(0, 13), "Hello, World!");
575    }
576
577    #[test]
578    fn test_piece_merging() {
579        let mut pt = PieceTable::new("Hello");
580
581        // 连续插入相邻文本应该被合并
582        let initial_pieces = pt.pieces.len();
583        pt.insert(5, " ");
584        pt.insert(6, "World");
585
586        // 由于合并,pieces 数量应该较少
587        assert_eq!(pt.get_text(), "Hello World");
588        // 验证合并发生了(应该有2个pieces:原始的 "Hello" 和合并后的 " World")
589        assert!(pt.pieces.len() <= initial_pieces + 1);
590    }
591
592    #[test]
593    fn test_gc_basic() {
594        let mut pt = PieceTable::new("Hello");
595
596        // 插入一些文本
597        pt.insert(5, " World");
598        pt.insert(11, "!");
599
600        let add_buffer_size_before = pt.add_buffer.len();
601
602        // 删除一些文本,产生未引用的片段
603        pt.delete(5, 6); // 删除 " World"
604
605        // 手动触发 GC
606        pt.gc();
607
608        // 验证内容不变
609        assert_eq!(pt.get_text(), "Hello!");
610
611        // 验证 add_buffer 被压缩了
612        assert!(pt.add_buffer.len() < add_buffer_size_before);
613    }
614
615    #[test]
616    fn test_gc_multiple_references() {
617        let mut pt = PieceTable::new("ABC");
618
619        // 创建多个引用到 add_buffer 的 pieces
620        pt.insert(1, "1");
621        pt.insert(3, "2");
622        pt.insert(5, "3");
623
624        assert_eq!(pt.get_text(), "A1B2C3");
625
626        // GC 不应该删除被引用的数据
627        pt.gc();
628
629        // 内容应该保持不变
630        assert_eq!(pt.get_text(), "A1B2C3");
631
632        // add_buffer 仍然包含所有被引用的数据
633        assert!(!pt.add_buffer.is_empty());
634    }
635
636    #[test]
637    fn test_auto_gc_trigger() {
638        let mut pt = PieceTable::new("Test");
639        pt.set_gc_threshold(5); // 设置低阈值便于测试
640
641        // 执行多次操作
642        for i in 0..6 {
643            pt.insert(4 + i, "x");
644        }
645
646        // 应该触发了自动 GC(计数器被重置)
647        assert!(pt.operation_count < 6);
648    }
649}