fresh/model/buffer/mod.rs
//! Text buffer that uses `PieceTree` with integrated line tracking.
//!
//! Architecture: the tree is the single source of truth for both text and
//! line information.
3use crate::model::encoding;
4use crate::model::filesystem::{FileSearchOptions, FileSystem};
5use crate::model::piece_tree::{
6 BufferData, BufferLocation, Cursor, PieceInfo, PieceRangeIter, PieceTree, PieceView, Position,
7 StringBuffer, TreeStats,
8};
9use crate::model::piece_tree_diff::PieceTreeDiff;
10use crate::primitives::grapheme;
11use anyhow::{Context, Result};
12use regex::bytes::Regex;
13use std::io;
14
15use std::ops::Range;
16use std::path::{Path, PathBuf};
17use std::sync::Arc;
18
19// Re-export Encoding for backward compatibility
20pub use encoding::Encoding;
21
22pub mod file_kind;
23pub mod format;
24pub mod persistence;
25pub mod save;
26pub mod search;
27pub use file_kind::BufferFileKind;
28pub use format::{BufferFormat, LineEnding};
29pub use persistence::Persistence;
30pub use save::SudoSaveRequired;
31#[cfg(test)]
32pub(crate) use save::{RecipeAction, WriteRecipe};
33#[cfg(test)]
34use search::search_boundary_overlap;
35use search::SearchRegion;
36pub use search::{ChunkedSearchState, HybridSearchPlan};
37
/// Error returned when a large file has a non-resynchronizable encoding
/// and requires user confirmation before loading the entire file into memory.
///
/// Non-resynchronizable encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot
/// determine character boundaries when jumping into the middle of a file.
/// This means the entire file must be loaded and decoded sequentially.
///
/// The value is both an error (it implements `std::error::Error`) and the
/// text of the prompt shown to the user (via its `Display` impl).
#[derive(Debug, Clone, PartialEq)]
pub struct LargeFileEncodingConfirmation {
    /// Path to the file
    pub path: PathBuf,
    /// Size of the file in bytes
    pub file_size: usize,
    /// The detected encoding that requires full loading
    pub encoding: Encoding,
}
53
54impl std::fmt::Display for LargeFileEncodingConfirmation {
55 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56 let size_mb = self.file_size as f64 / (1024.0 * 1024.0);
57 write!(
58 f,
59 "{} ({:.0} MB) requires full load. (l)oad, (e)ncoding, (C)ancel? ",
60 self.encoding.display_name(),
61 size_mb
62 )
63 }
64}
65
// Marker impl with no overridden methods. Being a `std::error::Error` is what
// lets this value be raised through `anyhow::bail!` by the large-file loader.
impl std::error::Error for LargeFileEncodingConfirmation {}
67
/// A work item for incremental line-feed scanning (one per leaf).
///
/// Plain data: it only describes a leaf; it does not hold the leaf's bytes.
#[derive(Debug, Clone)]
pub struct LineScanChunk {
    /// Index of the leaf in the piece tree's leaf array.
    pub leaf_index: usize,
    /// Number of bytes in this leaf.
    pub byte_len: usize,
    /// True if the leaf already had a known line_feed_cnt (no I/O needed).
    pub already_known: bool,
}
78
79// Re-export SearchMatch from filesystem — same type is used by both
80// FileSystem::search_file (project grep on disk) and the piece-tree
81// search below (in-editor Ctrl+F and dirty buffers).
82pub use crate::model::filesystem::SearchMatch;
83
// Large file support configuration

/// Default size in bytes (100 * 1024 * 1024) at or above which a file is
/// treated as "large".
///
/// The file loaders fall back to this value when the caller passes a
/// threshold of `0`; `TextBuffer::check_large_file_encoding` always uses it.
pub const DEFAULT_LARGE_FILE_THRESHOLD: usize = 100 * 1024 * 1024;

/// Chunk size to load when lazy loading (1024 * 1024 bytes)
pub const LOAD_CHUNK_SIZE: usize = 1024 * 1024;

/// Chunk alignment for lazy loading (64 * 1024 bytes)
pub const CHUNK_ALIGNMENT: usize = 64 * 1024;
93
/// Tunable settings handed to `TextBuffer` constructors.
#[derive(Debug, Clone)]
pub struct BufferConfig {
    /// Estimated average line length in bytes. Used for approximate line number
    /// display in large files and for goto-line byte offset estimation.
    pub estimated_line_length: usize,
}

impl Default for BufferConfig {
    /// Default configuration: lines are estimated at 80 bytes each.
    fn default() -> Self {
        let estimated_line_length = 80;
        BufferConfig {
            estimated_line_length,
        }
    }
}
109
// `LineEnding` (the line ending format used in the file) is defined in
// `format.rs` and re-exported near the top of this module.
111
/// Represents a line number (simplified for new implementation)
/// Legacy enum kept for backwards compatibility - always Absolute now
///
/// `format` prints the stored value plus one, so the stored value is treated
/// as zero-based for display purposes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineNumber {
    /// Absolute line number - this is the actual line number in the file
    Absolute(usize),
    /// Relative line number (deprecated - now same as Absolute)
    Relative {
        line: usize,
        from_cached_line: usize,
    },
}

impl LineNumber {
    /// Return the stored line value, whichever variant holds it.
    pub fn value(&self) -> usize {
        match *self {
            LineNumber::Absolute(line) => line,
            LineNumber::Relative { line, .. } => line,
        }
    }

    /// `true` for the `Absolute` variant.
    pub fn is_absolute(&self) -> bool {
        match self {
            LineNumber::Absolute(_) => true,
            LineNumber::Relative { .. } => false,
        }
    }

    /// `true` for the `Relative` variant.
    pub fn is_relative(&self) -> bool {
        !self.is_absolute()
    }

    /// Display form: the stored value plus one, prefixed with `~` when the
    /// number is `Relative`.
    pub fn format(&self) -> String {
        let shown = self.value() + 1;
        if self.is_relative() {
            format!("~{}", shown)
        } else {
            shown.to_string()
        }
    }
}
151
/// A text buffer that manages document content using a piece table
/// with integrated line tracking
///
/// The piece tree describes the document as a sequence of pieces; the bytes
/// those pieces refer to live in `buffers`.
pub struct TextBuffer {
    /// The piece tree for efficient text manipulation with integrated line tracking
    piece_tree: PieceTree,

    /// List of string buffers containing chunks of text data.
    /// Index 0 is typically the original/stored buffer.
    /// Additional buffers are added for modifications.
    buffers: Vec<StringBuffer>,

    /// Next buffer ID to assign.
    next_buffer_id: usize,

    /// Filesystem handle, optional file path, dirty/recovery flags,
    /// saved-root snapshot, and saved-file size — see
    /// `persistence.rs`.
    persistence: Persistence,

    /// File-kind flags (large_file, line_feeds_scanned, is_binary) —
    /// see `file_kind.rs`.
    file_kind: BufferFileKind,

    /// Encoding + line-ending state — see `format.rs`.
    format: BufferFormat,

    /// Monotonic version counter for change tracking.
    /// Starts at 0 and is advanced with `wrapping_add(1)` by `bump_version`.
    version: u64,

    /// Buffer configuration (estimated line length, etc.)
    config: BufferConfig,
}
184
/// Snapshot of a TextBuffer's piece tree and associated string buffers.
///
/// Used by BulkEdit undo/redo to capture the complete buffer state.
/// Without this, consolidate_after_save() would destroy the string buffers
/// that a BulkEdit's piece tree snapshot references, causing corruption on undo.
#[derive(Debug, Clone)]
pub struct BufferSnapshot {
    /// The piece tree as it was when the snapshot was taken.
    pub piece_tree: PieceTree,
    /// The string buffers the snapshotted tree's pieces refer to.
    pub buffers: Vec<StringBuffer>,
    /// The buffer ID counter at snapshot time.
    pub next_buffer_id: usize,
}
196
197impl TextBuffer {
198 /// Create a new text buffer with the given filesystem implementation.
199 /// Note: large_file_threshold is ignored in the new implementation
200 pub fn new(_large_file_threshold: usize, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
201 let piece_tree = PieceTree::empty();
202 let saved_root = piece_tree.root();
203 let line_ending = LineEnding::default();
204 let encoding = Encoding::default();
205 TextBuffer {
206 piece_tree,
207 buffers: vec![StringBuffer::new(0, Vec::new())],
208 next_buffer_id: 1,
209 persistence: Persistence::new(fs, None, saved_root, None),
210 file_kind: BufferFileKind::new(false, false),
211 format: BufferFormat::new(line_ending, encoding),
212 version: 0,
213 config: BufferConfig::default(),
214 }
215 }
216
217 /// Create an empty buffer associated with a file path.
218 /// Used for files that don't exist yet — the path is set so saving will create the file.
219 pub fn new_with_path(
220 large_file_threshold: usize,
221 fs: Arc<dyn FileSystem + Send + Sync>,
222 path: PathBuf,
223 ) -> Self {
224 let mut buffer = Self::new(large_file_threshold, fs);
225 buffer.persistence.set_file_path(path);
226 buffer
227 }
228
    /// Current buffer version (monotonic, wraps on overflow)
    ///
    /// The counter is advanced by `bump_version`, which
    /// `mark_content_modified` calls on every content change.
    pub fn version(&self) -> u64 {
        self.version
    }
233
    /// Get a reference to the filesystem implementation used by this buffer.
    ///
    /// The handle is owned by the buffer's `Persistence` state.
    pub fn filesystem(&self) -> &Arc<dyn FileSystem + Send + Sync> {
        self.persistence.fs()
    }
238
    /// Set the filesystem implementation for this buffer.
    ///
    /// Only the handle stored in `Persistence` is replaced; this method does
    /// not touch the buffer contents, path, or version.
    pub fn set_filesystem(&mut self, fs: Arc<dyn FileSystem + Send + Sync>) {
        self.persistence.set_fs(fs);
    }
243
    /// Advance the change-tracking version by one, wrapping around on `u64`
    /// overflow instead of panicking.
    #[inline]
    fn bump_version(&mut self) {
        self.version = self.version.wrapping_add(1);
    }
248
    /// Record a content change: mark the persistence state dirty, then bump
    /// the version counter.
    #[inline]
    fn mark_content_modified(&mut self) {
        self.persistence.mark_dirty();
        self.bump_version();
    }
254
255 /// Create a text buffer from raw bytes WITHOUT encoding conversion.
256 /// Used for binary files where we want to preserve the exact bytes.
257 fn from_bytes_raw(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
258 let bytes = content.len();
259
260 // For binary files, detect line ending but don't convert encoding
261 let line_ending = format::detect_line_ending(&content);
262
263 // Create initial StringBuffer with ID 0
264 let buffer = StringBuffer::new(0, content);
265 let line_feed_cnt = buffer.line_feed_count();
266
267 let piece_tree = if bytes > 0 {
268 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
269 } else {
270 PieceTree::empty()
271 };
272
273 let saved_root = piece_tree.root();
274
275 TextBuffer {
276 piece_tree,
277 buffers: vec![buffer],
278 next_buffer_id: 1,
279 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
280 file_kind: BufferFileKind::new(false, true),
281 format: BufferFormat::new(line_ending, Encoding::Utf8),
282 version: 0,
283 config: BufferConfig::default(),
284 }
285 }
286
287 /// Create a text buffer from initial content with the given filesystem.
288 pub fn from_bytes(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
289 // Auto-detect encoding and convert to UTF-8 if needed
290 let (encoding, utf8_content) = format::detect_and_convert_encoding(&content);
291
292 let bytes = utf8_content.len();
293
294 // Auto-detect line ending format from content
295 let line_ending = format::detect_line_ending(&utf8_content);
296
297 // Create initial StringBuffer with ID 0
298 let buffer = StringBuffer::new(0, utf8_content);
299 let line_feed_cnt = buffer.line_feed_count();
300
301 let piece_tree = if bytes > 0 {
302 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
303 } else {
304 PieceTree::empty()
305 };
306
307 let saved_root = piece_tree.root();
308
309 TextBuffer {
310 piece_tree,
311 buffers: vec![buffer],
312 next_buffer_id: 1,
313 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
314 file_kind: BufferFileKind::new(false, false),
315 format: BufferFormat::new(line_ending, encoding),
316 version: 0,
317 config: BufferConfig::default(),
318 }
319 }
320
321 /// Create a text buffer from bytes with a specific encoding (no auto-detection).
322 pub fn from_bytes_with_encoding(
323 content: Vec<u8>,
324 encoding: Encoding,
325 fs: Arc<dyn FileSystem + Send + Sync>,
326 ) -> Self {
327 // Convert from specified encoding to UTF-8
328 let utf8_content = encoding::convert_to_utf8(&content, encoding);
329
330 let bytes = utf8_content.len();
331
332 // Auto-detect line ending format from content
333 let line_ending = format::detect_line_ending(&utf8_content);
334
335 // Create initial StringBuffer with ID 0
336 let buffer = StringBuffer::new(0, utf8_content);
337 let line_feed_cnt = buffer.line_feed_count();
338
339 let piece_tree = if bytes > 0 {
340 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
341 } else {
342 PieceTree::empty()
343 };
344
345 let saved_root = piece_tree.root();
346
347 TextBuffer {
348 piece_tree,
349 buffers: vec![buffer],
350 next_buffer_id: 1,
351 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
352 file_kind: BufferFileKind::new(false, false),
353 format: BufferFormat::new(line_ending, encoding),
354 version: 0,
355 config: BufferConfig::default(),
356 }
357 }
358
359 /// Create a text buffer from a string with the given filesystem.
360 pub fn from_str(
361 s: &str,
362 _large_file_threshold: usize,
363 fs: Arc<dyn FileSystem + Send + Sync>,
364 ) -> Self {
365 Self::from_bytes(s.as_bytes().to_vec(), fs)
366 }
367
368 /// Create an empty text buffer with the given filesystem.
369 pub fn empty(fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
370 let piece_tree = PieceTree::empty();
371 let saved_root = piece_tree.root();
372 let line_ending = LineEnding::default();
373 let encoding = Encoding::default();
374 TextBuffer {
375 piece_tree,
376 buffers: vec![StringBuffer::new(0, Vec::new())],
377 next_buffer_id: 1,
378 persistence: Persistence::new(fs, None, saved_root, None),
379 file_kind: BufferFileKind::new(false, false),
380 format: BufferFormat::new(line_ending, encoding),
381 version: 0,
382 config: BufferConfig::default(),
383 }
384 }
385
    /// Load a text buffer from a file using the given filesystem.
    ///
    /// Binary detection is enabled. `large_file_threshold` is a size in
    /// bytes; passing `0` selects `DEFAULT_LARGE_FILE_THRESHOLD`. Files at or
    /// above the threshold take the large-file path, smaller files are read
    /// eagerly.
    ///
    /// # Errors
    ///
    /// Propagates filesystem errors. The large-file path can also fail with a
    /// `LargeFileEncodingConfirmation` when the detected encoding requires a
    /// full load.
    pub fn load_from_file<P: AsRef<Path>>(
        path: P,
        large_file_threshold: usize,
        fs: Arc<dyn FileSystem + Send + Sync>,
    ) -> anyhow::Result<Self> {
        Self::load_from_file_internal(path, large_file_threshold, fs, false)
    }
394
    /// Load a text buffer from a file, forcing it to be treated as text.
    ///
    /// Identical to [`load_from_file`](Self::load_from_file) but skips binary
    /// detection entirely — the buffer is always loaded through the text path
    /// and `is_binary` stays `false`. Used for the terminal scrollback backing
    /// file, whose raw PTY output can contain control bytes that would
    /// otherwise trip binary detection and suppress ANSI-color rendering in
    /// scrollback mode (issue #2449).
    ///
    /// `large_file_threshold` is interpreted exactly as in `load_from_file`
    /// (`0` selects the default threshold).
    pub fn load_from_file_force_text<P: AsRef<Path>>(
        path: P,
        large_file_threshold: usize,
        fs: Arc<dyn FileSystem + Send + Sync>,
    ) -> anyhow::Result<Self> {
        Self::load_from_file_internal(path, large_file_threshold, fs, true)
    }
410
411 fn load_from_file_internal<P: AsRef<Path>>(
412 path: P,
413 large_file_threshold: usize,
414 fs: Arc<dyn FileSystem + Send + Sync>,
415 force_text: bool,
416 ) -> anyhow::Result<Self> {
417 let path = path.as_ref();
418
419 // Get file size to determine loading strategy
420 let metadata = fs.metadata(path)?;
421 let file_size = metadata.size as usize;
422
423 // Use threshold parameter or default
424 let threshold = if large_file_threshold > 0 {
425 large_file_threshold
426 } else {
427 DEFAULT_LARGE_FILE_THRESHOLD
428 };
429
430 // Choose loading strategy based on file size
431 if file_size >= threshold {
432 Self::load_large_file_internal(path, file_size, fs, false, force_text)
433 } else {
434 Self::load_small_file(path, fs, force_text)
435 }
436 }
437
438 /// Load a text buffer from a file with a specific encoding (no auto-detection).
439 pub fn load_from_file_with_encoding<P: AsRef<Path>>(
440 path: P,
441 encoding: Encoding,
442 fs: Arc<dyn FileSystem + Send + Sync>,
443 config: BufferConfig,
444 ) -> anyhow::Result<Self> {
445 let path = path.as_ref();
446 let contents = fs.read_file(path)?;
447
448 let mut buffer = Self::from_bytes_with_encoding(contents, encoding, fs);
449 buffer.persistence.set_file_path(path.to_path_buf());
450 buffer.persistence.clear_modified();
451 buffer.config = config;
452 Ok(buffer)
453 }
454
455 /// Load a small file with full eager loading and line indexing
456 ///
457 /// When `force_text` is true, binary detection is ignored and the file is
458 /// always loaded through the text path (see `load_from_file_force_text`).
459 fn load_small_file(
460 path: &Path,
461 fs: Arc<dyn FileSystem + Send + Sync>,
462 force_text: bool,
463 ) -> anyhow::Result<Self> {
464 let contents = fs.read_file(path)?;
465
466 // Use unified encoding/binary detection
467 let (encoding, detected_binary) = format::detect_encoding_or_binary(&contents, false);
468 let is_binary = detected_binary && !force_text;
469
470 // For binary files, skip encoding conversion to preserve raw bytes
471 let mut buffer = if is_binary {
472 Self::from_bytes_raw(contents, fs)
473 } else {
474 // from_bytes handles encoding detection/conversion and line ending detection
475 Self::from_bytes(contents, fs)
476 };
477 buffer.persistence.set_file_path(path.to_path_buf());
478 buffer.persistence.clear_modified();
479 buffer.file_kind.set_large_file(false);
480 buffer.file_kind.set_binary(is_binary);
481 // For binary files, ensure encoding matches detection
482 if is_binary {
483 buffer.format.set_default_encoding(encoding);
484 }
485 // Note: line_ending and encoding are already set by from_bytes/from_bytes_raw
486 Ok(buffer)
487 }
488
489 /// Check if loading a large file requires user confirmation due to encoding.
490 ///
491 /// Some encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot be "resynchronized" -
492 /// meaning you cannot determine character boundaries when jumping into the middle
493 /// of a file. These encodings require loading the entire file into memory.
494 ///
495 /// Returns `Some(confirmation)` if user confirmation is needed, `None` if the file
496 /// can be loaded with lazy/streaming loading.
497 pub fn check_large_file_encoding(
498 path: impl AsRef<Path>,
499 fs: Arc<dyn FileSystem + Send + Sync>,
500 ) -> anyhow::Result<Option<LargeFileEncodingConfirmation>> {
501 let path = path.as_ref();
502 let metadata = fs.metadata(path)?;
503 let file_size = metadata.size as usize;
504
505 // Only check for large files
506 if file_size < DEFAULT_LARGE_FILE_THRESHOLD {
507 return Ok(None);
508 }
509
510 // Read a sample to detect encoding
511 let sample_size = file_size.min(8 * 1024);
512 let sample = fs.read_range(path, 0, sample_size)?;
513 let (encoding, is_binary) =
514 format::detect_encoding_or_binary(&sample, file_size > sample_size);
515
516 // Binary files don't need confirmation (loaded as-is)
517 if is_binary {
518 return Ok(None);
519 }
520
521 // Check if the encoding requires full file loading
522 if encoding.requires_full_file_load() {
523 return Ok(Some(LargeFileEncodingConfirmation {
524 path: path.to_path_buf(),
525 file_size,
526 encoding,
527 }));
528 }
529
530 Ok(None)
531 }
532
533 /// Load a large file, optionally forcing full load for non-resynchronizable encodings.
534 ///
535 /// Called with `force_full_load=true` after user confirms the warning about
536 /// non-resynchronizable encodings requiring full file loading.
537 pub fn load_large_file_confirmed(
538 path: impl AsRef<Path>,
539 fs: Arc<dyn FileSystem + Send + Sync>,
540 ) -> anyhow::Result<Self> {
541 let path = path.as_ref();
542 let metadata = fs.metadata(path)?;
543 let file_size = metadata.size as usize;
544 Self::load_large_file_internal(path, file_size, fs, true, false)
545 }
546
547 /// Internal implementation for loading large files.
548 ///
549 /// When `force_text` is true, binary detection is ignored and the file is
550 /// always loaded through the text path (see `load_from_file_force_text`).
551 fn load_large_file_internal(
552 path: &Path,
553 file_size: usize,
554 fs: Arc<dyn FileSystem + Send + Sync>,
555 force_full_load: bool,
556 force_text: bool,
557 ) -> anyhow::Result<Self> {
558 use crate::model::piece_tree::{BufferData, BufferLocation};
559
560 // Read a sample of the file to detect encoding and whether it's binary
561 // We read the first 8KB for detection
562 let sample_size = file_size.min(8 * 1024);
563 let sample = fs.read_range(path, 0, sample_size)?;
564
565 // Use unified encoding/binary detection
566 let (encoding, detected_binary) =
567 format::detect_encoding_or_binary(&sample, file_size > sample_size);
568 let is_binary = detected_binary && !force_text;
569
570 // Binary files skip encoding conversion to preserve raw bytes
571 if is_binary {
572 tracing::info!("Large binary file detected, loading without encoding conversion");
573 let contents = fs.read_file(path)?;
574 let mut buffer = Self::from_bytes_raw(contents, fs);
575 buffer.persistence.set_file_path(path.to_path_buf());
576 buffer.persistence.clear_modified();
577 buffer.file_kind.set_large_file(true);
578 buffer.format.set_default_encoding(encoding);
579 return Ok(buffer);
580 }
581
582 // Check if encoding requires full file loading
583 let requires_full_load = encoding.requires_full_file_load();
584
585 // For non-resynchronizable encodings, require confirmation unless forced
586 if requires_full_load && !force_full_load {
587 anyhow::bail!(LargeFileEncodingConfirmation {
588 path: path.to_path_buf(),
589 file_size,
590 encoding,
591 });
592 }
593
594 // For encodings that require full load (non-resynchronizable or non-UTF-8),
595 // load the entire file and convert
596 if !matches!(encoding, Encoding::Utf8 | Encoding::Ascii) {
597 tracing::info!(
598 "Large file with non-UTF-8 encoding ({:?}), loading fully for conversion",
599 encoding
600 );
601 let contents = fs.read_file(path)?;
602 let mut buffer = Self::from_bytes(contents, fs);
603 buffer.persistence.set_file_path(path.to_path_buf());
604 buffer.persistence.clear_modified();
605 buffer.file_kind.set_large_file(true); // Still mark as large file for UI purposes
606 buffer.file_kind.set_binary(is_binary);
607 return Ok(buffer);
608 }
609
610 // UTF-8/ASCII files can use lazy loading
611 let line_ending = format::detect_line_ending(&sample);
612
613 // Create an unloaded buffer that references the entire file
614 let buffer = StringBuffer {
615 id: 0,
616 data: BufferData::Unloaded {
617 file_path: path.to_path_buf(),
618 file_offset: 0,
619 bytes: file_size,
620 },
621 stored_file_offset: None,
622 };
623
624 // Create piece tree with a single piece covering the whole file
625 // No line feed count (None) since we're not computing line indexing
626 let piece_tree = if file_size > 0 {
627 PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
628 } else {
629 PieceTree::empty()
630 };
631 let saved_root = piece_tree.root();
632
633 tracing::debug!(
634 "Buffer::load_from_file: loaded {} bytes, saved_file_size={}",
635 file_size,
636 file_size
637 );
638
639 Ok(TextBuffer {
640 piece_tree,
641 buffers: vec![buffer],
642 next_buffer_id: 1,
643 persistence: Persistence::new(
644 fs,
645 Some(path.to_path_buf()),
646 saved_root,
647 Some(file_size),
648 ),
649 file_kind: BufferFileKind::new(true, is_binary),
650 format: BufferFormat::new(line_ending, encoding),
651 version: 0,
652 config: BufferConfig::default(),
653 })
654 }
655
656 /// Save the buffer to its associated file
657 pub fn save(&mut self) -> anyhow::Result<()> {
658 if let Some(path) = self.persistence.file_path_owned() {
659 self.save_to_file(path)
660 } else {
661 anyhow::bail!(io::Error::new(
662 io::ErrorKind::NotFound,
663 "No file path associated with buffer",
664 ))
665 }
666 }
667
    /// Build a write recipe from the piece tree for saving.
    ///
    /// Delegates to `save::build_write_recipe`, passing the buffer's piece
    /// tree, string buffers, format, file-kind and persistence state.
    /// Compiled only for tests.
    #[cfg(test)]
    pub(crate) fn build_write_recipe(&self) -> io::Result<WriteRecipe> {
        save::build_write_recipe(
            &self.piece_tree,
            &self.buffers,
            &self.format,
            &self.file_kind,
            &self.persistence,
        )
    }
681
    /// Save the buffer to a specific file
    ///
    /// Uses the write recipe approach for both local and remote filesystems:
    /// - Copy ops reference unchanged regions in the source file
    /// - Insert ops contain new/modified data
    ///
    /// For remote filesystems, the recipe is sent to the agent which reconstructs
    /// the file server-side, avoiding transfer of unchanged content.
    ///
    /// For local filesystems with ownership concerns (file owned by another user),
    /// uses in-place writing to preserve ownership. Otherwise uses atomic writes.
    ///
    /// If the line ending format has been changed (via set_line_ending), all content
    /// will be converted to the new format during save.
    ///
    /// On success the buffer's saved state is updated through `finalize_save`
    /// (path, saved size, consolidated piece tree, saved snapshot).
    ///
    /// # Errors
    ///
    /// Propagates recipe-building and filesystem errors. For a local write
    /// that fails with `PermissionDenied`, the content is first written to a
    /// temp file and the error produced by `save::make_sudo_error` is returned
    /// instead, so the caller can retry the save with elevated privileges.
    pub fn save_to_file<P: AsRef<Path>>(&mut self, path: P) -> anyhow::Result<()> {
        let dest_path = path.as_ref();
        let total = self.total_bytes();

        // Handle empty files
        // NOTE(review): this shortcut writes directly and skips both the
        // in-place (ownership-preserving) path and the PermissionDenied/sudo
        // fallback used below — confirm that is intended for empty buffers.
        if total == 0 {
            self.persistence.fs().write_file(dest_path, &[])?;
            self.finalize_save(dest_path)?;
            return Ok(());
        }

        // Build the write recipe (unified for all filesystem types)
        let recipe = save::build_write_recipe(
            &self.piece_tree,
            &self.buffers,
            &self.format,
            &self.file_kind,
            &self.persistence,
        )?;
        let ops = recipe.to_write_ops();

        // Check if we need in-place writing to preserve file ownership (local only)
        // Remote filesystems handle this differently
        let fs = self.persistence.fs();
        let is_local = fs.remote_connection_info().is_none();
        let use_inplace = is_local && save::should_use_inplace_write(fs, dest_path);

        if use_inplace {
            // In-place write: write directly to preserve ownership
            save::save_with_inplace_write(fs, dest_path, &recipe)?;
        } else if !recipe.has_copy_ops() && !is_local {
            // Remote with no Copy ops: use write_file directly (more efficient)
            let data = recipe.flatten_inserts();
            fs.write_file(dest_path, &data)?;
        } else if is_local {
            // Local: use write_file or write_patched with sudo fallback
            let write_result = if !recipe.has_copy_ops() {
                let data = recipe.flatten_inserts();
                fs.write_file(dest_path, &data)
            } else {
                // Copy ops read from the recipe's source path; fall back to
                // the destination when the recipe does not name one.
                let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
                fs.write_patched(src_for_patch, dest_path, &ops)
            };

            if let Err(e) = write_result {
                if e.kind() == io::ErrorKind::PermissionDenied {
                    // Create temp file and return sudo error
                    // The destination's metadata is captured first so it can
                    // be carried in the sudo error.
                    let original_metadata = fs.metadata_if_exists(dest_path);
                    let (temp_path, mut temp_file) = save::create_temp_file(fs, dest_path)?;
                    save::write_recipe_to_file(fs, &mut temp_file, &recipe)?;
                    temp_file.sync_all()?;
                    drop(temp_file);
                    return Err(save::make_sudo_error(
                        temp_path,
                        dest_path,
                        original_metadata,
                    ));
                }
                return Err(e.into());
            }
        } else {
            // Remote with Copy ops: use write_patched
            let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
            fs.write_patched(src_for_patch, dest_path, &ops)?;
        }

        self.finalize_save(dest_path)?;
        Ok(())
    }
765
766 /// Finalize save state after successful write.
767 fn finalize_save(&mut self, dest_path: &Path) -> anyhow::Result<()> {
768 let new_size = self.persistence.fs().metadata(dest_path)?.size as usize;
769 tracing::debug!(
770 "Buffer::save: updating saved_file_size from {:?} to {}",
771 self.persistence.saved_file_size(),
772 new_size
773 );
774 self.persistence.set_saved_file_size(Some(new_size));
775 self.persistence.set_file_path(dest_path.to_path_buf());
776
777 // Consolidate the piece tree to synchronize with disk (for large files)
778 // or to simplify structure (for small files).
779 self.consolidate_after_save(dest_path, new_size);
780
781 self.mark_saved_snapshot();
782 self.format.promote_current_to_original();
783 Ok(())
784 }
785
786 /// Finalize buffer state after an external save operation (e.g., via sudo).
787 ///
788 /// This updates the saved snapshot and file size to match the new state on disk.
789 pub fn finalize_external_save(&mut self, dest_path: PathBuf) -> anyhow::Result<()> {
790 let new_size = self.persistence.fs().metadata(&dest_path)?.size as usize;
791 self.persistence.set_saved_file_size(Some(new_size));
792 self.persistence.set_file_path(dest_path.clone());
793
794 // Consolidate the piece tree to synchronize with disk or simplify structure.
795 self.consolidate_after_save(&dest_path, new_size);
796
797 self.mark_saved_snapshot();
798 self.format.promote_current_to_original();
799 Ok(())
800 }
801
802 /// Consolidate the piece tree into a single piece.
803 /// For large files, this creates a reference to the disk file to save memory and sync offsets.
804 /// For small files, this flattens all edits into a single in-memory buffer.
805 fn consolidate_after_save(&mut self, path: &Path, file_size: usize) {
806 if self.file_kind.is_large_file() {
807 self.consolidate_large_file(path, file_size);
808 } else {
809 self.consolidate_small_file();
810 }
811 }
812
813 /// Consolidate large file piece tree into a single piece pointing to the new file.
814 /// This ensures that subsequent operations correctly reference the new content and offsets.
815 /// Preserves total line feed count from the old tree if a scan was previously done.
816 fn consolidate_large_file(&mut self, path: &Path, file_size: usize) {
817 // Preserve line feed count from the old tree if we had scanned it
818 let preserved_lf = if self.file_kind.has_line_feed_scan() {
819 self.piece_tree.line_count().map(|c| c.saturating_sub(1))
820 } else {
821 None
822 };
823
824 let buffer = StringBuffer {
825 id: 0,
826 data: BufferData::Unloaded {
827 file_path: path.to_path_buf(),
828 file_offset: 0,
829 bytes: file_size,
830 },
831 stored_file_offset: None,
832 };
833
834 self.piece_tree = if file_size > 0 {
835 PieceTree::new(BufferLocation::Stored(0), 0, file_size, preserved_lf)
836 } else {
837 PieceTree::empty()
838 };
839
840 self.buffers = vec![buffer];
841 self.next_buffer_id = 1;
842
843 tracing::debug!(
844 "Buffer::consolidate_large_file: consolidated into single piece of {} bytes",
845 file_size
846 );
847 }
848
849 /// Consolidate small file edits into a single in-memory buffer and re-index lines.
850 fn consolidate_small_file(&mut self) {
851 if let Some(bytes) = self.get_all_text() {
852 let line_feed_cnt = bytes.iter().filter(|&&b| b == b'\n').count();
853 let len = bytes.len();
854
855 // Create a single loaded buffer with line indexing
856 let buffer = StringBuffer::new_loaded(0, bytes, true);
857
858 self.piece_tree = if len > 0 {
859 PieceTree::new(BufferLocation::Stored(0), 0, len, Some(line_feed_cnt))
860 } else {
861 PieceTree::empty()
862 };
863
864 self.buffers = vec![buffer];
865 self.next_buffer_id = 1;
866
867 tracing::debug!(
868 "Buffer::consolidate_small_file: consolidated into single loaded buffer of {} bytes",
869 len
870 );
871 }
872 }
873
    /// Get the total number of bytes in the document
    ///
    /// The value is reported by the piece tree.
    pub fn total_bytes(&self) -> usize {
        self.piece_tree.total_bytes()
    }
878
    /// Get the total number of lines in the document
    ///
    /// Uses the piece tree's integrated line tracking.
    /// Returns `None` if the line count is unknown (e.g., for large files
    /// without line indexing).
    pub fn line_count(&self) -> Option<usize> {
        self.piece_tree.line_count()
    }
885
    /// Snapshot the current tree as the saved baseline
    ///
    /// Delegates to `Persistence::mark_saved_snapshot` with the current piece
    /// tree; `diff_since_saved` compares against this baseline.
    pub fn mark_saved_snapshot(&mut self) {
        self.persistence.mark_saved_snapshot(&self.piece_tree);
    }
890
    /// Refresh the saved root to match the current tree structure without
    /// clearing the modified flag. Call this after structural-only changes
    /// (e.g. chunk_split_and_load during search scan) so that
    /// `diff_since_saved()` can take the fast `Arc::ptr_eq` path.
    ///
    /// The decision whether to refresh is made inside
    /// `Persistence::refresh_saved_root_if_unmodified`.
    pub fn refresh_saved_root_if_unmodified(&mut self) {
        self.persistence
            .refresh_saved_root_if_unmodified(&self.piece_tree);
    }
899
    /// Diff the current piece tree against the last saved snapshot.
    ///
    /// See `Persistence::diff_since_saved` for the algorithm.
    ///
    /// The whole call runs inside a `diff_since_saved` tracing span that
    /// records whether the buffer is a large file, whether it is modified, and
    /// whether line feeds have been scanned.
    pub fn diff_since_saved(&self) -> PieceTreeDiff {
        // The guard keeps the span entered until the function returns.
        let _span = tracing::info_span!(
            "diff_since_saved",
            large_file = self.file_kind.is_large_file(),
            modified = self.persistence.is_modified(),
            lf_scanned = self.file_kind.has_line_feed_scan()
        )
        .entered();

        self.persistence
            .diff_since_saved(&self.piece_tree, &self.buffers)
    }
915
916 /// Convert a byte offset to a line/column position
917 pub fn offset_to_position(&self, offset: usize) -> Option<Position> {
918 self.piece_tree
919 .offset_to_position(offset, &self.buffers)
920 .map(|(line, column)| Position { line, column })
921 }
922
923 /// Convert a line/column position to a byte offset
924 pub fn position_to_offset(&self, position: Position) -> usize {
925 self.piece_tree
926 .position_to_offset(position.line, position.column, &self.buffers)
927 }
928
929 /// Insert text at the given byte offset
930 pub fn insert_bytes(&mut self, offset: usize, text: Vec<u8>) -> Cursor {
931 if text.is_empty() {
932 return self.piece_tree.cursor_at_offset(offset);
933 }
934
935 // Mark as modified (updates version)
936 self.mark_content_modified();
937
938 // Count line feeds in the text to insert
939 let line_feed_cnt = Some(text.iter().filter(|&&b| b == b'\n').count());
940
941 // Optimization: try to append to existing buffer if insertion is at piece boundary
942 let (buffer_location, buffer_offset, text_len) =
943 if let Some(append_info) = self.try_append_to_existing_buffer(offset, &text) {
944 append_info
945 } else {
946 // Create a new StringBuffer for this insertion
947 let buffer_id = self.next_buffer_id;
948 self.next_buffer_id += 1;
949 let buffer = StringBuffer::new(buffer_id, text.clone());
950 self.buffers.push(buffer);
951 (BufferLocation::Added(buffer_id), 0, text.len())
952 };
953
954 // When line feeds have been scanned, ensure the chunk at the insertion
955 // point is loaded so compute_line_feeds_static can recount during splits.
956 if self.file_kind.has_line_feed_scan() {
957 self.ensure_chunk_loaded_at(offset);
958 }
959
960 // Update piece tree (need to pass buffers reference)
961 self.piece_tree.insert(
962 offset,
963 buffer_location,
964 buffer_offset,
965 text_len,
966 line_feed_cnt,
967 &self.buffers,
968 )
969 }
970
971 /// Try to append to an existing buffer if insertion point aligns with buffer end
972 /// Returns (BufferLocation, buffer_offset, text_len) if append succeeds, None otherwise
973 fn try_append_to_existing_buffer(
974 &mut self,
975 offset: usize,
976 text: &[u8],
977 ) -> Option<(BufferLocation, usize, usize)> {
978 // Only optimize for non-empty insertions after existing content
979 if text.is_empty() || offset == 0 {
980 return None;
981 }
982
983 // Find the piece containing the byte just before the insertion point
984 // This avoids the saturating_sub issue
985 let piece_info = self.piece_tree.find_by_offset(offset - 1)?;
986
987 // Check if insertion is exactly at the end of this piece
988 // offset_in_piece tells us where (offset-1) is within the piece
989 // For insertion to be at piece end, (offset-1) must be the last byte
990 let offset_in_piece = piece_info.offset_in_piece?;
991 if offset_in_piece + 1 != piece_info.bytes {
992 return None; // Not at the end of the piece
993 }
994
995 // Only append to "Added" buffers (not original Stored buffers)
996 if !matches!(piece_info.location, BufferLocation::Added(_)) {
997 return None;
998 }
999
1000 let buffer_id = piece_info.location.buffer_id();
1001 let buffer = self.buffers.get_mut(buffer_id)?;
1002
1003 // Check if buffer is loaded
1004 let buffer_len = buffer.get_data()?.len();
1005
1006 // Check if this piece ends exactly at the end of its buffer
1007 if piece_info.offset + piece_info.bytes != buffer_len {
1008 return None;
1009 }
1010
1011 // Perfect! Append to this buffer
1012 let append_offset = buffer.append(text);
1013
1014 Some((piece_info.location, append_offset, text.len()))
1015 }
1016
1017 /// Insert text (from &str) at the given byte offset
1018 pub fn insert(&mut self, offset: usize, text: &str) {
1019 self.insert_bytes(offset, text.as_bytes().to_vec());
1020 }
1021
1022 /// Insert text at a line/column position
1023 /// This now uses the optimized piece_tree.insert_at_position() for a single traversal
1024 pub fn insert_at_position(&mut self, position: Position, text: Vec<u8>) -> Cursor {
1025 if text.is_empty() {
1026 let offset = self.position_to_offset(position);
1027 return self.piece_tree.cursor_at_offset(offset);
1028 }
1029
1030 self.mark_content_modified();
1031
1032 // Count line feeds in the text to insert
1033 let line_feed_cnt = text.iter().filter(|&&b| b == b'\n').count();
1034
1035 // Create a new StringBuffer for this insertion
1036 let buffer_id = self.next_buffer_id;
1037 self.next_buffer_id += 1;
1038 let buffer = StringBuffer::new(buffer_id, text.clone());
1039 self.buffers.push(buffer);
1040
1041 // Use the optimized position-based insertion (single traversal)
1042 self.piece_tree.insert_at_position(
1043 position.line,
1044 position.column,
1045 BufferLocation::Added(buffer_id),
1046 0,
1047 text.len(),
1048 line_feed_cnt,
1049 &self.buffers,
1050 )
1051 }
1052
1053 /// Delete text starting at the given byte offset
1054 pub fn delete_bytes(&mut self, offset: usize, bytes: usize) {
1055 if bytes == 0 || offset >= self.total_bytes() {
1056 return;
1057 }
1058
1059 // When line feeds have been scanned, ensure chunks at delete boundaries
1060 // are loaded so compute_line_feeds_static can recount during splits.
1061 if self.file_kind.has_line_feed_scan() {
1062 self.ensure_chunk_loaded_at(offset);
1063 let end = (offset + bytes).min(self.total_bytes());
1064 if end > offset {
1065 self.ensure_chunk_loaded_at(end.saturating_sub(1));
1066 }
1067 }
1068
1069 // Update piece tree
1070 self.piece_tree.delete(offset, bytes, &self.buffers);
1071
1072 self.mark_content_modified();
1073 }
1074
1075 /// Delete text in a range
1076 pub fn delete(&mut self, range: Range<usize>) {
1077 if range.end > range.start {
1078 self.delete_bytes(range.start, range.end - range.start);
1079 }
1080 }
1081
1082 /// Delete text in a line/column range
1083 /// This now uses the optimized piece_tree.delete_position_range() for a single traversal
1084 pub fn delete_range(&mut self, start: Position, end: Position) {
1085 // Use the optimized position-based deletion
1086 self.piece_tree.delete_position_range(
1087 start.line,
1088 start.column,
1089 end.line,
1090 end.column,
1091 &self.buffers,
1092 );
1093 self.mark_content_modified();
1094 }
1095
1096 /// Replace the entire buffer content with new content
1097 /// This is an O(n) operation that rebuilds the piece tree in a single pass,
1098 /// avoiding the O(n²) complexity of applying individual edits.
1099 ///
1100 /// This is used for bulk operations like "replace all" where applying
1101 /// individual edits would be prohibitively slow.
1102 pub fn replace_content(&mut self, new_content: &str) {
1103 let bytes = new_content.len();
1104 let content_bytes = new_content.as_bytes().to_vec();
1105
1106 // Count line feeds in the new content
1107 let line_feed_cnt = content_bytes.iter().filter(|&&b| b == b'\n').count();
1108
1109 // Create a new StringBuffer for the new content
1110 let buffer_id = self.next_buffer_id;
1111 self.next_buffer_id += 1;
1112 let buffer = StringBuffer::new(buffer_id, content_bytes);
1113 self.buffers.push(buffer);
1114
1115 // Rebuild the piece tree with a single piece containing all the new content
1116 if bytes > 0 {
1117 self.piece_tree = PieceTree::new(
1118 BufferLocation::Added(buffer_id),
1119 0,
1120 bytes,
1121 Some(line_feed_cnt),
1122 );
1123 } else {
1124 self.piece_tree = PieceTree::empty();
1125 }
1126
1127 self.mark_content_modified();
1128 }
1129
1130 /// Restore a previously saved buffer state (for undo/redo of BulkEdit).
1131 ///
1132 /// This restores the piece tree AND the buffers list, which is critical
1133 /// because consolidate_after_save() replaces self.buffers. Without restoring
1134 /// buffers, the piece tree would reference buffer IDs that no longer exist.
1135 pub fn restore_buffer_state(&mut self, snapshot: &BufferSnapshot) {
1136 self.piece_tree = snapshot.piece_tree.clone();
1137 self.buffers = snapshot.buffers.clone();
1138 self.next_buffer_id = snapshot.next_buffer_id;
1139 self.mark_content_modified();
1140 }
1141
1142 /// Snapshot the current buffer state (piece tree + buffers) for BulkEdit undo/redo.
1143 ///
1144 /// The snapshot includes buffers because consolidate_after_save() can replace
1145 /// self.buffers between the snapshot and restore, which would otherwise cause
1146 /// the restored piece tree to reference nonexistent buffer IDs.
1147 pub fn snapshot_buffer_state(&self) -> Arc<BufferSnapshot> {
1148 Arc::new(BufferSnapshot {
1149 piece_tree: self.piece_tree.clone(),
1150 buffers: self.buffers.clone(),
1151 next_buffer_id: self.next_buffer_id,
1152 })
1153 }
1154
1155 /// Apply bulk edits efficiently in a single pass
1156 /// Returns the net change in bytes
1157 pub fn apply_bulk_edits(&mut self, edits: &[(usize, usize, &str)]) -> isize {
1158 // Pre-allocate buffers for all insert texts (only non-empty texts)
1159 // This avoids the borrow conflict in the closure
1160 // IMPORTANT: Only add entries for non-empty texts because the closure
1161 // is only called for edits with non-empty insert text
1162 let mut buffer_info: Vec<(BufferLocation, usize, usize, Option<usize>)> = Vec::new();
1163
1164 for (_, _, text) in edits {
1165 if !text.is_empty() {
1166 let buffer_id = self.next_buffer_id;
1167 self.next_buffer_id += 1;
1168 let content = text.as_bytes().to_vec();
1169 let lf_cnt = content.iter().filter(|&&b| b == b'\n').count();
1170 let bytes = content.len();
1171 let buffer = StringBuffer::new(buffer_id, content);
1172 self.buffers.push(buffer);
1173 buffer_info.push((BufferLocation::Added(buffer_id), 0, bytes, Some(lf_cnt)));
1174 }
1175 // No placeholder for empty texts - the closure is only called for non-empty texts
1176 }
1177
1178 // Now call apply_bulk_edits with a simple index-based closure
1179 let mut idx = 0;
1180 let delta = self
1181 .piece_tree
1182 .apply_bulk_edits(edits, &self.buffers, |_text| {
1183 let info = buffer_info[idx];
1184 idx += 1;
1185 info
1186 });
1187
1188 self.mark_content_modified();
1189 delta
1190 }
1191
1192 /// Get text from a byte offset range
1193 /// This now uses the optimized piece_tree.iter_pieces_in_range() for a single traversal
1194 /// Get text from a byte offset range (read-only)
1195 /// Returns None if any buffer in the range is unloaded
1196 /// PRIVATE: External code should use get_text_range_mut() which handles lazy loading
1197 fn get_text_range(&self, offset: usize, bytes: usize) -> Option<Vec<u8>> {
1198 if bytes == 0 {
1199 return Some(Vec::new());
1200 }
1201
1202 let mut result = Vec::with_capacity(bytes);
1203 let end_offset = offset + bytes;
1204 let mut collected = 0;
1205
1206 // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1207 for piece_view in self.piece_tree.iter_pieces_in_range(offset, end_offset) {
1208 let buffer_id = piece_view.location.buffer_id();
1209 if let Some(buffer) = self.buffers.get(buffer_id) {
1210 // Calculate the range to read from this piece
1211 let piece_start_in_doc = piece_view.doc_offset;
1212 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1213
1214 // Clip to the requested range
1215 let read_start = offset.max(piece_start_in_doc);
1216 let read_end = end_offset.min(piece_end_in_doc);
1217
1218 if read_end > read_start {
1219 let offset_in_piece = read_start - piece_start_in_doc;
1220 let bytes_to_read = read_end - read_start;
1221
1222 let buffer_start = piece_view.buffer_offset + offset_in_piece;
1223 let buffer_end = buffer_start + bytes_to_read;
1224
1225 // Return None if buffer is unloaded (type-safe)
1226 let data = buffer.get_data()?;
1227
1228 if buffer_end <= data.len() {
1229 result.extend_from_slice(&data[buffer_start..buffer_end]);
1230 collected += bytes_to_read;
1231
1232 if collected >= bytes {
1233 break;
1234 }
1235 }
1236 }
1237 }
1238 }
1239
1240 Some(result)
1241 }
1242
1243 /// Get text from a byte offset range with lazy loading
1244 /// This will load unloaded chunks on-demand and always returns complete data
1245 ///
1246 /// Returns an error if loading fails or if data cannot be read for any reason.
1247 ///
1248 /// NOTE: Currently loads entire buffers on-demand. Future optimization would split
1249 /// large pieces and load only LOAD_CHUNK_SIZE chunks at a time.
1250 pub fn get_text_range_mut(&mut self, offset: usize, bytes: usize) -> Result<Vec<u8>> {
1251 let _span = tracing::info_span!("get_text_range_mut", offset, bytes).entered();
1252 if bytes == 0 {
1253 return Ok(Vec::new());
1254 }
1255
1256 let mut result = Vec::with_capacity(bytes);
1257 // Clamp end_offset to buffer length to handle reads beyond EOF
1258 let end_offset = (offset + bytes).min(self.len());
1259 let mut current_offset = offset;
1260 let mut iteration_count = 0u32;
1261
1262 // Keep iterating until we've collected all requested bytes
1263 while current_offset < end_offset {
1264 iteration_count += 1;
1265 let mut made_progress = false;
1266 let mut restarted_iteration = false;
1267
1268 // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1269 for piece_view in self
1270 .piece_tree
1271 .iter_pieces_in_range(current_offset, end_offset)
1272 {
1273 let buffer_id = piece_view.location.buffer_id();
1274
1275 // Check if buffer needs loading
1276 let needs_loading = self
1277 .buffers
1278 .get(buffer_id)
1279 .map(|b| !b.is_loaded())
1280 .unwrap_or(false);
1281
1282 if needs_loading && self.chunk_split_and_load(&piece_view, current_offset)? {
1283 restarted_iteration = true;
1284 break;
1285 }
1286
1287 // Calculate the range to read from this piece
1288 let piece_start_in_doc = piece_view.doc_offset;
1289 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1290
1291 // Clip to the requested range
1292 let read_start = current_offset.max(piece_start_in_doc);
1293 let read_end = end_offset.min(piece_end_in_doc);
1294
1295 if read_end > read_start {
1296 let offset_in_piece = read_start - piece_start_in_doc;
1297 let bytes_to_read = read_end - read_start;
1298
1299 let buffer_start = piece_view.buffer_offset + offset_in_piece;
1300 let buffer_end = buffer_start + bytes_to_read;
1301
1302 // Buffer should be loaded now
1303 let buffer = self.buffers.get(buffer_id).context("Buffer not found")?;
1304 let data = buffer
1305 .get_data()
1306 .context("Buffer data unavailable after load")?;
1307
1308 anyhow::ensure!(
1309 buffer_end <= data.len(),
1310 "Buffer range out of bounds: requested {}..{}, buffer size {}",
1311 buffer_start,
1312 buffer_end,
1313 data.len()
1314 );
1315
1316 result.extend_from_slice(&data[buffer_start..buffer_end]);
1317 current_offset = read_end;
1318 made_progress = true;
1319 }
1320 }
1321
1322 // If we didn't make progress and didn't restart iteration, this is an error
1323 if !made_progress && !restarted_iteration {
1324 tracing::error!(
1325 "get_text_range_mut: No progress at offset {} (requested range: {}..{}, buffer len: {})",
1326 current_offset,
1327 offset,
1328 end_offset,
1329 self.len()
1330 );
1331 tracing::error!(
1332 "Piece tree stats: {} total bytes",
1333 self.piece_tree.stats().total_bytes
1334 );
1335 anyhow::bail!(
1336 "Failed to read data at offset {}: no progress made (requested {}..{}, buffer len: {})",
1337 current_offset,
1338 offset,
1339 end_offset,
1340 self.len()
1341 );
1342 }
1343 }
1344
1345 if iteration_count > 1 {
1346 tracing::info!(
1347 iteration_count,
1348 result_len = result.len(),
1349 "get_text_range_mut: completed with multiple iterations"
1350 );
1351 }
1352
1353 Ok(result)
1354 }
1355
1356 /// Prepare a viewport for rendering
1357 ///
1358 /// This is called before rendering with &mut access to pre-load all data
1359 /// that will be needed for the viewport. It estimates the number of bytes
1360 /// needed based on the line count and pre-loads them.
1361 ///
1362 /// # Arguments
1363 /// * `start_offset` - The byte offset where the viewport starts
1364 /// * `line_count` - The number of lines to prepare (estimate)
1365 ///
1366 /// # Returns
1367 /// Ok(()) if preparation succeeded, Err if loading failed
1368 pub fn prepare_viewport(&mut self, start_offset: usize, line_count: usize) -> Result<()> {
1369 let _span = tracing::info_span!("prepare_viewport", start_offset, line_count).entered();
1370 // Estimate how many bytes we need (pessimistic assumption)
1371 // Average line length is typically 80-100 bytes, but we use 200 to be safe
1372 let estimated_bytes = line_count.saturating_mul(200);
1373
1374 // Cap the estimate at the remaining bytes in the document
1375 let remaining_bytes = self.total_bytes().saturating_sub(start_offset);
1376 let bytes_to_load = estimated_bytes.min(remaining_bytes);
1377 tracing::trace!(
1378 bytes_to_load,
1379 total_bytes = self.total_bytes(),
1380 "prepare_viewport loading"
1381 );
1382
1383 // Pre-load with full chunk-splitting support
1384 // This may load more than we need, but ensures all data is available
1385 self.get_text_range_mut(start_offset, bytes_to_load)?;
1386
1387 Ok(())
1388 }
1389
1390 /// Split a piece that references a large unloaded buffer, create a chunk
1391 /// buffer for the region around `current_offset`, and load it.
1392 ///
1393 /// Returns `true` if the piece tree was modified (caller must restart its
1394 /// iteration), `false` if the piece was small enough to load in-place.
1395 fn chunk_split_and_load(
1396 &mut self,
1397 piece_view: &PieceView,
1398 current_offset: usize,
1399 ) -> Result<bool> {
1400 let buffer_id = piece_view.location.buffer_id();
1401
1402 // The underlying buffer may be much larger than this piece (e.g. the
1403 // whole-file Stored buffer after rebuild_with_pristine_saved_root).
1404 // We must chunk-split if either the piece or its buffer exceeds
1405 // LOAD_CHUNK_SIZE, because `load()` loads the entire buffer.
1406 let buffer_bytes = self
1407 .buffers
1408 .get(buffer_id)
1409 .and_then(|b| b.unloaded_bytes())
1410 .unwrap_or(0);
1411 let needs_chunk_split =
1412 piece_view.bytes > LOAD_CHUNK_SIZE || buffer_bytes > piece_view.bytes;
1413
1414 tracing::info!(
1415 buffer_id,
1416 piece_bytes = piece_view.bytes,
1417 buffer_bytes,
1418 needs_chunk_split,
1419 piece_doc_offset = piece_view.doc_offset,
1420 current_offset,
1421 "chunk_split_and_load: loading unloaded piece"
1422 );
1423
1424 if !needs_chunk_split {
1425 // Piece is small enough and its buffer matches — load in-place.
1426 let _span = tracing::info_span!(
1427 "load_small_buffer",
1428 piece_bytes = piece_view.bytes,
1429 buffer_id,
1430 )
1431 .entered();
1432 self.buffers
1433 .get_mut(buffer_id)
1434 .context("Buffer not found")?
1435 .load(&**self.persistence.fs())
1436 .context("Failed to load buffer")?;
1437 return Ok(false);
1438 }
1439
1440 let _span = tracing::info_span!(
1441 "chunk_split_and_load",
1442 piece_bytes = piece_view.bytes,
1443 buffer_id,
1444 )
1445 .entered();
1446
1447 let piece_start_in_doc = piece_view.doc_offset;
1448 let offset_in_piece = current_offset.saturating_sub(piece_start_in_doc);
1449
1450 // When the piece already fits within LOAD_CHUNK_SIZE, create a chunk
1451 // buffer for the exact piece range (no alignment/splitting needed).
1452 // Alignment rounding is only useful when carving a sub-range out of a
1453 // piece larger than LOAD_CHUNK_SIZE.
1454 let (chunk_start_in_buffer, chunk_bytes) = if piece_view.bytes <= LOAD_CHUNK_SIZE {
1455 (piece_view.buffer_offset, piece_view.bytes)
1456 } else {
1457 let start =
1458 (piece_view.buffer_offset + offset_in_piece) / CHUNK_ALIGNMENT * CHUNK_ALIGNMENT;
1459 let bytes = LOAD_CHUNK_SIZE
1460 .min((piece_view.buffer_offset + piece_view.bytes).saturating_sub(start));
1461 (start, bytes)
1462 };
1463
1464 // Calculate document offsets for splitting
1465 let chunk_start_offset_in_piece =
1466 chunk_start_in_buffer.saturating_sub(piece_view.buffer_offset);
1467 let split_start_in_doc = piece_start_in_doc + chunk_start_offset_in_piece;
1468 let split_end_in_doc = split_start_in_doc + chunk_bytes;
1469
1470 // Split the piece to isolate the chunk
1471 if chunk_start_offset_in_piece > 0 {
1472 self.piece_tree
1473 .split_at_offset(split_start_in_doc, &self.buffers);
1474 }
1475 if split_end_in_doc < piece_start_in_doc + piece_view.bytes {
1476 self.piece_tree
1477 .split_at_offset(split_end_in_doc, &self.buffers);
1478 }
1479
1480 // Create a new buffer for this chunk
1481 let chunk_buffer = self
1482 .buffers
1483 .get(buffer_id)
1484 .context("Buffer not found")?
1485 .create_chunk_buffer(self.next_buffer_id, chunk_start_in_buffer, chunk_bytes)
1486 .context("Failed to create chunk buffer")?;
1487
1488 self.next_buffer_id += 1;
1489 let new_buffer_id = chunk_buffer.id;
1490 self.buffers.push(chunk_buffer);
1491
1492 // Update the piece to reference the new chunk buffer
1493 self.piece_tree.replace_buffer_reference(
1494 buffer_id,
1495 piece_view.buffer_offset + chunk_start_offset_in_piece,
1496 chunk_bytes,
1497 BufferLocation::Added(new_buffer_id),
1498 );
1499
1500 // Load the chunk buffer
1501 self.buffers
1502 .get_mut(new_buffer_id)
1503 .context("Chunk buffer not found")?
1504 .load(&**self.persistence.fs())
1505 .context("Failed to load chunk")?;
1506
1507 // split_at_offset uses compute_line_feeds_static which returns None
1508 // for unloaded buffers, destroying the scanned line feed counts.
1509 // Fix up: the loaded chunk is counted from memory, remaining unloaded
1510 // pieces use the filesystem's count_line_feeds_in_range.
1511 if self.file_kind.has_line_feed_scan() {
1512 let leaves = self.piece_tree.get_leaves();
1513 let mut fixups: Vec<(usize, usize)> = Vec::new();
1514 for (idx, leaf) in leaves.iter().enumerate() {
1515 if leaf.line_feed_cnt.is_none() {
1516 if let Ok(count) = self.scan_leaf(leaf) {
1517 fixups.push((idx, count));
1518 }
1519 }
1520 }
1521 if !fixups.is_empty() {
1522 self.piece_tree.update_leaf_line_feeds_path_copy(&fixups);
1523 }
1524 }
1525
1526 // Keep saved_root in sync with viewport-loading tree restructures so
1527 // that diff_since_saved() can match by (location, offset) identity.
1528 //
1529 // When !modified the current tree IS the saved state, so just snapshot.
1530 // When modified, we must apply the same Stored→Added leaf replacement
1531 // to saved_root so the diff doesn't see loaded-but-unedited regions as
1532 // changed.
1533 if !self.persistence.is_modified() {
1534 self.persistence.set_saved_root(self.piece_tree.root());
1535 } else {
1536 self.persistence.apply_chunk_load_to_saved_root(
1537 buffer_id,
1538 chunk_start_in_buffer,
1539 chunk_bytes,
1540 new_buffer_id,
1541 );
1542 }
1543
1544 Ok(true)
1545 }
1546
1547 /// Get all text as a single Vec<u8>
1548 /// Returns None if any buffers are unloaded (lazy loading)
1549 /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1550 pub(crate) fn get_all_text(&self) -> Option<Vec<u8>> {
1551 self.get_text_range(0, self.total_bytes())
1552 }
1553
1554 /// Get all text as a String
1555 /// Returns None if any buffers are unloaded (lazy loading)
1556 /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1557 pub(crate) fn get_all_text_string(&self) -> Option<String> {
1558 self.get_all_text()
1559 .map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
1560 }
1561
1562 /// Get text from a byte range as bytes
1563 /// CRATE-PRIVATE: Returns empty vector if any buffers are unloaded (silently fails!)
1564 /// Only use this when you KNOW the data is loaded (e.g., for syntax highlighting small regions)
1565 /// External code should use get_text_range_mut() or DocumentModel methods
1566 pub(crate) fn slice_bytes(&self, range: Range<usize>) -> Vec<u8> {
1567 self.get_text_range(range.start, range.end.saturating_sub(range.start))
1568 .unwrap_or_default()
1569 }
1570
1571 /// Get all text as a String
1572 /// Returns None if any buffers are unloaded (lazy loading)
1573 pub fn to_string(&self) -> Option<String> {
1574 self.get_all_text_string()
1575 }
1576
1577 /// Get the total number of bytes
1578 pub fn len(&self) -> usize {
1579 self.total_bytes()
1580 }
1581
1582 /// Check if the buffer is empty
1583 pub fn is_empty(&self) -> bool {
1584 self.total_bytes() == 0
1585 }
1586
1587 /// Get the file path associated with this buffer
1588 pub fn file_path(&self) -> Option<&Path> {
1589 self.persistence.file_path()
1590 }
1591
1592 /// Update the file path after a rename operation on disk.
1593 pub fn rename_file_path(&mut self, path: PathBuf) {
1594 self.persistence.set_file_path(path);
1595 }
1596
1597 /// Clear the file path (make buffer unnamed)
1598 /// Note: This does NOT affect Unloaded chunk file_paths used for lazy loading.
1599 /// Those still point to the original source file for chunk loading.
1600 pub fn clear_file_path(&mut self) {
1601 self.persistence.clear_file_path();
1602 }
1603
1604 /// Extend buffer to include more bytes from a streaming source file.
1605 /// Used for stdin streaming where the temp file grows over time, and
1606 /// for plugin streaming via `RefreshBufferFromDisk`.
1607 ///
1608 /// Counts line feeds in the appended region so the new piece carries
1609 /// a real `line_feed_cnt` instead of `None`. Without this, any
1610 /// previously-known line count on the existing pieces propagates to
1611 /// `line_count() = None` (the piece-tree's `total_line_feeds`
1612 /// returns `None` if any piece is unknown), which in turn breaks the
1613 /// visual-row index used by the scrollbar.
1614 ///
1615 /// Falls back to `None` only when the filesystem can't count
1616 /// (errored stat / read). The buffer is still usable then — just
1617 /// without precise line indexing, same as a large file opened
1618 /// without a scan.
1619 pub fn extend_streaming(&mut self, source_path: &Path, new_size: usize) {
1620 let old_size = self.total_bytes();
1621 if new_size <= old_size {
1622 return;
1623 }
1624
1625 let additional_bytes = new_size - old_size;
1626
1627 // Create new Unloaded buffer for the appended region
1628 let buffer_id = self.next_buffer_id;
1629 self.next_buffer_id += 1;
1630
1631 let new_buffer = StringBuffer::new_unloaded(
1632 buffer_id,
1633 source_path.to_path_buf(),
1634 old_size, // file_offset - where this chunk starts in the file
1635 additional_bytes, // bytes - size of this chunk
1636 );
1637 self.buffers.push(new_buffer);
1638
1639 // Count line feeds in the appended region from disk so the
1640 // piece carries a known line count. Counting is cheap — it's a
1641 // streaming scan of `additional_bytes`, no buffer materialisation.
1642 let line_feed_cnt = self
1643 .persistence
1644 .fs()
1645 .count_line_feeds_in_range(source_path, old_size as u64, additional_bytes)
1646 .ok();
1647
1648 // Append piece at end of document (insert at offset == total_bytes)
1649 self.piece_tree.insert(
1650 old_size,
1651 BufferLocation::Stored(buffer_id),
1652 0,
1653 additional_bytes,
1654 line_feed_cnt,
1655 &self.buffers,
1656 );
1657 }
1658
1659 /// Check if the buffer has been modified since last save
1660 pub fn is_modified(&self) -> bool {
1661 self.persistence.is_modified()
1662 }
1663
1664 /// Clear the modified flag (after save)
1665 pub fn clear_modified(&mut self) {
1666 self.persistence.clear_modified();
1667 }
1668
1669 /// Set the modified flag explicitly
1670 /// Used by undo/redo to restore the correct modified state
1671 pub fn set_modified(&mut self, modified: bool) {
1672 self.persistence.set_modified(modified);
1673 }
1674
1675 /// Check if buffer has pending changes for recovery auto-save
1676 pub fn is_recovery_pending(&self) -> bool {
1677 self.persistence.is_recovery_pending()
1678 }
1679
1680 /// Mark buffer as needing recovery auto-save (call after edits)
1681 pub fn set_recovery_pending(&mut self, pending: bool) {
1682 self.persistence.set_recovery_pending(pending);
1683 }
1684
1685 /// Ensure the buffer chunk at the given byte offset is loaded.
1686 ///
1687 /// When `line_feeds_scanned` is true, piece splits during insert/delete need
1688 /// the buffer data to be loaded so `compute_line_feeds_static` can accurately
1689 /// recount line feeds for each half. This method loads the chunk if needed.
1690 fn ensure_chunk_loaded_at(&mut self, offset: usize) {
1691 if let Some(piece_info) = self.piece_tree.find_by_offset(offset) {
1692 let buffer_id = piece_info.location.buffer_id();
1693 if let Some(buffer) = self.buffers.get_mut(buffer_id) {
1694 if !buffer.is_loaded() {
1695 let buf_bytes = buffer.unloaded_bytes().unwrap_or(0);
1696 tracing::info!(
1697 "ensure_chunk_loaded_at: loading buffer {} ({} bytes) for offset {}",
1698 buffer_id,
1699 buf_bytes,
1700 offset
1701 );
1702 if let Err(e) = buffer.load(&**self.persistence.fs()) {
1703 tracing::warn!("Failed to load chunk at offset {offset}: {e}");
1704 }
1705 }
1706 }
1707 }
1708 }
1709
1710 /// Check if this is a large file with lazy loading enabled
1711 pub fn is_large_file(&self) -> bool {
1712 self.file_kind.is_large_file()
1713 }
1714
1715 /// Check if line feeds have been scanned for this large file.
1716 /// When true, `line_count()` returns exact values.
1717 pub fn has_line_feed_scan(&self) -> bool {
1718 self.file_kind.has_line_feed_scan()
1719 }
1720
1721 /// Get the raw piece tree leaves (for storing alongside scan chunks).
1722 pub fn piece_tree_leaves(&self) -> Vec<crate::model::piece_tree::LeafData> {
1723 self.piece_tree.get_leaves()
1724 }
1725
1726 /// Prepare work items for an incremental line scan.
1727 ///
1728 /// First splits any oversized leaves in the piece tree so every leaf is
1729 /// at most `LOAD_CHUNK_SIZE` bytes. Then returns one work item per leaf.
1730 /// After scanning, `get_text_range_mut` will never need to split a scanned
1731 /// leaf (it's already chunk-sized), so line-feed counts are preserved.
1732 ///
1733 /// Returns `(chunks, total_bytes)`.
1734 pub fn prepare_line_scan(&mut self) -> (Vec<LineScanChunk>, usize) {
1735 // Pre-split the tree so every leaf ≤ LOAD_CHUNK_SIZE.
1736 self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1737
1738 let leaves = self.piece_tree.get_leaves();
1739 let total_bytes: usize = leaves.iter().map(|l| l.bytes).sum();
1740 let mut chunks = Vec::new();
1741
1742 for (idx, leaf) in leaves.iter().enumerate() {
1743 chunks.push(LineScanChunk {
1744 leaf_index: idx,
1745 byte_len: leaf.bytes,
1746 already_known: leaf.line_feed_cnt.is_some(),
1747 });
1748 }
1749
1750 (chunks, total_bytes)
1751 }
1752
1753 /// Initialize a chunked search scan over this buffer's piece tree.
1754 ///
1755 /// Used for in-editor Ctrl+F (incremental, yields to the event loop
1756 /// between chunks) and for searching dirty buffers during project grep.
1757 /// For searching files on disk, use `FileSystem::search_file` instead.
1758 pub fn search_scan_init(
1759 &mut self,
1760 regex: regex::bytes::Regex,
1761 max_matches: usize,
1762 query_len: usize,
1763 ) -> ChunkedSearchState {
1764 let (chunks, total_bytes) = self.prepare_line_scan();
1765 ChunkedSearchState {
1766 chunks,
1767 next_chunk: 0,
1768 next_doc_offset: 0,
1769 total_bytes,
1770 scanned_bytes: 0,
1771 regex,
1772 matches: Vec::new(),
1773 overlap_tail: Vec::new(),
1774 overlap_doc_offset: 0,
1775 max_matches,
1776 capped: false,
1777 query_len,
1778 running_line: 1,
1779 }
1780 }
1781
1782 /// Process one chunk of a chunked search scan.
1783 ///
1784 /// Loads the next chunk via `get_text_range_mut`, prepends overlap from
1785 /// the previous chunk, runs the regex, and appends matches to `state`
1786 /// with line/column/context computed on the fly from the loaded bytes.
1787 ///
1788 /// Line numbers are tracked incrementally via `running_line` — each
1789 /// chunk counts newlines in its non-overlap portion to advance the
1790 /// counter for the next chunk, and matches use an incremental cursor
1791 /// so total line-counting work is O(chunk_size), not O(chunk × matches).
1792 ///
1793 /// Returns `Ok(true)` if there are more chunks to process, `Ok(false)`
1794 /// when the scan is complete.
1795 ///
1796 /// TODO: For concurrent/parallel search (searching multiple files at once),
1797 /// chunks would need to return chunk-relative line numbers and have them
1798 /// fixed up with each file's starting line offset after all chunks complete.
1799 pub fn search_scan_next_chunk(
1800 &mut self,
1801 state: &mut ChunkedSearchState,
1802 ) -> std::io::Result<bool> {
1803 if state.is_done() {
1804 return Ok(false);
1805 }
1806
1807 let chunk_info = state.chunks[state.next_chunk].clone();
1808 let doc_offset = state.next_doc_offset;
1809
1810 state.next_chunk += 1;
1811 state.scanned_bytes += chunk_info.byte_len;
1812 state.next_doc_offset += chunk_info.byte_len;
1813
1814 // Load the chunk bytes
1815 let chunk_bytes = self
1816 .get_text_range_mut(doc_offset, chunk_info.byte_len)
1817 .map_err(std::io::Error::other)?;
1818
1819 // Build search buffer: overlap tail + new chunk
1820 let overlap_len = state.overlap_tail.len();
1821 let mut search_buf = Vec::with_capacity(overlap_len + chunk_bytes.len());
1822 search_buf.extend_from_slice(&state.overlap_tail);
1823 search_buf.extend_from_slice(&chunk_bytes);
1824
1825 let buf_doc_offset = if overlap_len > 0 {
1826 state.overlap_doc_offset
1827 } else {
1828 doc_offset
1829 };
1830
1831 // Line number at buf_doc_offset: running_line tracks the line at
1832 // doc_offset (start of new chunk data). Count newlines in the overlap
1833 // prefix to get the line at the start of the full search_buf.
1834 let newlines_in_overlap = search_buf[..overlap_len]
1835 .iter()
1836 .filter(|&&b| b == b'\n')
1837 .count();
1838 let mut line_at = state.running_line - newlines_in_overlap;
1839 let mut counted_to = 0usize;
1840
1841 // Run regex on the combined buffer
1842 for m in state.regex.find_iter(&search_buf) {
1843 // Skip matches entirely within the overlap (already found)
1844 if overlap_len > 0 && m.end() <= overlap_len {
1845 continue;
1846 }
1847
1848 if state.matches.len() >= state.max_matches {
1849 state.capped = true;
1850 break;
1851 }
1852
1853 // Advance line counter incrementally to this match
1854 line_at += search_buf[counted_to..m.start()]
1855 .iter()
1856 .filter(|&&b| b == b'\n')
1857 .count();
1858 counted_to = m.start();
1859
1860 // Find line boundaries in search_buf for context
1861 let line_start = search_buf[..m.start()]
1862 .iter()
1863 .rposition(|&b| b == b'\n')
1864 .map(|p| p + 1)
1865 .unwrap_or(0);
1866 let line_end = search_buf[m.start()..]
1867 .iter()
1868 .position(|&b| b == b'\n')
1869 .map(|p| m.start() + p)
1870 .unwrap_or(search_buf.len());
1871
1872 let match_doc_offset = buf_doc_offset + m.start();
1873 let match_len = m.end() - m.start();
1874 let column = m.start() - line_start + 1;
1875 let context = String::from_utf8_lossy(&search_buf[line_start..line_end]).into_owned();
1876
1877 state.matches.push(SearchMatch {
1878 byte_offset: match_doc_offset,
1879 length: match_len,
1880 line: line_at,
1881 column,
1882 context,
1883 });
1884 }
1885
1886 // Advance running_line by newlines in the new (non-overlap) chunk data
1887 let newlines_in_chunk = chunk_bytes.iter().filter(|&&b| b == b'\n').count();
1888 state.running_line += newlines_in_chunk;
1889
1890 // Save overlap tail for next chunk
1891 let max_overlap = state.query_len.max(256).min(chunk_bytes.len());
1892 let tail_start = chunk_bytes.len().saturating_sub(max_overlap);
1893 state.overlap_tail = chunk_bytes[tail_start..].to_vec();
1894 state.overlap_doc_offset = doc_offset + tail_start;
1895
1896 Ok(!state.is_done())
1897 }
1898
1899 /// Run a complete chunked search over the piece tree (all chunks).
1900 ///
1901 /// Synchronous variant — used for dirty buffer snapshots in project
1902 /// grep and in tests. For on-disk files, use `FileSystem::search_file`.
1903 pub fn search_scan_all(
1904 &mut self,
1905 regex: regex::bytes::Regex,
1906 max_matches: usize,
1907 query_len: usize,
1908 ) -> std::io::Result<ChunkedSearchState> {
1909 let mut state = self.search_scan_init(regex, max_matches, query_len);
1910 while self.search_scan_next_chunk(&mut state)? {}
1911 Ok(state)
1912 }
1913
1914 /// Build a hybrid search plan from the piece tree.
1915 ///
1916 /// Extracts regions (unloaded file ranges + loaded in-memory data) that
1917 /// can be searched independently. The plan is `Send` so it can be
1918 /// executed on a background thread via `HybridSearchPlan::execute`.
1919 ///
1920 /// Returns `None` if the buffer has no file path (caller should fall
1921 /// back to `search_scan_all`).
1922 pub fn search_hybrid_plan(&mut self) -> Option<HybridSearchPlan> {
1923 let file_path = self.persistence.file_path_owned()?;
1924
1925 self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1926 let leaves = self.piece_tree.get_leaves();
1927
1928 let mut regions: Vec<SearchRegion> = Vec::new();
1929 let mut doc_offset = 0usize;
1930
1931 for leaf in &leaves {
1932 let buf = self.buffers.get(leaf.location.buffer_id());
1933 let is_unloaded_stored = matches!(
1934 (&leaf.location, buf),
1935 (
1936 BufferLocation::Stored(_),
1937 Some(StringBuffer {
1938 data: BufferData::Unloaded { .. },
1939 ..
1940 }),
1941 )
1942 );
1943
1944 if is_unloaded_stored {
1945 let file_offset = match buf.unwrap().data {
1946 BufferData::Unloaded {
1947 file_offset: fo, ..
1948 } => fo + leaf.offset,
1949 _ => unreachable!(),
1950 };
1951
1952 // Merge with previous unloaded region if contiguous
1953 if let Some(SearchRegion::Unloaded {
1954 file_offset: prev_fo,
1955 bytes: prev_bytes,
1956 ..
1957 }) = regions.last_mut()
1958 {
1959 if *prev_fo + *prev_bytes == file_offset {
1960 *prev_bytes += leaf.bytes;
1961 doc_offset += leaf.bytes;
1962 continue;
1963 }
1964 }
1965 regions.push(SearchRegion::Unloaded {
1966 file_offset,
1967 bytes: leaf.bytes,
1968 doc_offset,
1969 });
1970 } else {
1971 let data = match buf.and_then(|b| b.get_data()) {
1972 Some(full) => {
1973 let end = (leaf.offset + leaf.bytes).min(full.len());
1974 full[leaf.offset..end].to_vec()
1975 }
1976 None => match self.get_text_range_mut(doc_offset, leaf.bytes) {
1977 Ok(d) => d,
1978 Err(_) => {
1979 doc_offset += leaf.bytes;
1980 continue;
1981 }
1982 },
1983 };
1984
1985 // Merge with previous loaded region
1986 if let Some(SearchRegion::Loaded {
1987 data: prev_data, ..
1988 }) = regions.last_mut()
1989 {
1990 prev_data.extend_from_slice(&data);
1991 doc_offset += leaf.bytes;
1992 continue;
1993 }
1994 regions.push(SearchRegion::Loaded { data, doc_offset });
1995 }
1996
1997 doc_offset += leaf.bytes;
1998 }
1999
2000 Some(HybridSearchPlan { file_path, regions })
2001 }
2002
2003 /// Hybrid search: uses `fs.search_file` for unloaded piece-tree regions
2004 /// (searches where the data lives, no network transfer) and in-memory regex
2005 /// for loaded/edited regions. Handles overlap at region boundaries.
2006 ///
2007 /// For a huge remote file with a small local edit, this avoids transferring
2008 /// the entire file — only match metadata crosses the network.
2009 ///
2010 /// Falls back to `search_scan_all` when the buffer has no file path or is
2011 /// fully loaded.
2012 pub fn search_hybrid(
2013 &mut self,
2014 pattern: &str,
2015 opts: &FileSearchOptions,
2016 regex: Regex,
2017 max_matches: usize,
2018 query_len: usize,
2019 ) -> io::Result<Vec<SearchMatch>> {
2020 let plan = match self.search_hybrid_plan() {
2021 Some(p) => p,
2022 None => {
2023 let state = self.search_scan_all(regex, max_matches, query_len)?;
2024 return Ok(state.matches);
2025 }
2026 };
2027 plan.execute(
2028 &**self.persistence.fs(),
2029 pattern,
2030 opts,
2031 ®ex,
2032 max_matches,
2033 query_len,
2034 )
2035 }
2036
2037 /// Count `\n` bytes in a single leaf.
2038 ///
2039 /// Uses `count_line_feeds_in_range` for unloaded buffers, which remote
2040 /// filesystem implementations can override to count server-side.
2041 pub fn scan_leaf(&self, leaf: &crate::model::piece_tree::LeafData) -> std::io::Result<usize> {
2042 let buffer_id = leaf.location.buffer_id();
2043 let buffer = self
2044 .buffers
2045 .get(buffer_id)
2046 .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::NotFound, "buffer not found"))?;
2047
2048 let count = match &buffer.data {
2049 crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2050 let end = (leaf.offset + leaf.bytes).min(data.len());
2051 data[leaf.offset..end]
2052 .iter()
2053 .filter(|&&b| b == b'\n')
2054 .count()
2055 }
2056 crate::model::piece_tree::BufferData::Unloaded {
2057 file_path,
2058 file_offset,
2059 ..
2060 } => {
2061 let read_offset = *file_offset as u64 + leaf.offset as u64;
2062 self.persistence.fs().count_line_feeds_in_range(
2063 file_path,
2064 read_offset,
2065 leaf.bytes,
2066 )?
2067 }
2068 };
2069 Ok(count)
2070 }
2071
2072 /// Return the I/O parameters for an unloaded leaf, or `None` if loaded.
2073 ///
2074 /// Used by the incremental scan to distinguish leaves that can be counted
2075 /// in-memory (via `scan_leaf`) from those that need filesystem I/O.
2076 pub fn leaf_io_params(
2077 &self,
2078 leaf: &crate::model::piece_tree::LeafData,
2079 ) -> Option<(std::path::PathBuf, u64, usize)> {
2080 let buffer_id = leaf.location.buffer_id();
2081 let buffer = self.buffers.get(buffer_id)?;
2082 match &buffer.data {
2083 crate::model::piece_tree::BufferData::Loaded { .. } => None,
2084 crate::model::piece_tree::BufferData::Unloaded {
2085 file_path,
2086 file_offset,
2087 ..
2088 } => {
2089 let read_offset = *file_offset as u64 + leaf.offset as u64;
2090 Some((file_path.clone(), read_offset, leaf.bytes))
2091 }
2092 }
2093 }
2094
/// Borrow all of this buffer's `StringBuffer`s as a slice
/// (for parallel scanning).
pub fn buffer_slice(&self) -> &[StringBuffer] {
    &self.buffers
}
2099
/// Apply the results of an incremental line scan.
///
/// Forwards `updates` to the piece tree's `update_leaf_line_feeds` and then
/// marks the line-feed scan as complete on `file_kind`.
///
/// NOTE(review): each pair is presumably `(leaf index, line-feed count)`,
/// matching how `rebuild_with_pristine_saved_root` builds its fix-ups —
/// confirm against `PieceTree::update_leaf_line_feeds`.
pub fn apply_scan_updates(&mut self, updates: &[(usize, usize)]) {
    self.piece_tree.update_leaf_line_feeds(updates);
    self.file_kind.mark_line_feed_scan_complete();
}
2105
2106 /// After an incremental line-feed scan completes, rebuild the tree so that
2107 /// `saved_root` and the current tree share `Arc` pointers for unedited
2108 /// subtrees. This makes `diff_since_saved()` O(edited regions) instead of
2109 /// O(file size).
2110 pub fn rebuild_with_pristine_saved_root(&mut self, scan_updates: &[(usize, usize)]) {
2111 let file_size = match self.persistence.saved_file_size() {
2112 Some(s) => s,
2113 None => {
2114 // Fallback: no saved file size means we can't build a pristine
2115 // tree. Just apply updates the old way.
2116 self.apply_scan_updates(scan_updates);
2117 return;
2118 }
2119 };
2120
2121 // --- Walk the current tree to extract deletions and insertions ---
2122 let total = self.total_bytes();
2123 // Deletions: gaps in Stored coverage (orig_offset, len).
2124 let mut deletions: Vec<(usize, usize)> = Vec::new();
2125 // Insertions: (post_delete_offset, location, buf_offset, bytes, lf_cnt).
2126 // post_delete_offset = cumulative surviving Stored bytes before this point.
2127 let mut insertions: Vec<(usize, BufferLocation, usize, usize, Option<usize>)> = Vec::new();
2128 let mut orig_cursor: usize = 0;
2129 let mut stored_bytes_in_doc: usize = 0;
2130
2131 for piece in self.piece_tree.iter_pieces_in_range(0, total) {
2132 match piece.location {
2133 BufferLocation::Stored(_) => {
2134 if piece.buffer_offset > orig_cursor {
2135 deletions.push((orig_cursor, piece.buffer_offset - orig_cursor));
2136 }
2137 orig_cursor = piece.buffer_offset + piece.bytes;
2138 stored_bytes_in_doc += piece.bytes;
2139 }
2140 BufferLocation::Added(id) => {
2141 // Check if this Added buffer was created by loading a chunk
2142 // from the stored file (via get_text_range_mut chunk loading).
2143 // If so, treat it as stored content, not a user edit.
2144 if let Some(file_off) = self.buffers.get(id).and_then(|b| b.stored_file_offset)
2145 {
2146 if file_off > orig_cursor {
2147 deletions.push((orig_cursor, file_off - orig_cursor));
2148 }
2149 orig_cursor = file_off + piece.bytes;
2150 stored_bytes_in_doc += piece.bytes;
2151 } else {
2152 insertions.push((
2153 stored_bytes_in_doc,
2154 piece.location,
2155 piece.buffer_offset,
2156 piece.bytes,
2157 piece.line_feed_cnt,
2158 ));
2159 }
2160 }
2161 }
2162 }
2163 // Trailing deletion.
2164 if orig_cursor < file_size {
2165 deletions.push((orig_cursor, file_size - orig_cursor));
2166 }
2167
2168 // --- Build pristine tree (full original file, pre-split, with lf counts) ---
2169 let mut pristine = if file_size > 0 {
2170 PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
2171 } else {
2172 PieceTree::empty()
2173 };
2174 pristine.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
2175 pristine.update_leaf_line_feeds(scan_updates);
2176
2177 // Snapshot the pristine tree as saved_root.
2178 self.persistence.set_saved_root(pristine.root());
2179
2180 // If no edits, the pristine tree IS the current tree.
2181 if deletions.is_empty() && insertions.is_empty() {
2182 self.piece_tree = pristine;
2183 self.file_kind.mark_line_feed_scan_complete();
2184 return;
2185 }
2186
2187 // --- Replay edits onto a clone of the pristine tree ---
2188 let mut tree = pristine;
2189
2190 // Apply deletions from HIGH to LOW offset so earlier offsets stay valid.
2191 deletions.sort_by(|a, b| b.0.cmp(&a.0));
2192 for &(offset, len) in &deletions {
2193 tree.delete(offset, len, &self.buffers);
2194 }
2195
2196 // Apply insertions from LOW to HIGH. Each insertion shifts subsequent
2197 // offsets by its byte count, tracked via insert_delta.
2198 let mut insert_delta: usize = 0;
2199 for &(offset, location, buf_offset, bytes, lf_cnt) in &insertions {
2200 tree.insert(
2201 offset + insert_delta,
2202 location,
2203 buf_offset,
2204 bytes,
2205 lf_cnt,
2206 &self.buffers,
2207 );
2208 insert_delta += bytes;
2209 }
2210
2211 // Path-copy insert/delete may split Stored leaves whose data is
2212 // Unloaded, producing fragments with line_feed_cnt = None
2213 // (compute_line_feeds_static can't read unloaded data). Fix them up
2214 // by scanning any remaining None leaves.
2215 let leaves = tree.get_leaves();
2216 let mut fixups: Vec<(usize, usize)> = Vec::new();
2217 for (idx, leaf) in leaves.iter().enumerate() {
2218 if leaf.line_feed_cnt.is_none() {
2219 if let Ok(count) = self.scan_leaf(leaf) {
2220 fixups.push((idx, count));
2221 }
2222 }
2223 }
2224 if !fixups.is_empty() {
2225 tree.update_leaf_line_feeds_path_copy(&fixups);
2226 }
2227
2228 self.piece_tree = tree;
2229 self.file_kind.mark_line_feed_scan_complete();
2230 }
2231
2232 /// Resolve the exact byte offset for a given line number (0-indexed).
2233 ///
2234 /// Uses the tree's line feed counts to find the piece containing the target line,
2235 /// then loads/reads that piece's data to find the exact newline position.
2236 /// This works even when buffers are unloaded (large file with scanned line index).
2237 pub fn resolve_line_byte_offset(&mut self, target_line: usize) -> Option<usize> {
2238 if target_line == 0 {
2239 return Some(0);
2240 }
2241
2242 // Use tree metadata to find the piece containing the target line
2243 let (doc_offset, buffer_id, piece_offset, piece_bytes, lines_before) =
2244 self.piece_tree.piece_info_for_line(target_line)?;
2245
2246 // We need to find the (target_line - lines_before)-th newline within this piece
2247 let lines_to_skip = target_line - lines_before;
2248
2249 // Get the piece data — either from loaded buffer or read from disk
2250 let buffer = self.buffers.get(buffer_id)?;
2251 let piece_data: Vec<u8> = match &buffer.data {
2252 crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2253 let end = (piece_offset + piece_bytes).min(data.len());
2254 data[piece_offset..end].to_vec()
2255 }
2256 crate::model::piece_tree::BufferData::Unloaded {
2257 file_path,
2258 file_offset,
2259 ..
2260 } => {
2261 let read_offset = *file_offset as u64 + piece_offset as u64;
2262 self.persistence
2263 .fs()
2264 .read_range(file_path, read_offset, piece_bytes)
2265 .ok()?
2266 }
2267 };
2268
2269 // Count newlines to find the target line start
2270 let mut newlines_found = 0;
2271 for (i, &byte) in piece_data.iter().enumerate() {
2272 if byte == b'\n' {
2273 newlines_found += 1;
2274 if newlines_found == lines_to_skip {
2275 // The target line starts right after this newline
2276 return Some(doc_offset + i + 1);
2277 }
2278 }
2279 }
2280
2281 // If we didn't find enough newlines, the line starts in the next piece
2282 // Return the end of this piece as an approximation
2283 Some(doc_offset + piece_bytes)
2284 }
2285
/// Get the saved file size (size of the file on disk after last load/save).
///
/// For large files, this is used during recovery to know the expected
/// original file size. Returns `None` for new unsaved buffers.
///
/// This is a direct pass-through of `Persistence::saved_file_size`.
pub fn original_file_size(&self) -> Option<usize> {
    // Return the tracked saved file size - this is updated when the file is
    // loaded or saved, so it always reflects the current file on disk.
    self.persistence.saved_file_size()
}
2294
2295 /// Get recovery chunks for this buffer (only modified portions)
2296 ///
2297 /// For large files, this returns only the pieces that come from Added buffers
2298 /// (i.e., the modifications), not the original file content. This allows
2299 /// efficient incremental recovery without reading/writing the entire file.
2300 ///
2301 /// Returns: Vec of (original_file_offset, data) for each modified chunk
2302 /// The offset is the position in the ORIGINAL file where this chunk should be inserted.
2303 pub fn get_recovery_chunks(&self) -> Vec<(usize, Vec<u8>)> {
2304 use crate::model::piece_tree::BufferLocation;
2305
2306 let mut chunks = Vec::new();
2307 let total = self.total_bytes();
2308
2309 // Track cumulative bytes from Stored pieces as we iterate.
2310 // This gives us the original file offset for Added pieces.
2311 // The key insight: Added pieces should be inserted at the position
2312 // corresponding to where they appear relative to Stored content,
2313 // not their position in the current document.
2314 let mut stored_bytes_before = 0;
2315
2316 for piece in self.piece_tree.iter_pieces_in_range(0, total) {
2317 match piece.location {
2318 BufferLocation::Stored(_) => {
2319 // Accumulate stored bytes to track position in original file
2320 stored_bytes_before += piece.bytes;
2321 }
2322 BufferLocation::Added(buffer_id) => {
2323 if let Some(buffer) = self.buffers.iter().find(|b| b.id == buffer_id) {
2324 // Skip buffers that originate from the original file
2325 // (loaded by chunk_split_and_load for viewport display).
2326 // These have stored_file_offset set and are not user edits.
2327 //
2328 // Why Added and not Stored? The piece tree only has two
2329 // variants: Stored and Added. chunk_split_and_load marks
2330 // loaded chunks as Added(new_id) because
2331 // rebuild_with_pristine_saved_root interprets Stored
2332 // pieces' buffer_offset as a position in the original
2333 // file — but a chunk buffer starts at offset 0, so using
2334 // Stored would corrupt the rebuild logic. We rely on
2335 // stored_file_offset instead to distinguish "loaded from
2336 // disk" from "user edit". A third BufferLocation variant
2337 // (e.g. LoadedChunk) would make this distinction explicit
2338 // in the type system rather than requiring this runtime
2339 // check.
2340 if buffer.stored_file_offset.is_some() {
2341 stored_bytes_before += piece.bytes;
2342 continue;
2343 }
2344 // Get the data from the buffer if loaded
2345 if let Some(data) = buffer.get_data() {
2346 // Extract just the portion this piece references
2347 let start = piece.buffer_offset;
2348 let end = start + piece.bytes;
2349 if end <= data.len() {
2350 // Use stored_bytes_before as the original file offset.
2351 // This is where this insertion should go relative to
2352 // the original file content.
2353 chunks.push((stored_bytes_before, data[start..end].to_vec()));
2354 }
2355 }
2356 }
2357 }
2358 }
2359 }
2360
2361 chunks
2362 }
2363
/// Check if this buffer contains binary content.
///
/// Delegates to `file_kind.is_binary()`.
pub fn is_binary(&self) -> bool {
    self.file_kind.is_binary()
}
2368
/// Get the line ending format for this buffer, as recorded in `format`.
pub fn line_ending(&self) -> LineEnding {
    self.format.line_ending()
}
2373
/// Set the line ending format for this buffer.
///
/// This marks the buffer as modified since the line ending format has changed.
/// On save, the buffer content will be converted to the new format
/// (the conversion itself is not performed here).
pub fn set_line_ending(&mut self, line_ending: LineEnding) {
    self.format.set_line_ending(line_ending);
    // A format change counts as a content modification.
    self.mark_content_modified();
}
2382
/// Set the default line ending format for a new/empty buffer.
///
/// Unlike `set_line_ending`, this does NOT mark the buffer as modified.
/// This should be used when initializing a new buffer with a configured default.
pub fn set_default_line_ending(&mut self, line_ending: LineEnding) {
    self.format.set_default_line_ending(line_ending);
}
2390
/// Get the encoding format for this buffer, as recorded in `format`.
pub fn encoding(&self) -> Encoding {
    self.format.encoding()
}
2395
/// Set the encoding format for this buffer.
///
/// This marks the buffer as modified since the encoding format has changed.
/// On save, the buffer content will be converted to the new encoding
/// (the conversion itself is not performed here).
pub fn set_encoding(&mut self, encoding: Encoding) {
    self.format.set_encoding(encoding);
    // A format change counts as a content modification.
    self.mark_content_modified();
}
2404
/// Set the default encoding format for a new/empty buffer.
///
/// Unlike `set_encoding`, this does NOT mark the buffer as modified.
/// This should be used when initializing a new buffer with a configured default.
pub fn set_default_encoding(&mut self, encoding: Encoding) {
    self.format.set_default_encoding(encoding);
}
2412
2413 /// Get the first line of the buffer as a lossy UTF-8 string, suitable
2414 /// for shebang / first-line grammar detection. Returns `None` for an
2415 /// empty buffer. Non-UTF-8 bytes are replaced with U+FFFD.
2416 pub fn first_line_lossy(&self) -> Option<String> {
2417 let bytes = self.get_line(0)?;
2418 if bytes.is_empty() {
2419 return None;
2420 }
2421 Some(String::from_utf8_lossy(&bytes).into_owned())
2422 }
2423
2424 /// Get text for a specific line
2425 pub fn get_line(&self, line: usize) -> Option<Vec<u8>> {
2426 let (start, end) = self.piece_tree.line_range(line, &self.buffers)?;
2427
2428 let bytes = if let Some(end_offset) = end {
2429 end_offset.saturating_sub(start)
2430 } else {
2431 self.total_bytes().saturating_sub(start)
2432 };
2433
2434 self.get_text_range(start, bytes)
2435 }
2436
2437 /// Get the byte offset where a line starts
2438 pub fn line_start_offset(&self, line: usize) -> Option<usize> {
2439 let (start, _) = self.piece_tree.line_range(line, &self.buffers)?;
2440 Some(start)
2441 }
2442
/// Get piece information at a byte offset.
///
/// Delegates to `PieceTree::find_by_offset`; `None` is whatever that lookup
/// reports for an offset it cannot place.
pub fn piece_info_at_offset(&self, offset: usize) -> Option<PieceInfo> {
    self.piece_tree.find_by_offset(offset)
}
2447
/// Get tree statistics for debugging (a pass-through of `PieceTree::stats`).
pub fn stats(&self) -> TreeStats {
    self.piece_tree.stats()
}
2452
2453 // Search and Replace Operations
2454
2455 /// Find the next occurrence of a pattern, with wrap-around
2456 pub fn find_next(&self, pattern: &str, start_pos: usize) -> Option<usize> {
2457 if pattern.is_empty() {
2458 return None;
2459 }
2460
2461 let pattern_bytes = pattern.as_bytes();
2462 let buffer_len = self.len();
2463
2464 // Search from start_pos to end
2465 if start_pos < buffer_len {
2466 if let Some(offset) = self.find_pattern(start_pos, buffer_len, pattern_bytes) {
2467 return Some(offset);
2468 }
2469 }
2470
2471 // Wrap around: search from beginning to start_pos
2472 if start_pos > 0 {
2473 if let Some(offset) = self.find_pattern(0, start_pos, pattern_bytes) {
2474 return Some(offset);
2475 }
2476 }
2477
2478 None
2479 }
2480
2481 /// Find the next occurrence of a pattern within an optional range
2482 /// If range is None, searches the entire buffer with wrap-around (same as find_next)
2483 /// If range is Some, searches only within that range without wrap-around
2484 pub fn find_next_in_range(
2485 &self,
2486 pattern: &str,
2487 start_pos: usize,
2488 range: Option<Range<usize>>,
2489 ) -> Option<usize> {
2490 if pattern.is_empty() {
2491 return None;
2492 }
2493
2494 if let Some(search_range) = range {
2495 // Search within range only, no wrap-around
2496 let pattern_bytes = pattern.as_bytes();
2497 let search_start = start_pos.max(search_range.start);
2498 let search_end = search_range.end.min(self.len());
2499
2500 if search_start < search_end {
2501 self.find_pattern(search_start, search_end, pattern_bytes)
2502 } else {
2503 None
2504 }
2505 } else {
2506 // No range specified, use normal find_next with wrap-around
2507 self.find_next(pattern, start_pos)
2508 }
2509 }
2510
2511 /// Find pattern in a byte range using overlapping chunks
2512 fn find_pattern(&self, start: usize, end: usize, pattern: &[u8]) -> Option<usize> {
2513 if pattern.is_empty() || start >= end {
2514 return None;
2515 }
2516
2517 const CHUNK_SIZE: usize = 65536; // 64KB chunks
2518 let overlap = pattern.len().saturating_sub(1).max(1);
2519
2520 // Use the overlapping chunks iterator for efficient streaming search
2521 let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, overlap);
2522
2523 for chunk in chunks {
2524 // Search the entire chunk buffer
2525 if let Some(pos) = Self::find_in_bytes(&chunk.buffer, pattern) {
2526 let match_end = pos + pattern.len();
2527 // Only report if match ENDS in or after the valid zone
2528 // This ensures patterns spanning boundaries are found exactly once
2529 if match_end > chunk.valid_start {
2530 let absolute_pos = chunk.absolute_pos + pos;
2531 // Verify the match doesn't extend beyond our search range
2532 if absolute_pos + pattern.len() <= end {
2533 return Some(absolute_pos);
2534 }
2535 }
2536 }
2537 }
2538
2539 None
2540 }
2541
/// Locate the first occurrence of `needle` in `haystack` by comparing
/// every window of `needle.len()` bytes (naive scan).
///
/// Returns `None` for an empty needle or one longer than the haystack.
fn find_in_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty needle is rejected first.
    if needle.is_empty() || needle.len() > haystack.len() {
        return None;
    }

    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
2550
2551 /// Find the next occurrence of a regex pattern, with wrap-around
2552 pub fn find_next_regex(&self, regex: &Regex, start_pos: usize) -> Option<usize> {
2553 let buffer_len = self.len();
2554
2555 // Search from start_pos to end
2556 if start_pos < buffer_len {
2557 if let Some(offset) = self.find_regex(start_pos, buffer_len, regex) {
2558 return Some(offset);
2559 }
2560 }
2561
2562 // Wrap around: search from beginning to start_pos
2563 if start_pos > 0 {
2564 if let Some(offset) = self.find_regex(0, start_pos, regex) {
2565 return Some(offset);
2566 }
2567 }
2568
2569 None
2570 }
2571
2572 /// Find the next occurrence of a regex pattern within an optional range
2573 pub fn find_next_regex_in_range(
2574 &self,
2575 regex: &Regex,
2576 start_pos: usize,
2577 range: Option<Range<usize>>,
2578 ) -> Option<usize> {
2579 if let Some(search_range) = range {
2580 let search_start = start_pos.max(search_range.start);
2581 let search_end = search_range.end.min(self.len());
2582
2583 if search_start < search_end {
2584 self.find_regex(search_start, search_end, regex)
2585 } else {
2586 None
2587 }
2588 } else {
2589 self.find_next_regex(regex, start_pos)
2590 }
2591 }
2592
2593 /// Find regex pattern in a byte range using overlapping chunks
2594 fn find_regex(&self, start: usize, end: usize, regex: &Regex) -> Option<usize> {
2595 if start >= end {
2596 return None;
2597 }
2598
2599 const CHUNK_SIZE: usize = 1048576; // 1MB chunks
2600 const OVERLAP: usize = 4096; // 4KB overlap for regex
2601
2602 // Use the overlapping chunks iterator for efficient streaming search
2603 // This fixes the critical bug where regex patterns spanning chunk boundaries were missed
2604 let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, OVERLAP);
2605
2606 for chunk in chunks {
2607 // Search the entire chunk buffer
2608 if let Some(mat) = regex.find(&chunk.buffer) {
2609 let match_end = mat.end();
2610 // Only report if match ENDS in or after the valid zone
2611 // This ensures patterns spanning boundaries are found exactly once
2612 if match_end > chunk.valid_start {
2613 let absolute_pos = chunk.absolute_pos + mat.start();
2614 // Verify the match doesn't extend beyond our search range
2615 let match_len = mat.end() - mat.start();
2616 if absolute_pos + match_len <= end {
2617 return Some(absolute_pos);
2618 }
2619 }
2620 }
2621 }
2622
2623 None
2624 }
2625
2626 /// Replace a range with replacement text
2627 pub fn replace_range(&mut self, range: Range<usize>, replacement: &str) -> bool {
2628 if range.start >= self.len() {
2629 return false;
2630 }
2631
2632 let end = range.end.min(self.len());
2633 if end > range.start {
2634 self.delete_bytes(range.start, end - range.start);
2635 }
2636
2637 if !replacement.is_empty() {
2638 self.insert(range.start, replacement);
2639 }
2640
2641 true
2642 }
2643
2644 /// Find and replace the next occurrence of a pattern
2645 pub fn replace_next(
2646 &mut self,
2647 pattern: &str,
2648 replacement: &str,
2649 start_pos: usize,
2650 range: Option<Range<usize>>,
2651 ) -> Option<usize> {
2652 if let Some(pos) = self.find_next_in_range(pattern, start_pos, range.clone()) {
2653 self.replace_range(pos..pos + pattern.len(), replacement);
2654 Some(pos)
2655 } else {
2656 None
2657 }
2658 }
2659
2660 /// Replace all occurrences of a pattern with replacement text
2661 pub fn replace_all(&mut self, pattern: &str, replacement: &str) -> usize {
2662 if pattern.is_empty() {
2663 return 0;
2664 }
2665
2666 let mut count = 0;
2667 let mut pos = 0;
2668
2669 // Keep searching and replacing
2670 // Note: we search forward from last replacement to handle growth/shrinkage
2671 // Find next occurrence (no wrap-around for replace_all)
2672 while let Some(found_pos) = self.find_next_in_range(pattern, pos, Some(0..self.len())) {
2673 self.replace_range(found_pos..found_pos + pattern.len(), replacement);
2674 count += 1;
2675
2676 // Move past the replacement
2677 pos = found_pos + replacement.len();
2678
2679 // If we're at or past the end, stop
2680 if pos >= self.len() {
2681 break;
2682 }
2683 }
2684
2685 count
2686 }
2687
2688 /// Replace all occurrences of a regex pattern with replacement text
2689 pub fn replace_all_regex(&mut self, regex: &Regex, replacement: &str) -> Result<usize> {
2690 let mut count = 0;
2691 let mut pos = 0;
2692
2693 while let Some(found_pos) = self.find_next_regex_in_range(regex, pos, Some(0..self.len())) {
2694 // Get the match to find its length
2695 let text = self
2696 .get_text_range_mut(found_pos, self.len() - found_pos)
2697 .context("Failed to read text for regex match")?;
2698
2699 if let Some(mat) = regex.find(&text) {
2700 self.replace_range(found_pos..found_pos + mat.len(), replacement);
2701 count += 1;
2702 pos = found_pos + replacement.len();
2703
2704 if pos >= self.len() {
2705 break;
2706 }
2707 } else {
2708 break;
2709 }
2710 }
2711
2712 Ok(count)
2713 }
2714
2715 // LSP Support (UTF-16 conversions)
2716
2717 /// Convert byte position to (line, column) in bytes
2718 pub fn position_to_line_col(&self, byte_pos: usize) -> (usize, usize) {
2719 self.offset_to_position(byte_pos)
2720 .map(|pos| (pos.line, pos.column))
2721 .unwrap_or_else(|| (byte_pos / 80, 0)) // Estimate if metadata unavailable
2722 }
2723
2724 /// Convert (line, character) to byte position - 0-indexed
2725 /// character is in BYTES, not UTF-16 code units
2726 /// Optimized to use single line_range() call instead of two
2727 pub fn line_col_to_position(&self, line: usize, character: usize) -> usize {
2728 if let Some((start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2729 // Calculate line length from the range
2730 let line_len = if let Some(end_offset) = end {
2731 end_offset.saturating_sub(start)
2732 } else {
2733 self.total_bytes().saturating_sub(start)
2734 };
2735 let byte_offset = character.min(line_len);
2736 start + byte_offset
2737 } else {
2738 // Line doesn't exist, return end of buffer
2739 self.len()
2740 }
2741 }
2742
2743 /// Convert byte position to LSP position (line, UTF-16 code units)
2744 /// LSP protocol uses UTF-16 code units for character offsets
2745 pub fn position_to_lsp_position(&self, byte_pos: usize) -> (usize, usize) {
2746 let (line, column_bytes) = self
2747 .offset_to_position(byte_pos)
2748 .map(|pos| (pos.line, pos.column))
2749 .unwrap_or_else(|| (byte_pos / 80, 0)); // Estimate if metadata unavailable
2750
2751 // Get the line content
2752 if let Some(line_bytes) = self.get_line(line) {
2753 // Convert byte offset to UTF-16 code units
2754 let text_before = &line_bytes[..column_bytes.min(line_bytes.len())];
2755 let text_str = String::from_utf8_lossy(text_before);
2756 let utf16_offset = text_str.encode_utf16().count();
2757 (line, utf16_offset)
2758 } else {
2759 (line, 0)
2760 }
2761 }
2762
2763 /// Convert LSP position (line, UTF-16 code units) to byte position
2764 /// LSP uses UTF-16 code units for character offsets, not bytes
2765 /// Optimized to use single line_range() call instead of two
2766 pub fn lsp_position_to_byte(&self, line: usize, utf16_offset: usize) -> usize {
2767 if let Some((line_start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2768 // Calculate line length and get line content
2769 let line_len = if let Some(end_offset) = end {
2770 end_offset.saturating_sub(line_start)
2771 } else {
2772 self.total_bytes().saturating_sub(line_start)
2773 };
2774
2775 if line_len > 0 {
2776 // If data is unloaded, return line_start as fallback
2777 let Some(line_bytes) = self.get_text_range(line_start, line_len) else {
2778 return line_start;
2779 };
2780 let line_str = String::from_utf8_lossy(&line_bytes);
2781
2782 // Convert UTF-16 offset to byte offset
2783 let mut utf16_count = 0;
2784 let mut byte_offset = 0;
2785
2786 for ch in line_str.chars() {
2787 if utf16_count >= utf16_offset {
2788 break;
2789 }
2790 utf16_count += ch.len_utf16();
2791 byte_offset += ch.len_utf8();
2792 }
2793
2794 line_start + byte_offset
2795 } else {
2796 line_start
2797 }
2798 } else {
2799 // Line doesn't exist, return end of buffer
2800 self.len()
2801 }
2802 }
2803
2804 // Navigation helpers
2805
2806 /// Find the previous character boundary (UTF-8 aware)
2807 pub fn prev_char_boundary(&self, pos: usize) -> usize {
2808 if pos == 0 {
2809 return 0;
2810 }
2811
2812 // Get a few bytes before pos to find the character boundary
2813 let start = pos.saturating_sub(4);
2814 let Some(bytes) = self.get_text_range(start, pos - start) else {
2815 // Data unloaded, return pos as fallback
2816 return pos;
2817 };
2818
2819 // Walk backwards to find a UTF-8 leading byte
2820 for i in (0..bytes.len()).rev() {
2821 let byte = bytes[i];
2822 // Check if this is a UTF-8 leading byte (not a continuation byte)
2823 if (byte & 0b1100_0000) != 0b1000_0000 {
2824 return start + i;
2825 }
2826 }
2827
2828 // Fallback
2829 pos.saturating_sub(1)
2830 }
2831
2832 /// Find the next character boundary (UTF-8 aware)
2833 pub fn next_char_boundary(&self, pos: usize) -> usize {
2834 let len = self.len();
2835 if pos >= len {
2836 return len;
2837 }
2838
2839 // Get a few bytes after pos to find the character boundary
2840 let end = (pos + 5).min(len);
2841 let Some(bytes) = self.get_text_range(pos, end - pos) else {
2842 // Data unloaded, return pos as fallback
2843 return pos;
2844 };
2845
2846 // Start from index 1 (we want the NEXT boundary)
2847 for (i, &byte) in bytes.iter().enumerate().skip(1) {
2848 // Check if this is a UTF-8 leading byte (not a continuation byte)
2849 if (byte & 0b1100_0000) != 0b1000_0000 {
2850 return pos + i;
2851 }
2852 }
2853
2854 // If we got here, we're at the end or found no boundary in the range
2855 end
2856 }
2857
/// Report whether `byte` is a UTF-8 continuation byte.
///
/// Continuation bytes carry the bit pattern `10xxxxxx`, i.e. they lie in
/// the range `0x80..=0xBF`. A position holding such a byte is not a
/// character boundary.
#[inline]
fn is_utf8_continuation_byte(byte: u8) -> bool {
    matches!(byte, 0x80..=0xBF)
}
2865
2866 /// Snap position to a valid UTF-8 character boundary
2867 /// If already at a boundary, returns the same position.
2868 /// Otherwise, moves to the previous valid boundary.
2869 pub fn snap_to_char_boundary(&self, pos: usize) -> usize {
2870 let len = self.len();
2871 if pos == 0 || pos >= len {
2872 return pos.min(len);
2873 }
2874
2875 // Get the byte at pos to check if we're at a character boundary
2876 let Some(bytes) = self.get_text_range(pos, 1) else {
2877 // Data unloaded, return pos as fallback
2878 return pos;
2879 };
2880
2881 // A position is at a char boundary if the byte there is NOT a continuation byte
2882 if !Self::is_utf8_continuation_byte(bytes[0]) {
2883 // Already at a character boundary
2884 return pos;
2885 }
2886
2887 // Not at a boundary, find the previous one
2888 self.prev_char_boundary(pos)
2889 }
2890
2891 /// Find the previous grapheme cluster boundary (for proper cursor movement with combining characters)
2892 ///
2893 /// This handles complex scripts like Thai where multiple Unicode code points
2894 /// form a single visual character (grapheme cluster). For example, Thai "ที่"
2895 /// is 3 code points but 1 grapheme cluster.
2896 ///
2897 /// The lookahead window starts at 32 bytes but grows whenever the
2898 /// returned boundary sits at the start of the chunk — that is, whenever
2899 /// the chunk might not contain the full grapheme. This matters for ZWJ
2900 /// emoji sequences and Zalgo strings with many combining marks, which
2901 /// can easily exceed 32 bytes.
2902 pub fn prev_grapheme_boundary(&self, pos: usize) -> usize {
2903 if pos == 0 {
2904 return 0;
2905 }
2906
2907 let mut lookback: usize = 32;
2908 loop {
2909 // IMPORTANT: Align start to a valid character boundary to avoid invalid UTF-8
2910 // when get_text_range starts mid-character
2911 let raw_start = pos.saturating_sub(lookback);
2912 let start = if raw_start == 0 {
2913 0
2914 } else {
2915 // Find the character boundary at or before raw_start
2916 self.prev_char_boundary(raw_start + 1)
2917 };
2918
2919 let Some(bytes) = self.get_text_range(start, pos - start) else {
2920 // Data unloaded, fall back to char boundary
2921 return self.prev_char_boundary(pos);
2922 };
2923
2924 let text = match std::str::from_utf8(&bytes) {
2925 Ok(s) => s,
2926 Err(e) => {
2927 // Still got invalid UTF-8 (shouldn't happen after alignment)
2928 // Try using just the valid portion
2929 let valid_bytes = &bytes[..e.valid_up_to()];
2930 match std::str::from_utf8(valid_bytes) {
2931 Ok(s) if !s.is_empty() => s,
2932 _ => return self.prev_char_boundary(pos),
2933 }
2934 }
2935 };
2936
2937 // Use shared grapheme utility with relative position
2938 let rel_pos = pos - start;
2939 let new_rel_pos = grapheme::prev_grapheme_boundary(text, rel_pos);
2940
2941 // If the returned boundary is at the start of our chunk, the
2942 // grapheme may extend further back. Only trust the answer when
2943 // either we already reached the beginning of the buffer or the
2944 // boundary sits strictly inside the chunk.
2945 if new_rel_pos > 0 || start == 0 {
2946 return start + new_rel_pos;
2947 }
2948
2949 // Expand the lookback window and retry. Cap at the full buffer.
2950 if lookback >= pos {
2951 return 0;
2952 }
2953 lookback = lookback.saturating_mul(2);
2954 }
2955 }
2956
2957 /// Find the next grapheme cluster boundary (for proper cursor movement with combining characters)
2958 ///
2959 /// This handles complex scripts like Thai where multiple Unicode code points
2960 /// form a single visual character (grapheme cluster). For example, Thai "ที่"
2961 /// is 3 code points but 1 grapheme cluster.
2962 ///
2963 /// The lookahead window grows whenever the first grapheme reaches the
2964 /// end of the chunk — otherwise ZWJ emoji and Zalgo strings whose byte
2965 /// length exceeds the initial 32-byte window would be split mid-cluster.
2966 pub fn next_grapheme_boundary(&self, pos: usize) -> usize {
2967 let len = self.len();
2968 if pos >= len {
2969 return len;
2970 }
2971
2972 let mut lookahead: usize = 32;
2973 loop {
2974 let end = (pos + lookahead).min(len);
2975 let Some(bytes) = self.get_text_range(pos, end - pos) else {
2976 // Data unloaded, fall back to char boundary
2977 return self.next_char_boundary(pos);
2978 };
2979
2980 // Convert to UTF-8 string, handling the case where we might have
2981 // grabbed bytes that end mid-character (truncate to valid UTF-8)
2982 let text = match std::str::from_utf8(&bytes) {
2983 Ok(s) => s,
2984 Err(e) => {
2985 // The bytes end in an incomplete UTF-8 sequence
2986 // Use only the valid portion (which includes at least the first grapheme)
2987 let valid_bytes = &bytes[..e.valid_up_to()];
2988 match std::str::from_utf8(valid_bytes) {
2989 Ok(s) if !s.is_empty() => s,
2990 _ => return self.next_char_boundary(pos),
2991 }
2992 }
2993 };
2994
2995 let new_rel_pos = grapheme::next_grapheme_boundary(text, 0);
2996
2997 // If the first grapheme reaches the end of our chunk and there
2998 // is more buffer left beyond it, the grapheme may extend further.
2999 // Expand the window and retry.
3000 if new_rel_pos == text.len() && end < len {
3001 if lookahead >= len - pos {
3002 return len;
3003 }
3004 lookahead = lookahead.saturating_mul(2);
3005 continue;
3006 }
3007
3008 return pos + new_rel_pos;
3009 }
3010 }
3011
3012 /// Find the previous word boundary
3013 pub fn prev_word_boundary(&self, pos: usize) -> usize {
3014 if pos == 0 {
3015 return 0;
3016 }
3017
3018 // Get some text before pos
3019 let start = pos.saturating_sub(256).max(0);
3020 let Some(bytes) = self.get_text_range(start, pos - start) else {
3021 // Data unloaded, return pos as fallback
3022 return pos;
3023 };
3024 let text = String::from_utf8_lossy(&bytes);
3025
3026 let mut found_word_char = false;
3027 let chars: Vec<char> = text.chars().collect();
3028
3029 for i in (0..chars.len()).rev() {
3030 let ch = chars[i];
3031 let is_word_char = ch.is_alphanumeric() || ch == '_';
3032
3033 if found_word_char && !is_word_char {
3034 // We've transitioned from word to non-word
3035 // Calculate the byte position
3036 let byte_offset: usize = chars[0..=i].iter().map(|c| c.len_utf8()).sum();
3037 return start + byte_offset;
3038 }
3039
3040 if is_word_char {
3041 found_word_char = true;
3042 }
3043 }
3044
3045 0
3046 }
3047
3048 /// Find the next word boundary
3049 pub fn next_word_boundary(&self, pos: usize) -> usize {
3050 let len = self.len();
3051 if pos >= len {
3052 return len;
3053 }
3054
3055 // Get some text after pos
3056 let end = (pos + 256).min(len);
3057 let Some(bytes) = self.get_text_range(pos, end - pos) else {
3058 // Data unloaded, return pos as fallback
3059 return pos;
3060 };
3061 let text = String::from_utf8_lossy(&bytes);
3062
3063 let mut found_word_char = false;
3064 let mut byte_offset = 0;
3065
3066 for ch in text.chars() {
3067 let is_word_char = ch.is_alphanumeric() || ch == '_';
3068
3069 if found_word_char && !is_word_char {
3070 // We've transitioned from word to non-word
3071 return pos + byte_offset;
3072 }
3073
3074 if is_word_char {
3075 found_word_char = true;
3076 }
3077
3078 byte_offset += ch.len_utf8();
3079 }
3080
3081 len
3082 }
3083
/// Create a line iterator starting at the given byte position.
///
/// Thin wrapper around `LineIterator::new`; the buffer is borrowed mutably
/// for the lifetime of the returned iterator.
///
/// # Arguments
///
/// * `byte_pos` - Byte offset at which iteration starts
/// * `estimated_line_length` - Line-length estimate forwarded to the iterator
///
/// NOTE(review): earlier documentation states that the iterator loads
/// chunks lazily (1MB at a time for unloaded buffers) and never scans the
/// whole file; that behavior lives in `LineIterator` and is not visible
/// here — confirm against that module.
pub fn line_iterator(
    &mut self,
    byte_pos: usize,
    estimated_line_length: usize,
) -> LineIterator<'_> {
    LineIterator::new(self, byte_pos, estimated_line_length)
}
3095
/// Iterate over lines starting from a given byte offset, with line numbers.
///
/// Delegates to `TextBufferLineIterator::new`, which eagerly collects up
/// to `max_lines` lines. The starting line number is resolved once via
/// `offset_to_position` and then incremented for every newline found, so
/// no per-line position lookup is needed.
///
/// # Arguments
///
/// * `byte_pos` - Byte offset at which the first line starts
/// * `max_lines` - Maximum number of lines to collect
///
/// # Returns
///
/// An iterator yielding `LineData` items (byte offset, content, newline
/// flag and optional line number):
/// - `line_number` is `Some(n)` when the buffer reports a line count
/// - `line_number` is `None` when it does not (no line metadata)
///
/// # Errors
///
/// Propagates any error returned while reading text from the buffer.
pub fn iter_lines_from(
    &mut self,
    byte_pos: usize,
    max_lines: usize,
) -> Result<TextBufferLineIterator> {
    TextBufferLineIterator::new(self, byte_pos, max_lines)
}
3116
3117 // Legacy API methods for backwards compatibility
3118
3119 /// Get the line number for a given byte offset
3120 ///
3121 /// Returns exact line number if metadata available, otherwise estimates based on bytes.
3122 ///
3123 /// # Behavior by File Size:
3124 /// - **Small files (< 1MB)**: Returns exact line number from piece tree's `line_starts` metadata
3125 /// - **Large files (≥ 1MB)**: Returns estimated line number using `byte_offset / estimated_line_length`
3126 ///
3127 /// Large files don't maintain line metadata for performance reasons. The estimation
3128 /// uses the configured `estimated_line_length` (default 80 bytes).
3129 pub fn get_line_number(&self, byte_offset: usize) -> usize {
3130 self.offset_to_position(byte_offset)
3131 .map(|pos| pos.line)
3132 .unwrap_or_else(|| {
3133 // Estimate line number based on configured average line length
3134 byte_offset / self.config.estimated_line_length
3135 })
3136 }
3137
/// Get the configured estimated line length (in bytes) used for
/// approximate line number calculations such as the fallback in
/// `get_line_number`.
pub fn estimated_line_length(&self) -> usize {
    self.config.estimated_line_length
}
3142
/// Get the starting line number at a byte offset (used for viewport rendering).
///
/// Despite its name this method populates nothing: it simply returns
/// `get_line_number(start_byte)`. The `_line_count` argument is ignored.
///
/// # Line Cache Architecture (Post-Refactoring)
///
/// The concept of a separate "line cache" is **obsolete**. Line tracking
/// is integrated into the piece tree via (illustrative sketch, not
/// compilable code):
/// ```text
/// BufferData::Loaded {
///     data: Vec<u8>,
///     line_starts: Option<Vec<usize>>  // None = large file mode (no line metadata)
/// }
/// ```
///
/// ## Why This Method Still Exists
/// The rendering code needs to know what line number to display in the
/// margin at the top of the viewport. This method returns that line
/// number, handling both small and large file modes transparently.
///
/// ## Small vs Large File Modes
/// - **Small files**: `line_starts = Some(vec)` → exact line number from metadata
/// - **Large files**: `line_starts = None` → estimated line number
///   (`byte_offset / estimated_line_length`)
///
/// ## Legacy Line Cache Methods
/// These methods are no-ops and can be removed in a future cleanup:
/// - `invalidate_line_cache_from()`
/// - `handle_line_cache_insertion()`
/// - `handle_line_cache_deletion()`
/// - `clear_line_cache()`
///
/// ## Bug Fix (2025-11)
/// Previously this method always returned `0`, causing line numbers in the
/// margin to always show 1, 2, 3... regardless of scroll position. It now
/// returns the actual line number at `start_byte`.
pub fn populate_line_cache(&mut self, start_byte: usize, _line_count: usize) -> usize {
    // Nothing to populate; only the line number at `start_byte` is needed
    // by the renderer.
    self.get_line_number(start_byte)
}
3181
/// Get the byte offset at which `line_number` starts (compatibility method).
///
/// Forwards to `line_start_offset`; no separate cache is consulted.
pub fn get_cached_byte_offset_for_line(&self, line_number: usize) -> Option<usize> {
    self.line_start_offset(line_number)
}
3186
/// Invalidate line cache from offset (no-op in new implementation).
///
/// The argument is ignored and no state is modified.
pub fn invalidate_line_cache_from(&mut self, _byte_offset: usize) {
    // Intentionally empty: there is no separate line cache to invalidate.
}
3191
/// Handle line cache insertion (no-op in new implementation).
///
/// Both arguments are ignored and no state is modified.
pub fn handle_line_cache_insertion(&mut self, _byte_offset: usize, _bytes_inserted: usize) {
    // Intentionally empty: there is no separate line cache to update on insert.
}
3196
/// Handle line cache deletion (no-op in new implementation).
///
/// Both arguments are ignored and no state is modified.
pub fn handle_line_cache_deletion(&mut self, _byte_offset: usize, _bytes_deleted: usize) {
    // Intentionally empty: there is no separate line cache to update on delete.
}
3201
/// Clear line cache (no-op in new implementation).
///
/// No state is modified.
pub fn clear_line_cache(&mut self) {
    // Intentionally empty: there is no separate line cache to clear.
}
3206
3207 // Test helper methods
3208
/// Create a buffer from a string for testing.
///
/// Copies the bytes of `s` into a new buffer built with `from_bytes`,
/// backed by a `StdFileSystem`. Only compiled in test builds.
#[cfg(test)]
pub fn from_str_test(s: &str) -> Self {
    Self::from_bytes(
        s.as_bytes().to_vec(),
        std::sync::Arc::new(crate::model::filesystem::StdFileSystem),
    )
}
3217
/// Create a new empty buffer for testing.
///
/// Built with `empty`, backed by a `StdFileSystem`. Only compiled in test
/// builds.
#[cfg(test)]
pub fn new_test() -> Self {
    Self::empty(std::sync::Arc::new(crate::model::filesystem::StdFileSystem))
}
3223}
3224
/// Type alias for backwards compatibility: `Buffer` and `TextBuffer` name
/// the same type.
pub type Buffer = TextBuffer;
3227
3228// Re-export LineIterator from the line_iterator module
3229pub use crate::primitives::line_iterator::LineIterator;
3230
3231// ============================================================================
3232// Overlapping Chunks Iterator for Efficient Search
3233// ============================================================================
3234
/// Information about a chunk of data for pattern matching.
///
/// Produced by [`OverlappingChunks`]; each value owns a copy of the chunk
/// bytes.
#[derive(Debug)]
pub struct ChunkInfo {
    /// The buffer containing this chunk's data (includes overlap from previous chunk)
    pub buffer: Vec<u8>,

    /// Absolute position in the document where this buffer starts
    pub absolute_pos: usize,

    /// Offset within buffer where "new" data starts (valid match zone).
    /// Matches starting before this offset were already checked in the previous chunk.
    pub valid_start: usize,
}
3248
/// Iterator that yields overlapping chunks for pattern matching
///
/// This iterator implements the VSCode/Sublime approach: pull overlapping chunks
/// from the underlying piece tree and use standard search algorithms on them.
///
/// # Algorithm
///
/// ```text
/// Chunk 1: [------------ valid -----------]
/// Chunk 2:                        [overlap][---- valid ----]
/// Chunk 3:                                          [overlap][-- valid --]
///
/// Only matches starting in the "valid" zone are reported to avoid duplicates.
/// ```
///
/// # Example
///
/// ```ignore
/// let chunks = OverlappingChunks::new(&text_buffer, start, end, 4096, pattern.len()-1);
/// for chunk in chunks {
///     // Search only starting from chunk.valid_start
///     if let Some(pos) = search(&chunk.buffer[chunk.valid_start..]) {
///         let absolute_pos = chunk.absolute_pos + chunk.valid_start + pos;
///         return Some(absolute_pos);
///     }
/// }
/// ```
pub struct OverlappingChunks<'a> {
    // Source of the pieces covering the requested document range
    piece_iter: PieceRangeIter,
    // Backing buffers that the pieces refer to by buffer id
    buffers: &'a [StringBuffer],

    // Reusable chunk buffer that we fill from pieces
    buffer: Vec<u8>,
    // Document position of `buffer[0]`
    buffer_absolute_pos: usize,

    // Current state: next document position to read, and exclusive end
    current_pos: usize,
    end_pos: usize,

    // Configuration: new bytes per chunk, and bytes carried over between chunks
    chunk_size: usize,
    overlap: usize,

    // Track first chunk special case
    first_chunk: bool,

    // Cached piece data for incremental reading, and the read cursor into it
    current_piece_data: Option<Vec<u8>>,
    current_piece_offset: usize,
}
3299
3300impl<'a> OverlappingChunks<'a> {
3301 /// Create a new overlapping chunks iterator
3302 ///
3303 /// # Arguments
3304 ///
3305 /// * `text_buffer` - The text buffer to iterate over
3306 /// * `start` - Start position in the document
3307 /// * `end` - End position in the document (exclusive)
3308 /// * `chunk_size` - Target size for each chunk (excluding overlap)
3309 /// * `overlap` - Number of bytes to overlap between chunks
3310 ///
3311 /// # Recommendations
3312 ///
3313 /// * For literal string search: `chunk_size=65536, overlap=pattern.len()-1`
3314 /// * For regex search: `chunk_size=1048576, overlap=4096`
3315 pub fn new(
3316 text_buffer: &'a TextBuffer,
3317 start: usize,
3318 end: usize,
3319 chunk_size: usize,
3320 overlap: usize,
3321 ) -> Self {
3322 let piece_iter = text_buffer.piece_tree.iter_pieces_in_range(start, end);
3323
3324 Self {
3325 piece_iter,
3326 buffers: &text_buffer.buffers,
3327 buffer: Vec::with_capacity(chunk_size + overlap),
3328 buffer_absolute_pos: start,
3329 current_pos: start,
3330 end_pos: end,
3331 chunk_size,
3332 overlap,
3333 first_chunk: true,
3334 current_piece_data: None,
3335 current_piece_offset: 0,
3336 }
3337 }
3338
3339 /// Read one byte from the piece iterator
3340 fn read_byte(&mut self) -> Option<u8> {
3341 loop {
3342 // If we have cached piece data, read from it
3343 if let Some(ref data) = self.current_piece_data {
3344 if self.current_piece_offset < data.len() {
3345 let byte = data[self.current_piece_offset];
3346 self.current_piece_offset += 1;
3347 self.current_pos += 1;
3348 return Some(byte);
3349 } else {
3350 // Exhausted current piece, move to next
3351 self.current_piece_data = None;
3352 self.current_piece_offset = 0;
3353 }
3354 }
3355
3356 // Get next piece
3357 if let Some(piece_view) = self.piece_iter.next() {
3358 let buffer_id = piece_view.location.buffer_id();
3359 if let Some(buffer) = self.buffers.get(buffer_id) {
3360 // Extract the relevant slice from this piece
3361 let piece_start_in_doc = piece_view.doc_offset;
3362 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
3363
3364 // Clip to our search range
3365 let read_start = self.current_pos.max(piece_start_in_doc);
3366 let read_end = self.end_pos.min(piece_end_in_doc);
3367
3368 if read_end > read_start {
3369 let offset_in_piece = read_start - piece_start_in_doc;
3370 let bytes_to_read = read_end - read_start;
3371
3372 let buffer_start = piece_view.buffer_offset + offset_in_piece;
3373 let buffer_end = buffer_start + bytes_to_read;
3374
3375 if let Some(data) = buffer.get_data() {
3376 if buffer_end <= data.len() {
3377 // Cache this piece's data
3378 self.current_piece_data =
3379 Some(data[buffer_start..buffer_end].to_vec());
3380 self.current_piece_offset = 0;
3381 continue;
3382 }
3383 }
3384 }
3385 }
3386 }
3387
3388 // No more data
3389 return None;
3390 }
3391 }
3392
3393 /// Fill the buffer with the next chunk of data
3394 fn fill_next_chunk(&mut self) -> bool {
3395 if self.first_chunk {
3396 // First chunk: fill up to chunk_size
3397 self.first_chunk = false;
3398 while self.buffer.len() < self.chunk_size && self.current_pos < self.end_pos {
3399 if let Some(byte) = self.read_byte() {
3400 self.buffer.push(byte);
3401 } else {
3402 break;
3403 }
3404 }
3405 !self.buffer.is_empty()
3406 } else {
3407 // Subsequent chunks: keep overlap, fill chunk_size NEW bytes
3408 if self.current_pos >= self.end_pos {
3409 return false;
3410 }
3411
3412 // Keep overlap bytes at the end
3413 if self.buffer.len() > self.overlap {
3414 let drain_amount = self.buffer.len() - self.overlap;
3415 self.buffer.drain(0..drain_amount);
3416 self.buffer_absolute_pos += drain_amount;
3417 }
3418
3419 // Fill chunk_size NEW bytes (in addition to overlap)
3420 let before_len = self.buffer.len();
3421 let target_len = self.overlap + self.chunk_size;
3422 while self.buffer.len() < target_len && self.current_pos < self.end_pos {
3423 if let Some(byte) = self.read_byte() {
3424 self.buffer.push(byte);
3425 } else {
3426 break;
3427 }
3428 }
3429
3430 // Return true if we added new data
3431 self.buffer.len() > before_len
3432 }
3433 }
3434}
3435
3436impl<'a> Iterator for OverlappingChunks<'a> {
3437 type Item = ChunkInfo;
3438
3439 fn next(&mut self) -> Option<Self::Item> {
3440 // Track if this is the first chunk before filling
3441 let is_first = self.buffer_absolute_pos == self.current_pos;
3442
3443 if !self.fill_next_chunk() {
3444 return None;
3445 }
3446
3447 // First chunk: all data is valid (no overlap from previous)
3448 // Subsequent chunks: overlap bytes are not valid (already checked)
3449 let valid_start = if is_first {
3450 0
3451 } else {
3452 self.overlap.min(self.buffer.len())
3453 };
3454
3455 Some(ChunkInfo {
3456 buffer: self.buffer.clone(),
3457 absolute_pos: self.buffer_absolute_pos,
3458 valid_start,
3459 })
3460 }
3461}
3462
3463#[cfg(test)]
3464mod tests;
3465
3466#[cfg(test)]
3467mod property_tests;
3468
/// Line data with optional line number.
///
/// One item produced by [`TextBufferLineIterator`].
#[derive(Debug, Clone)]
pub struct LineData {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Line content (without trailing newline), decoded lossily as UTF-8
    pub content: String,
    /// Whether this line ends with a newline
    pub has_newline: bool,
    /// Line number (None for large files without line metadata)
    pub line_number: Option<usize>,
}
3481
/// Iterator over lines in a TextBuffer that tracks line numbers while
/// scanning forward from a starting offset.
///
/// All lines are collected when the iterator is constructed; iteration
/// then hands them out one by one.
pub struct TextBufferLineIterator {
    /// Collected lines (we collect all at once since we need mutable access to load chunks)
    lines: Vec<LineData>,
    /// Current index in the lines vector
    current_index: usize,
    /// Whether there are more lines after these
    pub has_more: bool,
}
3492
3493impl TextBufferLineIterator {
3494 pub(crate) fn new(buffer: &mut TextBuffer, byte_pos: usize, max_lines: usize) -> Result<Self> {
3495 let buffer_len = buffer.len();
3496 if byte_pos >= buffer_len {
3497 return Ok(Self {
3498 lines: Vec::new(),
3499 current_index: 0,
3500 has_more: false,
3501 });
3502 }
3503
3504 // Check if buffer has line metadata (None for large files > 1MB)
3505 let has_line_metadata = buffer.line_count().is_some();
3506
3507 // Determine starting line number by querying piece tree once
3508 // (only if we have line metadata)
3509 let mut current_line = if has_line_metadata {
3510 buffer.offset_to_position(byte_pos).map(|pos| pos.line)
3511 } else {
3512 None
3513 };
3514
3515 let mut lines = Vec::with_capacity(max_lines);
3516 let mut current_offset = byte_pos;
3517 let estimated_line_length = 80; // Use default estimate
3518
3519 // Collect lines by scanning forward
3520 for _ in 0..max_lines {
3521 if current_offset >= buffer_len {
3522 break;
3523 }
3524
3525 let line_start = current_offset;
3526 let line_number = current_line;
3527
3528 // Estimate how many bytes to load for this line
3529 let estimated_max_line_length = estimated_line_length * 3;
3530 let bytes_to_scan = estimated_max_line_length.min(buffer_len - current_offset);
3531
3532 // Load chunk (this handles lazy loading)
3533 let chunk = buffer.get_text_range_mut(current_offset, bytes_to_scan)?;
3534
3535 // Scan for newline
3536 let mut line_len = 0;
3537 let mut found_newline = false;
3538 for &byte in chunk.iter() {
3539 line_len += 1;
3540 if byte == b'\n' {
3541 found_newline = true;
3542 break;
3543 }
3544 }
3545
3546 // Handle long lines (rare case)
3547 if !found_newline && current_offset + line_len < buffer_len {
3548 // Line is longer than expected, load more data
3549 let remaining = buffer_len - current_offset - line_len;
3550 let additional_bytes = estimated_max_line_length.min(remaining);
3551 let more_chunk =
3552 buffer.get_text_range_mut(current_offset + line_len, additional_bytes)?;
3553
3554 let mut extended_chunk = chunk;
3555 extended_chunk.extend_from_slice(&more_chunk);
3556
3557 for &byte in more_chunk.iter() {
3558 line_len += 1;
3559 if byte == b'\n' {
3560 found_newline = true;
3561 break;
3562 }
3563 }
3564
3565 let line_string = String::from_utf8_lossy(&extended_chunk[..line_len]).into_owned();
3566 let has_newline = line_string.ends_with('\n');
3567 let content = if has_newline {
3568 line_string[..line_string.len() - 1].to_string()
3569 } else {
3570 line_string
3571 };
3572
3573 lines.push(LineData {
3574 byte_offset: line_start,
3575 content,
3576 has_newline,
3577 line_number,
3578 });
3579
3580 current_offset += line_len;
3581 if has_line_metadata && found_newline {
3582 current_line = current_line.map(|n| n + 1);
3583 }
3584 continue;
3585 }
3586
3587 // Normal case
3588 let line_string = String::from_utf8_lossy(&chunk[..line_len]).into_owned();
3589 let has_newline = line_string.ends_with('\n');
3590 let content = if has_newline {
3591 line_string[..line_string.len() - 1].to_string()
3592 } else {
3593 line_string
3594 };
3595
3596 lines.push(LineData {
3597 byte_offset: line_start,
3598 content,
3599 has_newline,
3600 line_number,
3601 });
3602
3603 current_offset += line_len;
3604 // Increment line number if we have metadata and found a newline
3605 if has_line_metadata && found_newline {
3606 current_line = current_line.map(|n| n + 1);
3607 }
3608 }
3609
3610 // Check if there are more lines
3611 let has_more = current_offset < buffer_len;
3612
3613 Ok(Self {
3614 lines,
3615 current_index: 0,
3616 has_more,
3617 })
3618 }
3619}
3620
3621impl Iterator for TextBufferLineIterator {
3622 type Item = LineData;
3623
3624 fn next(&mut self) -> Option<Self::Item> {
3625 if self.current_index < self.lines.len() {
3626 let line = self.lines[self.current_index].clone();
3627 self.current_index += 1;
3628 Some(line)
3629 } else {
3630 None
3631 }
3632 }
3633}