fresh/model/buffer/mod.rs
//! Text buffer that uses PieceTree with integrated line tracking.
//!
//! Architecture where the tree is the single source of truth for text and line information.
3use crate::model::encoding;
4use crate::model::filesystem::{FileSearchOptions, FileSystem};
5use crate::model::piece_tree::{
6 BufferData, BufferLocation, Cursor, PieceInfo, PieceRangeIter, PieceTree, PieceView, Position,
7 StringBuffer, TreeStats,
8};
9use crate::model::piece_tree_diff::PieceTreeDiff;
10use crate::primitives::grapheme;
11use anyhow::{Context, Result};
12use regex::bytes::Regex;
13use std::io;
14
15use std::ops::Range;
16use std::path::{Path, PathBuf};
17use std::sync::Arc;
18
19// Re-export Encoding for backward compatibility
20pub use encoding::Encoding;
21
22pub mod file_kind;
23pub mod format;
24pub mod persistence;
25pub mod save;
26pub mod search;
27pub use file_kind::BufferFileKind;
28pub use format::{BufferFormat, LineEnding};
29pub use persistence::Persistence;
30pub use save::SudoSaveRequired;
31#[cfg(test)]
32pub(crate) use save::{RecipeAction, WriteRecipe};
33#[cfg(test)]
34use search::search_boundary_overlap;
35use search::SearchRegion;
36pub use search::{ChunkedSearchState, HybridSearchPlan};
37
38/// Error returned when a large file has a non-resynchronizable encoding
39/// and requires user confirmation before loading the entire file into memory.
40///
41/// Non-resynchronizable encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot
42/// determine character boundaries when jumping into the middle of a file.
43/// This means the entire file must be loaded and decoded sequentially.
44#[derive(Debug, Clone, PartialEq)]
45pub struct LargeFileEncodingConfirmation {
46 /// Path to the file
47 pub path: PathBuf,
48 /// Size of the file in bytes
49 pub file_size: usize,
50 /// The detected encoding that requires full loading
51 pub encoding: Encoding,
52}
53
54impl std::fmt::Display for LargeFileEncodingConfirmation {
55 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56 let size_mb = self.file_size as f64 / (1024.0 * 1024.0);
57 write!(
58 f,
59 "{} ({:.0} MB) requires full load. (l)oad, (e)ncoding, (C)ancel? ",
60 self.encoding.display_name(),
61 size_mb
62 )
63 }
64}
65
66impl std::error::Error for LargeFileEncodingConfirmation {}
67
/// One unit of work for the incremental line-feed scan; each leaf gets one chunk.
#[derive(Debug, Clone)]
pub struct LineScanChunk {
    /// Position of the leaf within the piece tree's leaf array.
    pub leaf_index: usize,
    /// Size of the leaf, in bytes.
    pub byte_len: usize,
    /// Set when the leaf's line_feed_cnt was already known, so no I/O is needed.
    pub already_known: bool,
}
78
79// Re-export SearchMatch from filesystem — same type is used by both
80// FileSystem::search_file (project grep on disk) and the piece-tree
81// search below (in-editor Ctrl+F and dirty buffers).
82pub use crate::model::filesystem::SearchMatch;
83
// Tunables for large-file handling.

/// Size (in bytes) used as the default cut-off for treating a file as "large": 100 MB.
pub const DEFAULT_LARGE_FILE_THRESHOLD: usize = 100 * (1 << 20);

/// Number of bytes loaded per chunk when lazy loading: 1 MB.
pub const LOAD_CHUNK_SIZE: usize = 1 << 20;

/// Alignment of lazily loaded chunks: 64 KB.
pub const CHUNK_ALIGNMENT: usize = 64 * (1 << 10);
93
/// Settings handed to `TextBuffer` constructors.
#[derive(Debug, Clone)]
pub struct BufferConfig {
    /// Estimated average length of a line, in bytes. Drives the approximate
    /// line numbers shown for large files and the byte-offset estimate used
    /// by goto-line.
    pub estimated_line_length: usize,
}

impl Default for BufferConfig {
    /// Default configuration: lines are estimated at 80 bytes each.
    fn default() -> Self {
        let estimated_line_length = 80;
        Self {
            estimated_line_length,
        }
    }
}
109
/// Represents a line number (simplified for the new implementation).
///
/// Legacy enum kept for backwards compatibility. Both variants carry the same
/// kind of `line` index; [`LineNumber::format`] prints it as `line + 1`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineNumber {
    /// Absolute line number - this is the actual line number in the file.
    Absolute(usize),
    /// Relative line number (deprecated). `value()` treats it exactly like
    /// `Absolute`; only `format()` distinguishes it, with a `~` prefix.
    Relative {
        line: usize,
        from_cached_line: usize,
    },
}

impl LineNumber {
    /// Returns the stored line index, regardless of variant.
    pub fn value(&self) -> usize {
        match self {
            Self::Absolute(line) | Self::Relative { line, .. } => *line,
        }
    }

    /// Returns `true` for the `Absolute` variant.
    pub fn is_absolute(&self) -> bool {
        matches!(self, LineNumber::Absolute(_))
    }

    /// Returns `true` for the `Relative` variant.
    pub fn is_relative(&self) -> bool {
        matches!(self, LineNumber::Relative { .. })
    }

    /// Formats the line number for display as `line + 1`; relative numbers
    /// are prefixed with `~`.
    pub fn format(&self) -> String {
        match self {
            // `to_string()` is the idiomatic way to stringify a single value.
            Self::Absolute(line) => (line + 1).to_string(),
            Self::Relative { line, .. } => format!("~{}", line + 1),
        }
    }
}
151
152/// A text buffer that manages document content using a piece table
153/// with integrated line tracking
154pub struct TextBuffer {
155 /// The piece tree for efficient text manipulation with integrated line tracking
156 piece_tree: PieceTree,
157
158 /// List of string buffers containing chunks of text data.
159 /// Index 0 is typically the original/stored buffer.
160 /// Additional buffers are added for modifications.
161 buffers: Vec<StringBuffer>,
162
163 /// Next buffer ID to assign.
164 next_buffer_id: usize,
165
166 /// Filesystem handle, optional file path, dirty/recovery flags,
167 /// saved-root snapshot, and saved-file size — see
168 /// `persistence.rs`.
169 persistence: Persistence,
170
171 /// File-kind flags (large_file, line_feeds_scanned, is_binary) —
172 /// see `file_kind.rs`.
173 file_kind: BufferFileKind,
174
175 /// Encoding + line-ending state — see `format.rs`.
176 format: BufferFormat,
177
178 /// Monotonic version counter for change tracking.
179 version: u64,
180
181 /// Buffer configuration (estimated line length, etc.)
182 config: BufferConfig,
183}
184
185/// Snapshot of a TextBuffer's piece tree and associated string buffers.
186///
187/// Used by BulkEdit undo/redo to capture the complete buffer state.
188/// Without this, consolidate_after_save() would destroy the string buffers
189/// that a BulkEdit's piece tree snapshot references, causing corruption on undo.
190#[derive(Debug, Clone)]
191pub struct BufferSnapshot {
192 pub piece_tree: PieceTree,
193 pub buffers: Vec<StringBuffer>,
194 pub next_buffer_id: usize,
195}
196
197impl TextBuffer {
198 /// Create a new text buffer with the given filesystem implementation.
199 /// Note: large_file_threshold is ignored in the new implementation
200 pub fn new(_large_file_threshold: usize, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
201 let piece_tree = PieceTree::empty();
202 let saved_root = piece_tree.root();
203 let line_ending = LineEnding::default();
204 let encoding = Encoding::default();
205 TextBuffer {
206 piece_tree,
207 buffers: vec![StringBuffer::new(0, Vec::new())],
208 next_buffer_id: 1,
209 persistence: Persistence::new(fs, None, saved_root, None),
210 file_kind: BufferFileKind::new(false, false),
211 format: BufferFormat::new(line_ending, encoding),
212 version: 0,
213 config: BufferConfig::default(),
214 }
215 }
216
217 /// Create an empty buffer associated with a file path.
218 /// Used for files that don't exist yet — the path is set so saving will create the file.
219 pub fn new_with_path(
220 large_file_threshold: usize,
221 fs: Arc<dyn FileSystem + Send + Sync>,
222 path: PathBuf,
223 ) -> Self {
224 let mut buffer = Self::new(large_file_threshold, fs);
225 buffer.persistence.set_file_path(path);
226 buffer
227 }
228
229 /// Current buffer version (monotonic, wraps on overflow)
230 pub fn version(&self) -> u64 {
231 self.version
232 }
233
234 /// Get a reference to the filesystem implementation used by this buffer.
235 pub fn filesystem(&self) -> &Arc<dyn FileSystem + Send + Sync> {
236 self.persistence.fs()
237 }
238
239 /// Set the filesystem implementation for this buffer.
240 pub fn set_filesystem(&mut self, fs: Arc<dyn FileSystem + Send + Sync>) {
241 self.persistence.set_fs(fs);
242 }
243
244 #[inline]
245 fn bump_version(&mut self) {
246 self.version = self.version.wrapping_add(1);
247 }
248
249 #[inline]
250 fn mark_content_modified(&mut self) {
251 self.persistence.mark_dirty();
252 self.bump_version();
253 }
254
255 /// Create a text buffer from raw bytes WITHOUT encoding conversion.
256 /// Used for binary files where we want to preserve the exact bytes.
257 fn from_bytes_raw(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
258 let bytes = content.len();
259
260 // For binary files, detect line ending but don't convert encoding
261 let line_ending = format::detect_line_ending(&content);
262
263 // Create initial StringBuffer with ID 0
264 let buffer = StringBuffer::new(0, content);
265 let line_feed_cnt = buffer.line_feed_count();
266
267 let piece_tree = if bytes > 0 {
268 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
269 } else {
270 PieceTree::empty()
271 };
272
273 let saved_root = piece_tree.root();
274
275 TextBuffer {
276 piece_tree,
277 buffers: vec![buffer],
278 next_buffer_id: 1,
279 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
280 file_kind: BufferFileKind::new(false, true),
281 format: BufferFormat::new(line_ending, Encoding::Utf8),
282 version: 0,
283 config: BufferConfig::default(),
284 }
285 }
286
287 /// Create a text buffer from initial content with the given filesystem.
288 pub fn from_bytes(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
289 // Auto-detect encoding and convert to UTF-8 if needed
290 let (encoding, utf8_content) = format::detect_and_convert_encoding(&content);
291
292 let bytes = utf8_content.len();
293
294 // Auto-detect line ending format from content
295 let line_ending = format::detect_line_ending(&utf8_content);
296
297 // Create initial StringBuffer with ID 0
298 let buffer = StringBuffer::new(0, utf8_content);
299 let line_feed_cnt = buffer.line_feed_count();
300
301 let piece_tree = if bytes > 0 {
302 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
303 } else {
304 PieceTree::empty()
305 };
306
307 let saved_root = piece_tree.root();
308
309 TextBuffer {
310 piece_tree,
311 buffers: vec![buffer],
312 next_buffer_id: 1,
313 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
314 file_kind: BufferFileKind::new(false, false),
315 format: BufferFormat::new(line_ending, encoding),
316 version: 0,
317 config: BufferConfig::default(),
318 }
319 }
320
321 /// Create a text buffer from bytes with a specific encoding (no auto-detection).
322 pub fn from_bytes_with_encoding(
323 content: Vec<u8>,
324 encoding: Encoding,
325 fs: Arc<dyn FileSystem + Send + Sync>,
326 ) -> Self {
327 // Convert from specified encoding to UTF-8
328 let utf8_content = encoding::convert_to_utf8(&content, encoding);
329
330 let bytes = utf8_content.len();
331
332 // Auto-detect line ending format from content
333 let line_ending = format::detect_line_ending(&utf8_content);
334
335 // Create initial StringBuffer with ID 0
336 let buffer = StringBuffer::new(0, utf8_content);
337 let line_feed_cnt = buffer.line_feed_count();
338
339 let piece_tree = if bytes > 0 {
340 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
341 } else {
342 PieceTree::empty()
343 };
344
345 let saved_root = piece_tree.root();
346
347 TextBuffer {
348 piece_tree,
349 buffers: vec![buffer],
350 next_buffer_id: 1,
351 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
352 file_kind: BufferFileKind::new(false, false),
353 format: BufferFormat::new(line_ending, encoding),
354 version: 0,
355 config: BufferConfig::default(),
356 }
357 }
358
359 /// Create a text buffer from a string with the given filesystem.
360 pub fn from_str(
361 s: &str,
362 _large_file_threshold: usize,
363 fs: Arc<dyn FileSystem + Send + Sync>,
364 ) -> Self {
365 Self::from_bytes(s.as_bytes().to_vec(), fs)
366 }
367
368 /// Create an empty text buffer with the given filesystem.
369 pub fn empty(fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
370 let piece_tree = PieceTree::empty();
371 let saved_root = piece_tree.root();
372 let line_ending = LineEnding::default();
373 let encoding = Encoding::default();
374 TextBuffer {
375 piece_tree,
376 buffers: vec![StringBuffer::new(0, Vec::new())],
377 next_buffer_id: 1,
378 persistence: Persistence::new(fs, None, saved_root, None),
379 file_kind: BufferFileKind::new(false, false),
380 format: BufferFormat::new(line_ending, encoding),
381 version: 0,
382 config: BufferConfig::default(),
383 }
384 }
385
386 /// Load a text buffer from a file using the given filesystem.
387 pub fn load_from_file<P: AsRef<Path>>(
388 path: P,
389 large_file_threshold: usize,
390 fs: Arc<dyn FileSystem + Send + Sync>,
391 ) -> anyhow::Result<Self> {
392 let path = path.as_ref();
393
394 // Get file size to determine loading strategy
395 let metadata = fs.metadata(path)?;
396 let file_size = metadata.size as usize;
397
398 // Use threshold parameter or default
399 let threshold = if large_file_threshold > 0 {
400 large_file_threshold
401 } else {
402 DEFAULT_LARGE_FILE_THRESHOLD
403 };
404
405 // Choose loading strategy based on file size
406 if file_size >= threshold {
407 Self::load_large_file(path, file_size, fs)
408 } else {
409 Self::load_small_file(path, fs)
410 }
411 }
412
413 /// Load a text buffer from a file with a specific encoding (no auto-detection).
414 pub fn load_from_file_with_encoding<P: AsRef<Path>>(
415 path: P,
416 encoding: Encoding,
417 fs: Arc<dyn FileSystem + Send + Sync>,
418 config: BufferConfig,
419 ) -> anyhow::Result<Self> {
420 let path = path.as_ref();
421 let contents = fs.read_file(path)?;
422
423 let mut buffer = Self::from_bytes_with_encoding(contents, encoding, fs);
424 buffer.persistence.set_file_path(path.to_path_buf());
425 buffer.persistence.clear_modified();
426 buffer.config = config;
427 Ok(buffer)
428 }
429
430 /// Load a small file with full eager loading and line indexing
431 fn load_small_file(path: &Path, fs: Arc<dyn FileSystem + Send + Sync>) -> anyhow::Result<Self> {
432 let contents = fs.read_file(path)?;
433
434 // Use unified encoding/binary detection
435 let (encoding, is_binary) = format::detect_encoding_or_binary(&contents, false);
436
437 // For binary files, skip encoding conversion to preserve raw bytes
438 let mut buffer = if is_binary {
439 Self::from_bytes_raw(contents, fs)
440 } else {
441 // from_bytes handles encoding detection/conversion and line ending detection
442 Self::from_bytes(contents, fs)
443 };
444 buffer.persistence.set_file_path(path.to_path_buf());
445 buffer.persistence.clear_modified();
446 buffer.file_kind.set_large_file(false);
447 buffer.file_kind.set_binary(is_binary);
448 // For binary files, ensure encoding matches detection
449 if is_binary {
450 buffer.format.set_default_encoding(encoding);
451 }
452 // Note: line_ending and encoding are already set by from_bytes/from_bytes_raw
453 Ok(buffer)
454 }
455
456 /// Check if loading a large file requires user confirmation due to encoding.
457 ///
458 /// Some encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot be "resynchronized" -
459 /// meaning you cannot determine character boundaries when jumping into the middle
460 /// of a file. These encodings require loading the entire file into memory.
461 ///
462 /// Returns `Some(confirmation)` if user confirmation is needed, `None` if the file
463 /// can be loaded with lazy/streaming loading.
464 pub fn check_large_file_encoding(
465 path: impl AsRef<Path>,
466 fs: Arc<dyn FileSystem + Send + Sync>,
467 ) -> anyhow::Result<Option<LargeFileEncodingConfirmation>> {
468 let path = path.as_ref();
469 let metadata = fs.metadata(path)?;
470 let file_size = metadata.size as usize;
471
472 // Only check for large files
473 if file_size < DEFAULT_LARGE_FILE_THRESHOLD {
474 return Ok(None);
475 }
476
477 // Read a sample to detect encoding
478 let sample_size = file_size.min(8 * 1024);
479 let sample = fs.read_range(path, 0, sample_size)?;
480 let (encoding, is_binary) =
481 format::detect_encoding_or_binary(&sample, file_size > sample_size);
482
483 // Binary files don't need confirmation (loaded as-is)
484 if is_binary {
485 return Ok(None);
486 }
487
488 // Check if the encoding requires full file loading
489 if encoding.requires_full_file_load() {
490 return Ok(Some(LargeFileEncodingConfirmation {
491 path: path.to_path_buf(),
492 file_size,
493 encoding,
494 }));
495 }
496
497 Ok(None)
498 }
499
500 /// Load a large file with unloaded buffer (no line indexing, lazy loading)
501 ///
502 /// If `force_full_load` is true, loads the entire file regardless of encoding.
503 /// This should be set to true after user confirms loading a non-resynchronizable encoding.
504 fn load_large_file(
505 path: &Path,
506 file_size: usize,
507 fs: Arc<dyn FileSystem + Send + Sync>,
508 ) -> anyhow::Result<Self> {
509 Self::load_large_file_internal(path, file_size, fs, false)
510 }
511
512 /// Load a large file, optionally forcing full load for non-resynchronizable encodings.
513 ///
514 /// Called with `force_full_load=true` after user confirms the warning about
515 /// non-resynchronizable encodings requiring full file loading.
516 pub fn load_large_file_confirmed(
517 path: impl AsRef<Path>,
518 fs: Arc<dyn FileSystem + Send + Sync>,
519 ) -> anyhow::Result<Self> {
520 let path = path.as_ref();
521 let metadata = fs.metadata(path)?;
522 let file_size = metadata.size as usize;
523 Self::load_large_file_internal(path, file_size, fs, true)
524 }
525
526 /// Internal implementation for loading large files.
527 fn load_large_file_internal(
528 path: &Path,
529 file_size: usize,
530 fs: Arc<dyn FileSystem + Send + Sync>,
531 force_full_load: bool,
532 ) -> anyhow::Result<Self> {
533 use crate::model::piece_tree::{BufferData, BufferLocation};
534
535 // Read a sample of the file to detect encoding and whether it's binary
536 // We read the first 8KB for detection
537 let sample_size = file_size.min(8 * 1024);
538 let sample = fs.read_range(path, 0, sample_size)?;
539
540 // Use unified encoding/binary detection
541 let (encoding, is_binary) =
542 format::detect_encoding_or_binary(&sample, file_size > sample_size);
543
544 // Binary files skip encoding conversion to preserve raw bytes
545 if is_binary {
546 tracing::info!("Large binary file detected, loading without encoding conversion");
547 let contents = fs.read_file(path)?;
548 let mut buffer = Self::from_bytes_raw(contents, fs);
549 buffer.persistence.set_file_path(path.to_path_buf());
550 buffer.persistence.clear_modified();
551 buffer.file_kind.set_large_file(true);
552 buffer.format.set_default_encoding(encoding);
553 return Ok(buffer);
554 }
555
556 // Check if encoding requires full file loading
557 let requires_full_load = encoding.requires_full_file_load();
558
559 // For non-resynchronizable encodings, require confirmation unless forced
560 if requires_full_load && !force_full_load {
561 anyhow::bail!(LargeFileEncodingConfirmation {
562 path: path.to_path_buf(),
563 file_size,
564 encoding,
565 });
566 }
567
568 // For encodings that require full load (non-resynchronizable or non-UTF-8),
569 // load the entire file and convert
570 if !matches!(encoding, Encoding::Utf8 | Encoding::Ascii) {
571 tracing::info!(
572 "Large file with non-UTF-8 encoding ({:?}), loading fully for conversion",
573 encoding
574 );
575 let contents = fs.read_file(path)?;
576 let mut buffer = Self::from_bytes(contents, fs);
577 buffer.persistence.set_file_path(path.to_path_buf());
578 buffer.persistence.clear_modified();
579 buffer.file_kind.set_large_file(true); // Still mark as large file for UI purposes
580 buffer.file_kind.set_binary(is_binary);
581 return Ok(buffer);
582 }
583
584 // UTF-8/ASCII files can use lazy loading
585 let line_ending = format::detect_line_ending(&sample);
586
587 // Create an unloaded buffer that references the entire file
588 let buffer = StringBuffer {
589 id: 0,
590 data: BufferData::Unloaded {
591 file_path: path.to_path_buf(),
592 file_offset: 0,
593 bytes: file_size,
594 },
595 stored_file_offset: None,
596 };
597
598 // Create piece tree with a single piece covering the whole file
599 // No line feed count (None) since we're not computing line indexing
600 let piece_tree = if file_size > 0 {
601 PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
602 } else {
603 PieceTree::empty()
604 };
605 let saved_root = piece_tree.root();
606
607 tracing::debug!(
608 "Buffer::load_from_file: loaded {} bytes, saved_file_size={}",
609 file_size,
610 file_size
611 );
612
613 Ok(TextBuffer {
614 piece_tree,
615 buffers: vec![buffer],
616 next_buffer_id: 1,
617 persistence: Persistence::new(
618 fs,
619 Some(path.to_path_buf()),
620 saved_root,
621 Some(file_size),
622 ),
623 file_kind: BufferFileKind::new(true, is_binary),
624 format: BufferFormat::new(line_ending, encoding),
625 version: 0,
626 config: BufferConfig::default(),
627 })
628 }
629
630 /// Save the buffer to its associated file
631 pub fn save(&mut self) -> anyhow::Result<()> {
632 if let Some(path) = self.persistence.file_path_owned() {
633 self.save_to_file(path)
634 } else {
635 anyhow::bail!(io::Error::new(
636 io::ErrorKind::NotFound,
637 "No file path associated with buffer",
638 ))
639 }
640 }
641
/// Builds the write recipe for saving this buffer (test-only helper).
///
/// Thin wrapper that hands the buffer's state to `save::build_write_recipe`.
#[cfg(test)]
pub(crate) fn build_write_recipe(&self) -> io::Result<WriteRecipe> {
    let Self {
        piece_tree,
        buffers,
        format,
        file_kind,
        persistence,
        ..
    } = self;
    save::build_write_recipe(piece_tree, buffers, format, file_kind, persistence)
}
655
656 /// Save the buffer to a specific file
657 ///
658 /// Uses the write recipe approach for both local and remote filesystems:
659 /// - Copy ops reference unchanged regions in the source file
660 /// - Insert ops contain new/modified data
661 ///
662 /// For remote filesystems, the recipe is sent to the agent which reconstructs
663 /// the file server-side, avoiding transfer of unchanged content.
664 ///
665 /// For local filesystems with ownership concerns (file owned by another user),
666 /// uses in-place writing to preserve ownership. Otherwise uses atomic writes.
667 ///
668 /// If the line ending format has been changed (via set_line_ending), all content
669 /// will be converted to the new format during save.
670 pub fn save_to_file<P: AsRef<Path>>(&mut self, path: P) -> anyhow::Result<()> {
671 let dest_path = path.as_ref();
672 let total = self.total_bytes();
673
674 // Handle empty files
675 if total == 0 {
676 self.persistence.fs().write_file(dest_path, &[])?;
677 self.finalize_save(dest_path)?;
678 return Ok(());
679 }
680
681 // Build the write recipe (unified for all filesystem types)
682 let recipe = save::build_write_recipe(
683 &self.piece_tree,
684 &self.buffers,
685 &self.format,
686 &self.file_kind,
687 &self.persistence,
688 )?;
689 let ops = recipe.to_write_ops();
690
691 // Check if we need in-place writing to preserve file ownership (local only)
692 // Remote filesystems handle this differently
693 let fs = self.persistence.fs();
694 let is_local = fs.remote_connection_info().is_none();
695 let use_inplace = is_local && save::should_use_inplace_write(fs, dest_path);
696
697 if use_inplace {
698 // In-place write: write directly to preserve ownership
699 save::save_with_inplace_write(fs, dest_path, &recipe)?;
700 } else if !recipe.has_copy_ops() && !is_local {
701 // Remote with no Copy ops: use write_file directly (more efficient)
702 let data = recipe.flatten_inserts();
703 fs.write_file(dest_path, &data)?;
704 } else if is_local {
705 // Local: use write_file or write_patched with sudo fallback
706 let write_result = if !recipe.has_copy_ops() {
707 let data = recipe.flatten_inserts();
708 fs.write_file(dest_path, &data)
709 } else {
710 let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
711 fs.write_patched(src_for_patch, dest_path, &ops)
712 };
713
714 if let Err(e) = write_result {
715 if e.kind() == io::ErrorKind::PermissionDenied {
716 // Create temp file and return sudo error
717 let original_metadata = fs.metadata_if_exists(dest_path);
718 let (temp_path, mut temp_file) = save::create_temp_file(fs, dest_path)?;
719 save::write_recipe_to_file(fs, &mut temp_file, &recipe)?;
720 temp_file.sync_all()?;
721 drop(temp_file);
722 return Err(save::make_sudo_error(
723 temp_path,
724 dest_path,
725 original_metadata,
726 ));
727 }
728 return Err(e.into());
729 }
730 } else {
731 // Remote with Copy ops: use write_patched
732 let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
733 fs.write_patched(src_for_patch, dest_path, &ops)?;
734 }
735
736 self.finalize_save(dest_path)?;
737 Ok(())
738 }
739
740 /// Finalize save state after successful write.
741 fn finalize_save(&mut self, dest_path: &Path) -> anyhow::Result<()> {
742 let new_size = self.persistence.fs().metadata(dest_path)?.size as usize;
743 tracing::debug!(
744 "Buffer::save: updating saved_file_size from {:?} to {}",
745 self.persistence.saved_file_size(),
746 new_size
747 );
748 self.persistence.set_saved_file_size(Some(new_size));
749 self.persistence.set_file_path(dest_path.to_path_buf());
750
751 // Consolidate the piece tree to synchronize with disk (for large files)
752 // or to simplify structure (for small files).
753 self.consolidate_after_save(dest_path, new_size);
754
755 self.mark_saved_snapshot();
756 self.format.promote_current_to_original();
757 Ok(())
758 }
759
760 /// Finalize buffer state after an external save operation (e.g., via sudo).
761 ///
762 /// This updates the saved snapshot and file size to match the new state on disk.
763 pub fn finalize_external_save(&mut self, dest_path: PathBuf) -> anyhow::Result<()> {
764 let new_size = self.persistence.fs().metadata(&dest_path)?.size as usize;
765 self.persistence.set_saved_file_size(Some(new_size));
766 self.persistence.set_file_path(dest_path.clone());
767
768 // Consolidate the piece tree to synchronize with disk or simplify structure.
769 self.consolidate_after_save(&dest_path, new_size);
770
771 self.mark_saved_snapshot();
772 self.format.promote_current_to_original();
773 Ok(())
774 }
775
776 /// Consolidate the piece tree into a single piece.
777 /// For large files, this creates a reference to the disk file to save memory and sync offsets.
778 /// For small files, this flattens all edits into a single in-memory buffer.
779 fn consolidate_after_save(&mut self, path: &Path, file_size: usize) {
780 if self.file_kind.is_large_file() {
781 self.consolidate_large_file(path, file_size);
782 } else {
783 self.consolidate_small_file();
784 }
785 }
786
787 /// Consolidate large file piece tree into a single piece pointing to the new file.
788 /// This ensures that subsequent operations correctly reference the new content and offsets.
789 /// Preserves total line feed count from the old tree if a scan was previously done.
790 fn consolidate_large_file(&mut self, path: &Path, file_size: usize) {
791 // Preserve line feed count from the old tree if we had scanned it
792 let preserved_lf = if self.file_kind.has_line_feed_scan() {
793 self.piece_tree.line_count().map(|c| c.saturating_sub(1))
794 } else {
795 None
796 };
797
798 let buffer = StringBuffer {
799 id: 0,
800 data: BufferData::Unloaded {
801 file_path: path.to_path_buf(),
802 file_offset: 0,
803 bytes: file_size,
804 },
805 stored_file_offset: None,
806 };
807
808 self.piece_tree = if file_size > 0 {
809 PieceTree::new(BufferLocation::Stored(0), 0, file_size, preserved_lf)
810 } else {
811 PieceTree::empty()
812 };
813
814 self.buffers = vec![buffer];
815 self.next_buffer_id = 1;
816
817 tracing::debug!(
818 "Buffer::consolidate_large_file: consolidated into single piece of {} bytes",
819 file_size
820 );
821 }
822
823 /// Consolidate small file edits into a single in-memory buffer and re-index lines.
824 fn consolidate_small_file(&mut self) {
825 if let Some(bytes) = self.get_all_text() {
826 let line_feed_cnt = bytes.iter().filter(|&&b| b == b'\n').count();
827 let len = bytes.len();
828
829 // Create a single loaded buffer with line indexing
830 let buffer = StringBuffer::new_loaded(0, bytes, true);
831
832 self.piece_tree = if len > 0 {
833 PieceTree::new(BufferLocation::Stored(0), 0, len, Some(line_feed_cnt))
834 } else {
835 PieceTree::empty()
836 };
837
838 self.buffers = vec![buffer];
839 self.next_buffer_id = 1;
840
841 tracing::debug!(
842 "Buffer::consolidate_small_file: consolidated into single loaded buffer of {} bytes",
843 len
844 );
845 }
846 }
847
848 /// Get the total number of bytes in the document
849 pub fn total_bytes(&self) -> usize {
850 self.piece_tree.total_bytes()
851 }
852
853 /// Get the total number of lines in the document
854 /// Uses the piece tree's integrated line tracking
855 /// Returns None if line count is unknown (e.g., for large files without line indexing)
856 pub fn line_count(&self) -> Option<usize> {
857 self.piece_tree.line_count()
858 }
859
860 /// Snapshot the current tree as the saved baseline
861 pub fn mark_saved_snapshot(&mut self) {
862 self.persistence.mark_saved_snapshot(&self.piece_tree);
863 }
864
865 /// Refresh the saved root to match the current tree structure without
866 /// clearing the modified flag. Call this after structural-only changes
867 /// (e.g. chunk_split_and_load during search scan) so that
868 /// `diff_since_saved()` can take the fast `Arc::ptr_eq` path.
869 pub fn refresh_saved_root_if_unmodified(&mut self) {
870 self.persistence
871 .refresh_saved_root_if_unmodified(&self.piece_tree);
872 }
873
874 /// Diff the current piece tree against the last saved snapshot.
875 ///
876 /// See `Persistence::diff_since_saved` for the algorithm.
877 pub fn diff_since_saved(&self) -> PieceTreeDiff {
878 let _span = tracing::info_span!(
879 "diff_since_saved",
880 large_file = self.file_kind.is_large_file(),
881 modified = self.persistence.is_modified(),
882 lf_scanned = self.file_kind.has_line_feed_scan()
883 )
884 .entered();
885
886 self.persistence
887 .diff_since_saved(&self.piece_tree, &self.buffers)
888 }
889
890 /// Convert a byte offset to a line/column position
891 pub fn offset_to_position(&self, offset: usize) -> Option<Position> {
892 self.piece_tree
893 .offset_to_position(offset, &self.buffers)
894 .map(|(line, column)| Position { line, column })
895 }
896
897 /// Convert a line/column position to a byte offset
898 pub fn position_to_offset(&self, position: Position) -> usize {
899 self.piece_tree
900 .position_to_offset(position.line, position.column, &self.buffers)
901 }
902
903 /// Insert text at the given byte offset
904 pub fn insert_bytes(&mut self, offset: usize, text: Vec<u8>) -> Cursor {
905 if text.is_empty() {
906 return self.piece_tree.cursor_at_offset(offset);
907 }
908
909 // Mark as modified (updates version)
910 self.mark_content_modified();
911
912 // Count line feeds in the text to insert
913 let line_feed_cnt = Some(text.iter().filter(|&&b| b == b'\n').count());
914
915 // Optimization: try to append to existing buffer if insertion is at piece boundary
916 let (buffer_location, buffer_offset, text_len) =
917 if let Some(append_info) = self.try_append_to_existing_buffer(offset, &text) {
918 append_info
919 } else {
920 // Create a new StringBuffer for this insertion
921 let buffer_id = self.next_buffer_id;
922 self.next_buffer_id += 1;
923 let buffer = StringBuffer::new(buffer_id, text.clone());
924 self.buffers.push(buffer);
925 (BufferLocation::Added(buffer_id), 0, text.len())
926 };
927
928 // When line feeds have been scanned, ensure the chunk at the insertion
929 // point is loaded so compute_line_feeds_static can recount during splits.
930 if self.file_kind.has_line_feed_scan() {
931 self.ensure_chunk_loaded_at(offset);
932 }
933
934 // Update piece tree (need to pass buffers reference)
935 self.piece_tree.insert(
936 offset,
937 buffer_location,
938 buffer_offset,
939 text_len,
940 line_feed_cnt,
941 &self.buffers,
942 )
943 }
944
945 /// Try to append to an existing buffer if insertion point aligns with buffer end
946 /// Returns (BufferLocation, buffer_offset, text_len) if append succeeds, None otherwise
947 fn try_append_to_existing_buffer(
948 &mut self,
949 offset: usize,
950 text: &[u8],
951 ) -> Option<(BufferLocation, usize, usize)> {
952 // Only optimize for non-empty insertions after existing content
953 if text.is_empty() || offset == 0 {
954 return None;
955 }
956
957 // Find the piece containing the byte just before the insertion point
958 // This avoids the saturating_sub issue
959 let piece_info = self.piece_tree.find_by_offset(offset - 1)?;
960
961 // Check if insertion is exactly at the end of this piece
962 // offset_in_piece tells us where (offset-1) is within the piece
963 // For insertion to be at piece end, (offset-1) must be the last byte
964 let offset_in_piece = piece_info.offset_in_piece?;
965 if offset_in_piece + 1 != piece_info.bytes {
966 return None; // Not at the end of the piece
967 }
968
969 // Only append to "Added" buffers (not original Stored buffers)
970 if !matches!(piece_info.location, BufferLocation::Added(_)) {
971 return None;
972 }
973
974 let buffer_id = piece_info.location.buffer_id();
975 let buffer = self.buffers.get_mut(buffer_id)?;
976
977 // Check if buffer is loaded
978 let buffer_len = buffer.get_data()?.len();
979
980 // Check if this piece ends exactly at the end of its buffer
981 if piece_info.offset + piece_info.bytes != buffer_len {
982 return None;
983 }
984
985 // Perfect! Append to this buffer
986 let append_offset = buffer.append(text);
987
988 Some((piece_info.location, append_offset, text.len()))
989 }
990
991 /// Insert text (from &str) at the given byte offset
992 pub fn insert(&mut self, offset: usize, text: &str) {
993 self.insert_bytes(offset, text.as_bytes().to_vec());
994 }
995
996 /// Insert text at a line/column position
997 /// This now uses the optimized piece_tree.insert_at_position() for a single traversal
998 pub fn insert_at_position(&mut self, position: Position, text: Vec<u8>) -> Cursor {
999 if text.is_empty() {
1000 let offset = self.position_to_offset(position);
1001 return self.piece_tree.cursor_at_offset(offset);
1002 }
1003
1004 self.mark_content_modified();
1005
1006 // Count line feeds in the text to insert
1007 let line_feed_cnt = text.iter().filter(|&&b| b == b'\n').count();
1008
1009 // Create a new StringBuffer for this insertion
1010 let buffer_id = self.next_buffer_id;
1011 self.next_buffer_id += 1;
1012 let buffer = StringBuffer::new(buffer_id, text.clone());
1013 self.buffers.push(buffer);
1014
1015 // Use the optimized position-based insertion (single traversal)
1016 self.piece_tree.insert_at_position(
1017 position.line,
1018 position.column,
1019 BufferLocation::Added(buffer_id),
1020 0,
1021 text.len(),
1022 line_feed_cnt,
1023 &self.buffers,
1024 )
1025 }
1026
1027 /// Delete text starting at the given byte offset
1028 pub fn delete_bytes(&mut self, offset: usize, bytes: usize) {
1029 if bytes == 0 || offset >= self.total_bytes() {
1030 return;
1031 }
1032
1033 // When line feeds have been scanned, ensure chunks at delete boundaries
1034 // are loaded so compute_line_feeds_static can recount during splits.
1035 if self.file_kind.has_line_feed_scan() {
1036 self.ensure_chunk_loaded_at(offset);
1037 let end = (offset + bytes).min(self.total_bytes());
1038 if end > offset {
1039 self.ensure_chunk_loaded_at(end.saturating_sub(1));
1040 }
1041 }
1042
1043 // Update piece tree
1044 self.piece_tree.delete(offset, bytes, &self.buffers);
1045
1046 self.mark_content_modified();
1047 }
1048
1049 /// Delete text in a range
1050 pub fn delete(&mut self, range: Range<usize>) {
1051 if range.end > range.start {
1052 self.delete_bytes(range.start, range.end - range.start);
1053 }
1054 }
1055
1056 /// Delete text in a line/column range
1057 /// This now uses the optimized piece_tree.delete_position_range() for a single traversal
1058 pub fn delete_range(&mut self, start: Position, end: Position) {
1059 // Use the optimized position-based deletion
1060 self.piece_tree.delete_position_range(
1061 start.line,
1062 start.column,
1063 end.line,
1064 end.column,
1065 &self.buffers,
1066 );
1067 self.mark_content_modified();
1068 }
1069
1070 /// Replace the entire buffer content with new content
1071 /// This is an O(n) operation that rebuilds the piece tree in a single pass,
1072 /// avoiding the O(n²) complexity of applying individual edits.
1073 ///
1074 /// This is used for bulk operations like "replace all" where applying
1075 /// individual edits would be prohibitively slow.
1076 pub fn replace_content(&mut self, new_content: &str) {
1077 let bytes = new_content.len();
1078 let content_bytes = new_content.as_bytes().to_vec();
1079
1080 // Count line feeds in the new content
1081 let line_feed_cnt = content_bytes.iter().filter(|&&b| b == b'\n').count();
1082
1083 // Create a new StringBuffer for the new content
1084 let buffer_id = self.next_buffer_id;
1085 self.next_buffer_id += 1;
1086 let buffer = StringBuffer::new(buffer_id, content_bytes);
1087 self.buffers.push(buffer);
1088
1089 // Rebuild the piece tree with a single piece containing all the new content
1090 if bytes > 0 {
1091 self.piece_tree = PieceTree::new(
1092 BufferLocation::Added(buffer_id),
1093 0,
1094 bytes,
1095 Some(line_feed_cnt),
1096 );
1097 } else {
1098 self.piece_tree = PieceTree::empty();
1099 }
1100
1101 self.mark_content_modified();
1102 }
1103
1104 /// Restore a previously saved buffer state (for undo/redo of BulkEdit).
1105 ///
1106 /// This restores the piece tree AND the buffers list, which is critical
1107 /// because consolidate_after_save() replaces self.buffers. Without restoring
1108 /// buffers, the piece tree would reference buffer IDs that no longer exist.
1109 pub fn restore_buffer_state(&mut self, snapshot: &BufferSnapshot) {
1110 self.piece_tree = snapshot.piece_tree.clone();
1111 self.buffers = snapshot.buffers.clone();
1112 self.next_buffer_id = snapshot.next_buffer_id;
1113 self.mark_content_modified();
1114 }
1115
1116 /// Snapshot the current buffer state (piece tree + buffers) for BulkEdit undo/redo.
1117 ///
1118 /// The snapshot includes buffers because consolidate_after_save() can replace
1119 /// self.buffers between the snapshot and restore, which would otherwise cause
1120 /// the restored piece tree to reference nonexistent buffer IDs.
1121 pub fn snapshot_buffer_state(&self) -> Arc<BufferSnapshot> {
1122 Arc::new(BufferSnapshot {
1123 piece_tree: self.piece_tree.clone(),
1124 buffers: self.buffers.clone(),
1125 next_buffer_id: self.next_buffer_id,
1126 })
1127 }
1128
1129 /// Apply bulk edits efficiently in a single pass
1130 /// Returns the net change in bytes
1131 pub fn apply_bulk_edits(&mut self, edits: &[(usize, usize, &str)]) -> isize {
1132 // Pre-allocate buffers for all insert texts (only non-empty texts)
1133 // This avoids the borrow conflict in the closure
1134 // IMPORTANT: Only add entries for non-empty texts because the closure
1135 // is only called for edits with non-empty insert text
1136 let mut buffer_info: Vec<(BufferLocation, usize, usize, Option<usize>)> = Vec::new();
1137
1138 for (_, _, text) in edits {
1139 if !text.is_empty() {
1140 let buffer_id = self.next_buffer_id;
1141 self.next_buffer_id += 1;
1142 let content = text.as_bytes().to_vec();
1143 let lf_cnt = content.iter().filter(|&&b| b == b'\n').count();
1144 let bytes = content.len();
1145 let buffer = StringBuffer::new(buffer_id, content);
1146 self.buffers.push(buffer);
1147 buffer_info.push((BufferLocation::Added(buffer_id), 0, bytes, Some(lf_cnt)));
1148 }
1149 // No placeholder for empty texts - the closure is only called for non-empty texts
1150 }
1151
1152 // Now call apply_bulk_edits with a simple index-based closure
1153 let mut idx = 0;
1154 let delta = self
1155 .piece_tree
1156 .apply_bulk_edits(edits, &self.buffers, |_text| {
1157 let info = buffer_info[idx];
1158 idx += 1;
1159 info
1160 });
1161
1162 self.mark_content_modified();
1163 delta
1164 }
1165
1166 /// Get text from a byte offset range
1167 /// This now uses the optimized piece_tree.iter_pieces_in_range() for a single traversal
1168 /// Get text from a byte offset range (read-only)
1169 /// Returns None if any buffer in the range is unloaded
1170 /// PRIVATE: External code should use get_text_range_mut() which handles lazy loading
1171 fn get_text_range(&self, offset: usize, bytes: usize) -> Option<Vec<u8>> {
1172 if bytes == 0 {
1173 return Some(Vec::new());
1174 }
1175
1176 let mut result = Vec::with_capacity(bytes);
1177 let end_offset = offset + bytes;
1178 let mut collected = 0;
1179
1180 // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1181 for piece_view in self.piece_tree.iter_pieces_in_range(offset, end_offset) {
1182 let buffer_id = piece_view.location.buffer_id();
1183 if let Some(buffer) = self.buffers.get(buffer_id) {
1184 // Calculate the range to read from this piece
1185 let piece_start_in_doc = piece_view.doc_offset;
1186 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1187
1188 // Clip to the requested range
1189 let read_start = offset.max(piece_start_in_doc);
1190 let read_end = end_offset.min(piece_end_in_doc);
1191
1192 if read_end > read_start {
1193 let offset_in_piece = read_start - piece_start_in_doc;
1194 let bytes_to_read = read_end - read_start;
1195
1196 let buffer_start = piece_view.buffer_offset + offset_in_piece;
1197 let buffer_end = buffer_start + bytes_to_read;
1198
1199 // Return None if buffer is unloaded (type-safe)
1200 let data = buffer.get_data()?;
1201
1202 if buffer_end <= data.len() {
1203 result.extend_from_slice(&data[buffer_start..buffer_end]);
1204 collected += bytes_to_read;
1205
1206 if collected >= bytes {
1207 break;
1208 }
1209 }
1210 }
1211 }
1212 }
1213
1214 Some(result)
1215 }
1216
1217 /// Get text from a byte offset range with lazy loading
1218 /// This will load unloaded chunks on-demand and always returns complete data
1219 ///
1220 /// Returns an error if loading fails or if data cannot be read for any reason.
1221 ///
1222 /// NOTE: Currently loads entire buffers on-demand. Future optimization would split
1223 /// large pieces and load only LOAD_CHUNK_SIZE chunks at a time.
1224 pub fn get_text_range_mut(&mut self, offset: usize, bytes: usize) -> Result<Vec<u8>> {
1225 let _span = tracing::info_span!("get_text_range_mut", offset, bytes).entered();
1226 if bytes == 0 {
1227 return Ok(Vec::new());
1228 }
1229
1230 let mut result = Vec::with_capacity(bytes);
1231 // Clamp end_offset to buffer length to handle reads beyond EOF
1232 let end_offset = (offset + bytes).min(self.len());
1233 let mut current_offset = offset;
1234 let mut iteration_count = 0u32;
1235
1236 // Keep iterating until we've collected all requested bytes
1237 while current_offset < end_offset {
1238 iteration_count += 1;
1239 let mut made_progress = false;
1240 let mut restarted_iteration = false;
1241
1242 // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1243 for piece_view in self
1244 .piece_tree
1245 .iter_pieces_in_range(current_offset, end_offset)
1246 {
1247 let buffer_id = piece_view.location.buffer_id();
1248
1249 // Check if buffer needs loading
1250 let needs_loading = self
1251 .buffers
1252 .get(buffer_id)
1253 .map(|b| !b.is_loaded())
1254 .unwrap_or(false);
1255
1256 if needs_loading && self.chunk_split_and_load(&piece_view, current_offset)? {
1257 restarted_iteration = true;
1258 break;
1259 }
1260
1261 // Calculate the range to read from this piece
1262 let piece_start_in_doc = piece_view.doc_offset;
1263 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1264
1265 // Clip to the requested range
1266 let read_start = current_offset.max(piece_start_in_doc);
1267 let read_end = end_offset.min(piece_end_in_doc);
1268
1269 if read_end > read_start {
1270 let offset_in_piece = read_start - piece_start_in_doc;
1271 let bytes_to_read = read_end - read_start;
1272
1273 let buffer_start = piece_view.buffer_offset + offset_in_piece;
1274 let buffer_end = buffer_start + bytes_to_read;
1275
1276 // Buffer should be loaded now
1277 let buffer = self.buffers.get(buffer_id).context("Buffer not found")?;
1278 let data = buffer
1279 .get_data()
1280 .context("Buffer data unavailable after load")?;
1281
1282 anyhow::ensure!(
1283 buffer_end <= data.len(),
1284 "Buffer range out of bounds: requested {}..{}, buffer size {}",
1285 buffer_start,
1286 buffer_end,
1287 data.len()
1288 );
1289
1290 result.extend_from_slice(&data[buffer_start..buffer_end]);
1291 current_offset = read_end;
1292 made_progress = true;
1293 }
1294 }
1295
1296 // If we didn't make progress and didn't restart iteration, this is an error
1297 if !made_progress && !restarted_iteration {
1298 tracing::error!(
1299 "get_text_range_mut: No progress at offset {} (requested range: {}..{}, buffer len: {})",
1300 current_offset,
1301 offset,
1302 end_offset,
1303 self.len()
1304 );
1305 tracing::error!(
1306 "Piece tree stats: {} total bytes",
1307 self.piece_tree.stats().total_bytes
1308 );
1309 anyhow::bail!(
1310 "Failed to read data at offset {}: no progress made (requested {}..{}, buffer len: {})",
1311 current_offset,
1312 offset,
1313 end_offset,
1314 self.len()
1315 );
1316 }
1317 }
1318
1319 if iteration_count > 1 {
1320 tracing::info!(
1321 iteration_count,
1322 result_len = result.len(),
1323 "get_text_range_mut: completed with multiple iterations"
1324 );
1325 }
1326
1327 Ok(result)
1328 }
1329
1330 /// Prepare a viewport for rendering
1331 ///
1332 /// This is called before rendering with &mut access to pre-load all data
1333 /// that will be needed for the viewport. It estimates the number of bytes
1334 /// needed based on the line count and pre-loads them.
1335 ///
1336 /// # Arguments
1337 /// * `start_offset` - The byte offset where the viewport starts
1338 /// * `line_count` - The number of lines to prepare (estimate)
1339 ///
1340 /// # Returns
1341 /// Ok(()) if preparation succeeded, Err if loading failed
1342 pub fn prepare_viewport(&mut self, start_offset: usize, line_count: usize) -> Result<()> {
1343 let _span = tracing::info_span!("prepare_viewport", start_offset, line_count).entered();
1344 // Estimate how many bytes we need (pessimistic assumption)
1345 // Average line length is typically 80-100 bytes, but we use 200 to be safe
1346 let estimated_bytes = line_count.saturating_mul(200);
1347
1348 // Cap the estimate at the remaining bytes in the document
1349 let remaining_bytes = self.total_bytes().saturating_sub(start_offset);
1350 let bytes_to_load = estimated_bytes.min(remaining_bytes);
1351 tracing::trace!(
1352 bytes_to_load,
1353 total_bytes = self.total_bytes(),
1354 "prepare_viewport loading"
1355 );
1356
1357 // Pre-load with full chunk-splitting support
1358 // This may load more than we need, but ensures all data is available
1359 self.get_text_range_mut(start_offset, bytes_to_load)?;
1360
1361 Ok(())
1362 }
1363
1364 /// Split a piece that references a large unloaded buffer, create a chunk
1365 /// buffer for the region around `current_offset`, and load it.
1366 ///
1367 /// Returns `true` if the piece tree was modified (caller must restart its
1368 /// iteration), `false` if the piece was small enough to load in-place.
1369 fn chunk_split_and_load(
1370 &mut self,
1371 piece_view: &PieceView,
1372 current_offset: usize,
1373 ) -> Result<bool> {
1374 let buffer_id = piece_view.location.buffer_id();
1375
1376 // The underlying buffer may be much larger than this piece (e.g. the
1377 // whole-file Stored buffer after rebuild_with_pristine_saved_root).
1378 // We must chunk-split if either the piece or its buffer exceeds
1379 // LOAD_CHUNK_SIZE, because `load()` loads the entire buffer.
1380 let buffer_bytes = self
1381 .buffers
1382 .get(buffer_id)
1383 .and_then(|b| b.unloaded_bytes())
1384 .unwrap_or(0);
1385 let needs_chunk_split =
1386 piece_view.bytes > LOAD_CHUNK_SIZE || buffer_bytes > piece_view.bytes;
1387
1388 tracing::info!(
1389 buffer_id,
1390 piece_bytes = piece_view.bytes,
1391 buffer_bytes,
1392 needs_chunk_split,
1393 piece_doc_offset = piece_view.doc_offset,
1394 current_offset,
1395 "chunk_split_and_load: loading unloaded piece"
1396 );
1397
1398 if !needs_chunk_split {
1399 // Piece is small enough and its buffer matches — load in-place.
1400 let _span = tracing::info_span!(
1401 "load_small_buffer",
1402 piece_bytes = piece_view.bytes,
1403 buffer_id,
1404 )
1405 .entered();
1406 self.buffers
1407 .get_mut(buffer_id)
1408 .context("Buffer not found")?
1409 .load(&**self.persistence.fs())
1410 .context("Failed to load buffer")?;
1411 return Ok(false);
1412 }
1413
1414 let _span = tracing::info_span!(
1415 "chunk_split_and_load",
1416 piece_bytes = piece_view.bytes,
1417 buffer_id,
1418 )
1419 .entered();
1420
1421 let piece_start_in_doc = piece_view.doc_offset;
1422 let offset_in_piece = current_offset.saturating_sub(piece_start_in_doc);
1423
1424 // When the piece already fits within LOAD_CHUNK_SIZE, create a chunk
1425 // buffer for the exact piece range (no alignment/splitting needed).
1426 // Alignment rounding is only useful when carving a sub-range out of a
1427 // piece larger than LOAD_CHUNK_SIZE.
1428 let (chunk_start_in_buffer, chunk_bytes) = if piece_view.bytes <= LOAD_CHUNK_SIZE {
1429 (piece_view.buffer_offset, piece_view.bytes)
1430 } else {
1431 let start =
1432 (piece_view.buffer_offset + offset_in_piece) / CHUNK_ALIGNMENT * CHUNK_ALIGNMENT;
1433 let bytes = LOAD_CHUNK_SIZE
1434 .min((piece_view.buffer_offset + piece_view.bytes).saturating_sub(start));
1435 (start, bytes)
1436 };
1437
1438 // Calculate document offsets for splitting
1439 let chunk_start_offset_in_piece =
1440 chunk_start_in_buffer.saturating_sub(piece_view.buffer_offset);
1441 let split_start_in_doc = piece_start_in_doc + chunk_start_offset_in_piece;
1442 let split_end_in_doc = split_start_in_doc + chunk_bytes;
1443
1444 // Split the piece to isolate the chunk
1445 if chunk_start_offset_in_piece > 0 {
1446 self.piece_tree
1447 .split_at_offset(split_start_in_doc, &self.buffers);
1448 }
1449 if split_end_in_doc < piece_start_in_doc + piece_view.bytes {
1450 self.piece_tree
1451 .split_at_offset(split_end_in_doc, &self.buffers);
1452 }
1453
1454 // Create a new buffer for this chunk
1455 let chunk_buffer = self
1456 .buffers
1457 .get(buffer_id)
1458 .context("Buffer not found")?
1459 .create_chunk_buffer(self.next_buffer_id, chunk_start_in_buffer, chunk_bytes)
1460 .context("Failed to create chunk buffer")?;
1461
1462 self.next_buffer_id += 1;
1463 let new_buffer_id = chunk_buffer.id;
1464 self.buffers.push(chunk_buffer);
1465
1466 // Update the piece to reference the new chunk buffer
1467 self.piece_tree.replace_buffer_reference(
1468 buffer_id,
1469 piece_view.buffer_offset + chunk_start_offset_in_piece,
1470 chunk_bytes,
1471 BufferLocation::Added(new_buffer_id),
1472 );
1473
1474 // Load the chunk buffer
1475 self.buffers
1476 .get_mut(new_buffer_id)
1477 .context("Chunk buffer not found")?
1478 .load(&**self.persistence.fs())
1479 .context("Failed to load chunk")?;
1480
1481 // split_at_offset uses compute_line_feeds_static which returns None
1482 // for unloaded buffers, destroying the scanned line feed counts.
1483 // Fix up: the loaded chunk is counted from memory, remaining unloaded
1484 // pieces use the filesystem's count_line_feeds_in_range.
1485 if self.file_kind.has_line_feed_scan() {
1486 let leaves = self.piece_tree.get_leaves();
1487 let mut fixups: Vec<(usize, usize)> = Vec::new();
1488 for (idx, leaf) in leaves.iter().enumerate() {
1489 if leaf.line_feed_cnt.is_none() {
1490 if let Ok(count) = self.scan_leaf(leaf) {
1491 fixups.push((idx, count));
1492 }
1493 }
1494 }
1495 if !fixups.is_empty() {
1496 self.piece_tree.update_leaf_line_feeds_path_copy(&fixups);
1497 }
1498 }
1499
1500 // Keep saved_root in sync with viewport-loading tree restructures so
1501 // that diff_since_saved() can match by (location, offset) identity.
1502 //
1503 // When !modified the current tree IS the saved state, so just snapshot.
1504 // When modified, we must apply the same Stored→Added leaf replacement
1505 // to saved_root so the diff doesn't see loaded-but-unedited regions as
1506 // changed.
1507 if !self.persistence.is_modified() {
1508 self.persistence.set_saved_root(self.piece_tree.root());
1509 } else {
1510 self.persistence.apply_chunk_load_to_saved_root(
1511 buffer_id,
1512 chunk_start_in_buffer,
1513 chunk_bytes,
1514 new_buffer_id,
1515 );
1516 }
1517
1518 Ok(true)
1519 }
1520
1521 /// Get all text as a single Vec<u8>
1522 /// Returns None if any buffers are unloaded (lazy loading)
1523 /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1524 pub(crate) fn get_all_text(&self) -> Option<Vec<u8>> {
1525 self.get_text_range(0, self.total_bytes())
1526 }
1527
1528 /// Get all text as a String
1529 /// Returns None if any buffers are unloaded (lazy loading)
1530 /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1531 pub(crate) fn get_all_text_string(&self) -> Option<String> {
1532 self.get_all_text()
1533 .map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
1534 }
1535
1536 /// Get text from a byte range as bytes
1537 /// CRATE-PRIVATE: Returns empty vector if any buffers are unloaded (silently fails!)
1538 /// Only use this when you KNOW the data is loaded (e.g., for syntax highlighting small regions)
1539 /// External code should use get_text_range_mut() or DocumentModel methods
1540 pub(crate) fn slice_bytes(&self, range: Range<usize>) -> Vec<u8> {
1541 self.get_text_range(range.start, range.end.saturating_sub(range.start))
1542 .unwrap_or_default()
1543 }
1544
1545 /// Get all text as a String
1546 /// Returns None if any buffers are unloaded (lazy loading)
1547 pub fn to_string(&self) -> Option<String> {
1548 self.get_all_text_string()
1549 }
1550
1551 /// Get the total number of bytes
1552 pub fn len(&self) -> usize {
1553 self.total_bytes()
1554 }
1555
1556 /// Check if the buffer is empty
1557 pub fn is_empty(&self) -> bool {
1558 self.total_bytes() == 0
1559 }
1560
1561 /// Get the file path associated with this buffer
1562 pub fn file_path(&self) -> Option<&Path> {
1563 self.persistence.file_path()
1564 }
1565
1566 /// Update the file path after a rename operation on disk.
1567 pub fn rename_file_path(&mut self, path: PathBuf) {
1568 self.persistence.set_file_path(path);
1569 }
1570
1571 /// Clear the file path (make buffer unnamed)
1572 /// Note: This does NOT affect Unloaded chunk file_paths used for lazy loading.
1573 /// Those still point to the original source file for chunk loading.
1574 pub fn clear_file_path(&mut self) {
1575 self.persistence.clear_file_path();
1576 }
1577
1578 /// Extend buffer to include more bytes from a streaming source file.
1579 /// Used for stdin streaming where the temp file grows over time, and
1580 /// for plugin streaming via `RefreshBufferFromDisk`.
1581 ///
1582 /// Counts line feeds in the appended region so the new piece carries
1583 /// a real `line_feed_cnt` instead of `None`. Without this, any
1584 /// previously-known line count on the existing pieces propagates to
1585 /// `line_count() = None` (the piece-tree's `total_line_feeds`
1586 /// returns `None` if any piece is unknown), which in turn breaks the
1587 /// visual-row index used by the scrollbar.
1588 ///
1589 /// Falls back to `None` only when the filesystem can't count
1590 /// (errored stat / read). The buffer is still usable then — just
1591 /// without precise line indexing, same as a large file opened
1592 /// without a scan.
1593 pub fn extend_streaming(&mut self, source_path: &Path, new_size: usize) {
1594 let old_size = self.total_bytes();
1595 if new_size <= old_size {
1596 return;
1597 }
1598
1599 let additional_bytes = new_size - old_size;
1600
1601 // Create new Unloaded buffer for the appended region
1602 let buffer_id = self.next_buffer_id;
1603 self.next_buffer_id += 1;
1604
1605 let new_buffer = StringBuffer::new_unloaded(
1606 buffer_id,
1607 source_path.to_path_buf(),
1608 old_size, // file_offset - where this chunk starts in the file
1609 additional_bytes, // bytes - size of this chunk
1610 );
1611 self.buffers.push(new_buffer);
1612
1613 // Count line feeds in the appended region from disk so the
1614 // piece carries a known line count. Counting is cheap — it's a
1615 // streaming scan of `additional_bytes`, no buffer materialisation.
1616 let line_feed_cnt = self
1617 .persistence
1618 .fs()
1619 .count_line_feeds_in_range(source_path, old_size as u64, additional_bytes)
1620 .ok();
1621
1622 // Append piece at end of document (insert at offset == total_bytes)
1623 self.piece_tree.insert(
1624 old_size,
1625 BufferLocation::Stored(buffer_id),
1626 0,
1627 additional_bytes,
1628 line_feed_cnt,
1629 &self.buffers,
1630 );
1631 }
1632
1633 /// Check if the buffer has been modified since last save
1634 pub fn is_modified(&self) -> bool {
1635 self.persistence.is_modified()
1636 }
1637
1638 /// Clear the modified flag (after save)
1639 pub fn clear_modified(&mut self) {
1640 self.persistence.clear_modified();
1641 }
1642
1643 /// Set the modified flag explicitly
1644 /// Used by undo/redo to restore the correct modified state
1645 pub fn set_modified(&mut self, modified: bool) {
1646 self.persistence.set_modified(modified);
1647 }
1648
1649 /// Check if buffer has pending changes for recovery auto-save
1650 pub fn is_recovery_pending(&self) -> bool {
1651 self.persistence.is_recovery_pending()
1652 }
1653
1654 /// Mark buffer as needing recovery auto-save (call after edits)
1655 pub fn set_recovery_pending(&mut self, pending: bool) {
1656 self.persistence.set_recovery_pending(pending);
1657 }
1658
1659 /// Ensure the buffer chunk at the given byte offset is loaded.
1660 ///
1661 /// When `line_feeds_scanned` is true, piece splits during insert/delete need
1662 /// the buffer data to be loaded so `compute_line_feeds_static` can accurately
1663 /// recount line feeds for each half. This method loads the chunk if needed.
1664 fn ensure_chunk_loaded_at(&mut self, offset: usize) {
1665 if let Some(piece_info) = self.piece_tree.find_by_offset(offset) {
1666 let buffer_id = piece_info.location.buffer_id();
1667 if let Some(buffer) = self.buffers.get_mut(buffer_id) {
1668 if !buffer.is_loaded() {
1669 let buf_bytes = buffer.unloaded_bytes().unwrap_or(0);
1670 tracing::info!(
1671 "ensure_chunk_loaded_at: loading buffer {} ({} bytes) for offset {}",
1672 buffer_id,
1673 buf_bytes,
1674 offset
1675 );
1676 if let Err(e) = buffer.load(&**self.persistence.fs()) {
1677 tracing::warn!("Failed to load chunk at offset {offset}: {e}");
1678 }
1679 }
1680 }
1681 }
1682 }
1683
1684 /// Check if this is a large file with lazy loading enabled
1685 pub fn is_large_file(&self) -> bool {
1686 self.file_kind.is_large_file()
1687 }
1688
1689 /// Check if line feeds have been scanned for this large file.
1690 /// When true, `line_count()` returns exact values.
1691 pub fn has_line_feed_scan(&self) -> bool {
1692 self.file_kind.has_line_feed_scan()
1693 }
1694
1695 /// Get the raw piece tree leaves (for storing alongside scan chunks).
1696 pub fn piece_tree_leaves(&self) -> Vec<crate::model::piece_tree::LeafData> {
1697 self.piece_tree.get_leaves()
1698 }
1699
1700 /// Prepare work items for an incremental line scan.
1701 ///
1702 /// First splits any oversized leaves in the piece tree so every leaf is
1703 /// at most `LOAD_CHUNK_SIZE` bytes. Then returns one work item per leaf.
1704 /// After scanning, `get_text_range_mut` will never need to split a scanned
1705 /// leaf (it's already chunk-sized), so line-feed counts are preserved.
1706 ///
1707 /// Returns `(chunks, total_bytes)`.
1708 pub fn prepare_line_scan(&mut self) -> (Vec<LineScanChunk>, usize) {
1709 // Pre-split the tree so every leaf ≤ LOAD_CHUNK_SIZE.
1710 self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1711
1712 let leaves = self.piece_tree.get_leaves();
1713 let total_bytes: usize = leaves.iter().map(|l| l.bytes).sum();
1714 let mut chunks = Vec::new();
1715
1716 for (idx, leaf) in leaves.iter().enumerate() {
1717 chunks.push(LineScanChunk {
1718 leaf_index: idx,
1719 byte_len: leaf.bytes,
1720 already_known: leaf.line_feed_cnt.is_some(),
1721 });
1722 }
1723
1724 (chunks, total_bytes)
1725 }
1726
1727 /// Initialize a chunked search scan over this buffer's piece tree.
1728 ///
1729 /// Used for in-editor Ctrl+F (incremental, yields to the event loop
1730 /// between chunks) and for searching dirty buffers during project grep.
1731 /// For searching files on disk, use `FileSystem::search_file` instead.
1732 pub fn search_scan_init(
1733 &mut self,
1734 regex: regex::bytes::Regex,
1735 max_matches: usize,
1736 query_len: usize,
1737 ) -> ChunkedSearchState {
1738 let (chunks, total_bytes) = self.prepare_line_scan();
1739 ChunkedSearchState {
1740 chunks,
1741 next_chunk: 0,
1742 next_doc_offset: 0,
1743 total_bytes,
1744 scanned_bytes: 0,
1745 regex,
1746 matches: Vec::new(),
1747 overlap_tail: Vec::new(),
1748 overlap_doc_offset: 0,
1749 max_matches,
1750 capped: false,
1751 query_len,
1752 running_line: 1,
1753 }
1754 }
1755
1756 /// Process one chunk of a chunked search scan.
1757 ///
1758 /// Loads the next chunk via `get_text_range_mut`, prepends overlap from
1759 /// the previous chunk, runs the regex, and appends matches to `state`
1760 /// with line/column/context computed on the fly from the loaded bytes.
1761 ///
1762 /// Line numbers are tracked incrementally via `running_line` — each
1763 /// chunk counts newlines in its non-overlap portion to advance the
1764 /// counter for the next chunk, and matches use an incremental cursor
1765 /// so total line-counting work is O(chunk_size), not O(chunk × matches).
1766 ///
1767 /// Returns `Ok(true)` if there are more chunks to process, `Ok(false)`
1768 /// when the scan is complete.
1769 ///
1770 /// TODO: For concurrent/parallel search (searching multiple files at once),
1771 /// chunks would need to return chunk-relative line numbers and have them
1772 /// fixed up with each file's starting line offset after all chunks complete.
1773 pub fn search_scan_next_chunk(
1774 &mut self,
1775 state: &mut ChunkedSearchState,
1776 ) -> std::io::Result<bool> {
1777 if state.is_done() {
1778 return Ok(false);
1779 }
1780
1781 let chunk_info = state.chunks[state.next_chunk].clone();
1782 let doc_offset = state.next_doc_offset;
1783
1784 state.next_chunk += 1;
1785 state.scanned_bytes += chunk_info.byte_len;
1786 state.next_doc_offset += chunk_info.byte_len;
1787
1788 // Load the chunk bytes
1789 let chunk_bytes = self
1790 .get_text_range_mut(doc_offset, chunk_info.byte_len)
1791 .map_err(std::io::Error::other)?;
1792
1793 // Build search buffer: overlap tail + new chunk
1794 let overlap_len = state.overlap_tail.len();
1795 let mut search_buf = Vec::with_capacity(overlap_len + chunk_bytes.len());
1796 search_buf.extend_from_slice(&state.overlap_tail);
1797 search_buf.extend_from_slice(&chunk_bytes);
1798
1799 let buf_doc_offset = if overlap_len > 0 {
1800 state.overlap_doc_offset
1801 } else {
1802 doc_offset
1803 };
1804
1805 // Line number at buf_doc_offset: running_line tracks the line at
1806 // doc_offset (start of new chunk data). Count newlines in the overlap
1807 // prefix to get the line at the start of the full search_buf.
1808 let newlines_in_overlap = search_buf[..overlap_len]
1809 .iter()
1810 .filter(|&&b| b == b'\n')
1811 .count();
1812 let mut line_at = state.running_line - newlines_in_overlap;
1813 let mut counted_to = 0usize;
1814
1815 // Run regex on the combined buffer
1816 for m in state.regex.find_iter(&search_buf) {
1817 // Skip matches entirely within the overlap (already found)
1818 if overlap_len > 0 && m.end() <= overlap_len {
1819 continue;
1820 }
1821
1822 if state.matches.len() >= state.max_matches {
1823 state.capped = true;
1824 break;
1825 }
1826
1827 // Advance line counter incrementally to this match
1828 line_at += search_buf[counted_to..m.start()]
1829 .iter()
1830 .filter(|&&b| b == b'\n')
1831 .count();
1832 counted_to = m.start();
1833
1834 // Find line boundaries in search_buf for context
1835 let line_start = search_buf[..m.start()]
1836 .iter()
1837 .rposition(|&b| b == b'\n')
1838 .map(|p| p + 1)
1839 .unwrap_or(0);
1840 let line_end = search_buf[m.start()..]
1841 .iter()
1842 .position(|&b| b == b'\n')
1843 .map(|p| m.start() + p)
1844 .unwrap_or(search_buf.len());
1845
1846 let match_doc_offset = buf_doc_offset + m.start();
1847 let match_len = m.end() - m.start();
1848 let column = m.start() - line_start + 1;
1849 let context = String::from_utf8_lossy(&search_buf[line_start..line_end]).into_owned();
1850
1851 state.matches.push(SearchMatch {
1852 byte_offset: match_doc_offset,
1853 length: match_len,
1854 line: line_at,
1855 column,
1856 context,
1857 });
1858 }
1859
1860 // Advance running_line by newlines in the new (non-overlap) chunk data
1861 let newlines_in_chunk = chunk_bytes.iter().filter(|&&b| b == b'\n').count();
1862 state.running_line += newlines_in_chunk;
1863
1864 // Save overlap tail for next chunk
1865 let max_overlap = state.query_len.max(256).min(chunk_bytes.len());
1866 let tail_start = chunk_bytes.len().saturating_sub(max_overlap);
1867 state.overlap_tail = chunk_bytes[tail_start..].to_vec();
1868 state.overlap_doc_offset = doc_offset + tail_start;
1869
1870 Ok(!state.is_done())
1871 }
1872
1873 /// Run a complete chunked search over the piece tree (all chunks).
1874 ///
1875 /// Synchronous variant — used for dirty buffer snapshots in project
1876 /// grep and in tests. For on-disk files, use `FileSystem::search_file`.
1877 pub fn search_scan_all(
1878 &mut self,
1879 regex: regex::bytes::Regex,
1880 max_matches: usize,
1881 query_len: usize,
1882 ) -> std::io::Result<ChunkedSearchState> {
1883 let mut state = self.search_scan_init(regex, max_matches, query_len);
1884 while self.search_scan_next_chunk(&mut state)? {}
1885 Ok(state)
1886 }
1887
1888 /// Build a hybrid search plan from the piece tree.
1889 ///
1890 /// Extracts regions (unloaded file ranges + loaded in-memory data) that
1891 /// can be searched independently. The plan is `Send` so it can be
1892 /// executed on a background thread via `HybridSearchPlan::execute`.
1893 ///
1894 /// Returns `None` if the buffer has no file path (caller should fall
1895 /// back to `search_scan_all`).
1896 pub fn search_hybrid_plan(&mut self) -> Option<HybridSearchPlan> {
1897 let file_path = self.persistence.file_path_owned()?;
1898
1899 self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1900 let leaves = self.piece_tree.get_leaves();
1901
1902 let mut regions: Vec<SearchRegion> = Vec::new();
1903 let mut doc_offset = 0usize;
1904
1905 for leaf in &leaves {
1906 let buf = self.buffers.get(leaf.location.buffer_id());
1907 let is_unloaded_stored = matches!(
1908 (&leaf.location, buf),
1909 (
1910 BufferLocation::Stored(_),
1911 Some(StringBuffer {
1912 data: BufferData::Unloaded { .. },
1913 ..
1914 }),
1915 )
1916 );
1917
1918 if is_unloaded_stored {
1919 let file_offset = match buf.unwrap().data {
1920 BufferData::Unloaded {
1921 file_offset: fo, ..
1922 } => fo + leaf.offset,
1923 _ => unreachable!(),
1924 };
1925
1926 // Merge with previous unloaded region if contiguous
1927 if let Some(SearchRegion::Unloaded {
1928 file_offset: prev_fo,
1929 bytes: prev_bytes,
1930 ..
1931 }) = regions.last_mut()
1932 {
1933 if *prev_fo + *prev_bytes == file_offset {
1934 *prev_bytes += leaf.bytes;
1935 doc_offset += leaf.bytes;
1936 continue;
1937 }
1938 }
1939 regions.push(SearchRegion::Unloaded {
1940 file_offset,
1941 bytes: leaf.bytes,
1942 doc_offset,
1943 });
1944 } else {
1945 let data = match buf.and_then(|b| b.get_data()) {
1946 Some(full) => {
1947 let end = (leaf.offset + leaf.bytes).min(full.len());
1948 full[leaf.offset..end].to_vec()
1949 }
1950 None => match self.get_text_range_mut(doc_offset, leaf.bytes) {
1951 Ok(d) => d,
1952 Err(_) => {
1953 doc_offset += leaf.bytes;
1954 continue;
1955 }
1956 },
1957 };
1958
1959 // Merge with previous loaded region
1960 if let Some(SearchRegion::Loaded {
1961 data: prev_data, ..
1962 }) = regions.last_mut()
1963 {
1964 prev_data.extend_from_slice(&data);
1965 doc_offset += leaf.bytes;
1966 continue;
1967 }
1968 regions.push(SearchRegion::Loaded { data, doc_offset });
1969 }
1970
1971 doc_offset += leaf.bytes;
1972 }
1973
1974 Some(HybridSearchPlan { file_path, regions })
1975 }
1976
1977 /// Hybrid search: uses `fs.search_file` for unloaded piece-tree regions
1978 /// (searches where the data lives, no network transfer) and in-memory regex
1979 /// for loaded/edited regions. Handles overlap at region boundaries.
1980 ///
1981 /// For a huge remote file with a small local edit, this avoids transferring
1982 /// the entire file — only match metadata crosses the network.
1983 ///
1984 /// Falls back to `search_scan_all` when the buffer has no file path or is
1985 /// fully loaded.
1986 pub fn search_hybrid(
1987 &mut self,
1988 pattern: &str,
1989 opts: &FileSearchOptions,
1990 regex: Regex,
1991 max_matches: usize,
1992 query_len: usize,
1993 ) -> io::Result<Vec<SearchMatch>> {
1994 let plan = match self.search_hybrid_plan() {
1995 Some(p) => p,
1996 None => {
1997 let state = self.search_scan_all(regex, max_matches, query_len)?;
1998 return Ok(state.matches);
1999 }
2000 };
2001 plan.execute(
2002 &**self.persistence.fs(),
2003 pattern,
2004 opts,
2005 ®ex,
2006 max_matches,
2007 query_len,
2008 )
2009 }
2010
2011 /// Count `\n` bytes in a single leaf.
2012 ///
2013 /// Uses `count_line_feeds_in_range` for unloaded buffers, which remote
2014 /// filesystem implementations can override to count server-side.
2015 pub fn scan_leaf(&self, leaf: &crate::model::piece_tree::LeafData) -> std::io::Result<usize> {
2016 let buffer_id = leaf.location.buffer_id();
2017 let buffer = self
2018 .buffers
2019 .get(buffer_id)
2020 .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::NotFound, "buffer not found"))?;
2021
2022 let count = match &buffer.data {
2023 crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2024 let end = (leaf.offset + leaf.bytes).min(data.len());
2025 data[leaf.offset..end]
2026 .iter()
2027 .filter(|&&b| b == b'\n')
2028 .count()
2029 }
2030 crate::model::piece_tree::BufferData::Unloaded {
2031 file_path,
2032 file_offset,
2033 ..
2034 } => {
2035 let read_offset = *file_offset as u64 + leaf.offset as u64;
2036 self.persistence.fs().count_line_feeds_in_range(
2037 file_path,
2038 read_offset,
2039 leaf.bytes,
2040 )?
2041 }
2042 };
2043 Ok(count)
2044 }
2045
2046 /// Return the I/O parameters for an unloaded leaf, or `None` if loaded.
2047 ///
2048 /// Used by the incremental scan to distinguish leaves that can be counted
2049 /// in-memory (via `scan_leaf`) from those that need filesystem I/O.
2050 pub fn leaf_io_params(
2051 &self,
2052 leaf: &crate::model::piece_tree::LeafData,
2053 ) -> Option<(std::path::PathBuf, u64, usize)> {
2054 let buffer_id = leaf.location.buffer_id();
2055 let buffer = self.buffers.get(buffer_id)?;
2056 match &buffer.data {
2057 crate::model::piece_tree::BufferData::Loaded { .. } => None,
2058 crate::model::piece_tree::BufferData::Unloaded {
2059 file_path,
2060 file_offset,
2061 ..
2062 } => {
2063 let read_offset = *file_offset as u64 + leaf.offset as u64;
2064 Some((file_path.clone(), read_offset, leaf.bytes))
2065 }
2066 }
2067 }
2068
2069 /// Get a reference to the string buffers (for parallel scanning).
2070 pub fn buffer_slice(&self) -> &[StringBuffer] {
2071 &self.buffers
2072 }
2073
2074 /// Apply the results of an incremental line scan.
2075 pub fn apply_scan_updates(&mut self, updates: &[(usize, usize)]) {
2076 self.piece_tree.update_leaf_line_feeds(updates);
2077 self.file_kind.mark_line_feed_scan_complete();
2078 }
2079
2080 /// After an incremental line-feed scan completes, rebuild the tree so that
2081 /// `saved_root` and the current tree share `Arc` pointers for unedited
2082 /// subtrees. This makes `diff_since_saved()` O(edited regions) instead of
2083 /// O(file size).
2084 pub fn rebuild_with_pristine_saved_root(&mut self, scan_updates: &[(usize, usize)]) {
2085 let file_size = match self.persistence.saved_file_size() {
2086 Some(s) => s,
2087 None => {
2088 // Fallback: no saved file size means we can't build a pristine
2089 // tree. Just apply updates the old way.
2090 self.apply_scan_updates(scan_updates);
2091 return;
2092 }
2093 };
2094
2095 // --- Walk the current tree to extract deletions and insertions ---
2096 let total = self.total_bytes();
2097 // Deletions: gaps in Stored coverage (orig_offset, len).
2098 let mut deletions: Vec<(usize, usize)> = Vec::new();
2099 // Insertions: (post_delete_offset, location, buf_offset, bytes, lf_cnt).
2100 // post_delete_offset = cumulative surviving Stored bytes before this point.
2101 let mut insertions: Vec<(usize, BufferLocation, usize, usize, Option<usize>)> = Vec::new();
2102 let mut orig_cursor: usize = 0;
2103 let mut stored_bytes_in_doc: usize = 0;
2104
2105 for piece in self.piece_tree.iter_pieces_in_range(0, total) {
2106 match piece.location {
2107 BufferLocation::Stored(_) => {
2108 if piece.buffer_offset > orig_cursor {
2109 deletions.push((orig_cursor, piece.buffer_offset - orig_cursor));
2110 }
2111 orig_cursor = piece.buffer_offset + piece.bytes;
2112 stored_bytes_in_doc += piece.bytes;
2113 }
2114 BufferLocation::Added(id) => {
2115 // Check if this Added buffer was created by loading a chunk
2116 // from the stored file (via get_text_range_mut chunk loading).
2117 // If so, treat it as stored content, not a user edit.
2118 if let Some(file_off) = self.buffers.get(id).and_then(|b| b.stored_file_offset)
2119 {
2120 if file_off > orig_cursor {
2121 deletions.push((orig_cursor, file_off - orig_cursor));
2122 }
2123 orig_cursor = file_off + piece.bytes;
2124 stored_bytes_in_doc += piece.bytes;
2125 } else {
2126 insertions.push((
2127 stored_bytes_in_doc,
2128 piece.location,
2129 piece.buffer_offset,
2130 piece.bytes,
2131 piece.line_feed_cnt,
2132 ));
2133 }
2134 }
2135 }
2136 }
2137 // Trailing deletion.
2138 if orig_cursor < file_size {
2139 deletions.push((orig_cursor, file_size - orig_cursor));
2140 }
2141
2142 // --- Build pristine tree (full original file, pre-split, with lf counts) ---
2143 let mut pristine = if file_size > 0 {
2144 PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
2145 } else {
2146 PieceTree::empty()
2147 };
2148 pristine.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
2149 pristine.update_leaf_line_feeds(scan_updates);
2150
2151 // Snapshot the pristine tree as saved_root.
2152 self.persistence.set_saved_root(pristine.root());
2153
2154 // If no edits, the pristine tree IS the current tree.
2155 if deletions.is_empty() && insertions.is_empty() {
2156 self.piece_tree = pristine;
2157 self.file_kind.mark_line_feed_scan_complete();
2158 return;
2159 }
2160
2161 // --- Replay edits onto a clone of the pristine tree ---
2162 let mut tree = pristine;
2163
2164 // Apply deletions from HIGH to LOW offset so earlier offsets stay valid.
2165 deletions.sort_by(|a, b| b.0.cmp(&a.0));
2166 for &(offset, len) in &deletions {
2167 tree.delete(offset, len, &self.buffers);
2168 }
2169
2170 // Apply insertions from LOW to HIGH. Each insertion shifts subsequent
2171 // offsets by its byte count, tracked via insert_delta.
2172 let mut insert_delta: usize = 0;
2173 for &(offset, location, buf_offset, bytes, lf_cnt) in &insertions {
2174 tree.insert(
2175 offset + insert_delta,
2176 location,
2177 buf_offset,
2178 bytes,
2179 lf_cnt,
2180 &self.buffers,
2181 );
2182 insert_delta += bytes;
2183 }
2184
2185 // Path-copy insert/delete may split Stored leaves whose data is
2186 // Unloaded, producing fragments with line_feed_cnt = None
2187 // (compute_line_feeds_static can't read unloaded data). Fix them up
2188 // by scanning any remaining None leaves.
2189 let leaves = tree.get_leaves();
2190 let mut fixups: Vec<(usize, usize)> = Vec::new();
2191 for (idx, leaf) in leaves.iter().enumerate() {
2192 if leaf.line_feed_cnt.is_none() {
2193 if let Ok(count) = self.scan_leaf(leaf) {
2194 fixups.push((idx, count));
2195 }
2196 }
2197 }
2198 if !fixups.is_empty() {
2199 tree.update_leaf_line_feeds_path_copy(&fixups);
2200 }
2201
2202 self.piece_tree = tree;
2203 self.file_kind.mark_line_feed_scan_complete();
2204 }
2205
2206 /// Resolve the exact byte offset for a given line number (0-indexed).
2207 ///
2208 /// Uses the tree's line feed counts to find the piece containing the target line,
2209 /// then loads/reads that piece's data to find the exact newline position.
2210 /// This works even when buffers are unloaded (large file with scanned line index).
2211 pub fn resolve_line_byte_offset(&mut self, target_line: usize) -> Option<usize> {
2212 if target_line == 0 {
2213 return Some(0);
2214 }
2215
2216 // Use tree metadata to find the piece containing the target line
2217 let (doc_offset, buffer_id, piece_offset, piece_bytes, lines_before) =
2218 self.piece_tree.piece_info_for_line(target_line)?;
2219
2220 // We need to find the (target_line - lines_before)-th newline within this piece
2221 let lines_to_skip = target_line - lines_before;
2222
2223 // Get the piece data — either from loaded buffer or read from disk
2224 let buffer = self.buffers.get(buffer_id)?;
2225 let piece_data: Vec<u8> = match &buffer.data {
2226 crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2227 let end = (piece_offset + piece_bytes).min(data.len());
2228 data[piece_offset..end].to_vec()
2229 }
2230 crate::model::piece_tree::BufferData::Unloaded {
2231 file_path,
2232 file_offset,
2233 ..
2234 } => {
2235 let read_offset = *file_offset as u64 + piece_offset as u64;
2236 self.persistence
2237 .fs()
2238 .read_range(file_path, read_offset, piece_bytes)
2239 .ok()?
2240 }
2241 };
2242
2243 // Count newlines to find the target line start
2244 let mut newlines_found = 0;
2245 for (i, &byte) in piece_data.iter().enumerate() {
2246 if byte == b'\n' {
2247 newlines_found += 1;
2248 if newlines_found == lines_to_skip {
2249 // The target line starts right after this newline
2250 return Some(doc_offset + i + 1);
2251 }
2252 }
2253 }
2254
2255 // If we didn't find enough newlines, the line starts in the next piece
2256 // Return the end of this piece as an approximation
2257 Some(doc_offset + piece_bytes)
2258 }
2259
2260 /// Get the saved file size (size of the file on disk after last load/save)
2261 /// For large files, this is used during recovery to know the expected original file size.
2262 /// Returns None for new unsaved buffers.
2263 pub fn original_file_size(&self) -> Option<usize> {
2264 // Return the tracked saved file size - this is updated when the file is
2265 // loaded or saved, so it always reflects the current file on disk.
2266 self.persistence.saved_file_size()
2267 }
2268
2269 /// Get recovery chunks for this buffer (only modified portions)
2270 ///
2271 /// For large files, this returns only the pieces that come from Added buffers
2272 /// (i.e., the modifications), not the original file content. This allows
2273 /// efficient incremental recovery without reading/writing the entire file.
2274 ///
2275 /// Returns: Vec of (original_file_offset, data) for each modified chunk
2276 /// The offset is the position in the ORIGINAL file where this chunk should be inserted.
2277 pub fn get_recovery_chunks(&self) -> Vec<(usize, Vec<u8>)> {
2278 use crate::model::piece_tree::BufferLocation;
2279
2280 let mut chunks = Vec::new();
2281 let total = self.total_bytes();
2282
2283 // Track cumulative bytes from Stored pieces as we iterate.
2284 // This gives us the original file offset for Added pieces.
2285 // The key insight: Added pieces should be inserted at the position
2286 // corresponding to where they appear relative to Stored content,
2287 // not their position in the current document.
2288 let mut stored_bytes_before = 0;
2289
2290 for piece in self.piece_tree.iter_pieces_in_range(0, total) {
2291 match piece.location {
2292 BufferLocation::Stored(_) => {
2293 // Accumulate stored bytes to track position in original file
2294 stored_bytes_before += piece.bytes;
2295 }
2296 BufferLocation::Added(buffer_id) => {
2297 if let Some(buffer) = self.buffers.iter().find(|b| b.id == buffer_id) {
2298 // Skip buffers that originate from the original file
2299 // (loaded by chunk_split_and_load for viewport display).
2300 // These have stored_file_offset set and are not user edits.
2301 //
2302 // Why Added and not Stored? The piece tree only has two
2303 // variants: Stored and Added. chunk_split_and_load marks
2304 // loaded chunks as Added(new_id) because
2305 // rebuild_with_pristine_saved_root interprets Stored
2306 // pieces' buffer_offset as a position in the original
2307 // file — but a chunk buffer starts at offset 0, so using
2308 // Stored would corrupt the rebuild logic. We rely on
2309 // stored_file_offset instead to distinguish "loaded from
2310 // disk" from "user edit". A third BufferLocation variant
2311 // (e.g. LoadedChunk) would make this distinction explicit
2312 // in the type system rather than requiring this runtime
2313 // check.
2314 if buffer.stored_file_offset.is_some() {
2315 stored_bytes_before += piece.bytes;
2316 continue;
2317 }
2318 // Get the data from the buffer if loaded
2319 if let Some(data) = buffer.get_data() {
2320 // Extract just the portion this piece references
2321 let start = piece.buffer_offset;
2322 let end = start + piece.bytes;
2323 if end <= data.len() {
2324 // Use stored_bytes_before as the original file offset.
2325 // This is where this insertion should go relative to
2326 // the original file content.
2327 chunks.push((stored_bytes_before, data[start..end].to_vec()));
2328 }
2329 }
2330 }
2331 }
2332 }
2333 }
2334
2335 chunks
2336 }
2337
2338 /// Check if this buffer contains binary content
2339 pub fn is_binary(&self) -> bool {
2340 self.file_kind.is_binary()
2341 }
2342
2343 /// Get the line ending format for this buffer
2344 pub fn line_ending(&self) -> LineEnding {
2345 self.format.line_ending()
2346 }
2347
2348 /// Set the line ending format for this buffer
2349 ///
2350 /// This marks the buffer as modified since the line ending format has changed.
2351 /// On save, the buffer content will be converted to the new format.
2352 pub fn set_line_ending(&mut self, line_ending: LineEnding) {
2353 self.format.set_line_ending(line_ending);
2354 self.mark_content_modified();
2355 }
2356
2357 /// Set the default line ending format for a new/empty buffer
2358 ///
2359 /// Unlike `set_line_ending`, this does NOT mark the buffer as modified.
2360 /// This should be used when initializing a new buffer with a configured default.
2361 pub fn set_default_line_ending(&mut self, line_ending: LineEnding) {
2362 self.format.set_default_line_ending(line_ending);
2363 }
2364
2365 /// Get the encoding format for this buffer
2366 pub fn encoding(&self) -> Encoding {
2367 self.format.encoding()
2368 }
2369
2370 /// Set the encoding format for this buffer
2371 ///
2372 /// This marks the buffer as modified since the encoding format has changed.
2373 /// On save, the buffer content will be converted to the new encoding.
2374 pub fn set_encoding(&mut self, encoding: Encoding) {
2375 self.format.set_encoding(encoding);
2376 self.mark_content_modified();
2377 }
2378
2379 /// Set the default encoding format for a new/empty buffer
2380 ///
2381 /// Unlike `set_encoding`, this does NOT mark the buffer as modified.
2382 /// This should be used when initializing a new buffer with a configured default.
2383 pub fn set_default_encoding(&mut self, encoding: Encoding) {
2384 self.format.set_default_encoding(encoding);
2385 }
2386
2387 /// Get the first line of the buffer as a lossy UTF-8 string, suitable
2388 /// for shebang / first-line grammar detection. Returns `None` for an
2389 /// empty buffer. Non-UTF-8 bytes are replaced with U+FFFD.
2390 pub fn first_line_lossy(&self) -> Option<String> {
2391 let bytes = self.get_line(0)?;
2392 if bytes.is_empty() {
2393 return None;
2394 }
2395 Some(String::from_utf8_lossy(&bytes).into_owned())
2396 }
2397
2398 /// Get text for a specific line
2399 pub fn get_line(&self, line: usize) -> Option<Vec<u8>> {
2400 let (start, end) = self.piece_tree.line_range(line, &self.buffers)?;
2401
2402 let bytes = if let Some(end_offset) = end {
2403 end_offset.saturating_sub(start)
2404 } else {
2405 self.total_bytes().saturating_sub(start)
2406 };
2407
2408 self.get_text_range(start, bytes)
2409 }
2410
2411 /// Get the byte offset where a line starts
2412 pub fn line_start_offset(&self, line: usize) -> Option<usize> {
2413 let (start, _) = self.piece_tree.line_range(line, &self.buffers)?;
2414 Some(start)
2415 }
2416
2417 /// Get piece information at a byte offset
2418 pub fn piece_info_at_offset(&self, offset: usize) -> Option<PieceInfo> {
2419 self.piece_tree.find_by_offset(offset)
2420 }
2421
2422 /// Get tree statistics for debugging
2423 pub fn stats(&self) -> TreeStats {
2424 self.piece_tree.stats()
2425 }
2426
2427 // Search and Replace Operations
2428
2429 /// Find the next occurrence of a pattern, with wrap-around
2430 pub fn find_next(&self, pattern: &str, start_pos: usize) -> Option<usize> {
2431 if pattern.is_empty() {
2432 return None;
2433 }
2434
2435 let pattern_bytes = pattern.as_bytes();
2436 let buffer_len = self.len();
2437
2438 // Search from start_pos to end
2439 if start_pos < buffer_len {
2440 if let Some(offset) = self.find_pattern(start_pos, buffer_len, pattern_bytes) {
2441 return Some(offset);
2442 }
2443 }
2444
2445 // Wrap around: search from beginning to start_pos
2446 if start_pos > 0 {
2447 if let Some(offset) = self.find_pattern(0, start_pos, pattern_bytes) {
2448 return Some(offset);
2449 }
2450 }
2451
2452 None
2453 }
2454
2455 /// Find the next occurrence of a pattern within an optional range
2456 /// If range is None, searches the entire buffer with wrap-around (same as find_next)
2457 /// If range is Some, searches only within that range without wrap-around
2458 pub fn find_next_in_range(
2459 &self,
2460 pattern: &str,
2461 start_pos: usize,
2462 range: Option<Range<usize>>,
2463 ) -> Option<usize> {
2464 if pattern.is_empty() {
2465 return None;
2466 }
2467
2468 if let Some(search_range) = range {
2469 // Search within range only, no wrap-around
2470 let pattern_bytes = pattern.as_bytes();
2471 let search_start = start_pos.max(search_range.start);
2472 let search_end = search_range.end.min(self.len());
2473
2474 if search_start < search_end {
2475 self.find_pattern(search_start, search_end, pattern_bytes)
2476 } else {
2477 None
2478 }
2479 } else {
2480 // No range specified, use normal find_next with wrap-around
2481 self.find_next(pattern, start_pos)
2482 }
2483 }
2484
2485 /// Find pattern in a byte range using overlapping chunks
2486 fn find_pattern(&self, start: usize, end: usize, pattern: &[u8]) -> Option<usize> {
2487 if pattern.is_empty() || start >= end {
2488 return None;
2489 }
2490
2491 const CHUNK_SIZE: usize = 65536; // 64KB chunks
2492 let overlap = pattern.len().saturating_sub(1).max(1);
2493
2494 // Use the overlapping chunks iterator for efficient streaming search
2495 let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, overlap);
2496
2497 for chunk in chunks {
2498 // Search the entire chunk buffer
2499 if let Some(pos) = Self::find_in_bytes(&chunk.buffer, pattern) {
2500 let match_end = pos + pattern.len();
2501 // Only report if match ENDS in or after the valid zone
2502 // This ensures patterns spanning boundaries are found exactly once
2503 if match_end > chunk.valid_start {
2504 let absolute_pos = chunk.absolute_pos + pos;
2505 // Verify the match doesn't extend beyond our search range
2506 if absolute_pos + pattern.len() <= end {
2507 return Some(absolute_pos);
2508 }
2509 }
2510 }
2511 }
2512
2513 None
2514 }
2515
2516 /// Simple byte pattern search using naive algorithm
2517 fn find_in_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
2518 if needle.is_empty() || needle.len() > haystack.len() {
2519 return None;
2520 }
2521
2522 (0..=haystack.len() - needle.len()).find(|&i| &haystack[i..i + needle.len()] == needle)
2523 }
2524
2525 /// Find the next occurrence of a regex pattern, with wrap-around
2526 pub fn find_next_regex(&self, regex: &Regex, start_pos: usize) -> Option<usize> {
2527 let buffer_len = self.len();
2528
2529 // Search from start_pos to end
2530 if start_pos < buffer_len {
2531 if let Some(offset) = self.find_regex(start_pos, buffer_len, regex) {
2532 return Some(offset);
2533 }
2534 }
2535
2536 // Wrap around: search from beginning to start_pos
2537 if start_pos > 0 {
2538 if let Some(offset) = self.find_regex(0, start_pos, regex) {
2539 return Some(offset);
2540 }
2541 }
2542
2543 None
2544 }
2545
2546 /// Find the next occurrence of a regex pattern within an optional range
2547 pub fn find_next_regex_in_range(
2548 &self,
2549 regex: &Regex,
2550 start_pos: usize,
2551 range: Option<Range<usize>>,
2552 ) -> Option<usize> {
2553 if let Some(search_range) = range {
2554 let search_start = start_pos.max(search_range.start);
2555 let search_end = search_range.end.min(self.len());
2556
2557 if search_start < search_end {
2558 self.find_regex(search_start, search_end, regex)
2559 } else {
2560 None
2561 }
2562 } else {
2563 self.find_next_regex(regex, start_pos)
2564 }
2565 }
2566
2567 /// Find regex pattern in a byte range using overlapping chunks
2568 fn find_regex(&self, start: usize, end: usize, regex: &Regex) -> Option<usize> {
2569 if start >= end {
2570 return None;
2571 }
2572
2573 const CHUNK_SIZE: usize = 1048576; // 1MB chunks
2574 const OVERLAP: usize = 4096; // 4KB overlap for regex
2575
2576 // Use the overlapping chunks iterator for efficient streaming search
2577 // This fixes the critical bug where regex patterns spanning chunk boundaries were missed
2578 let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, OVERLAP);
2579
2580 for chunk in chunks {
2581 // Search the entire chunk buffer
2582 if let Some(mat) = regex.find(&chunk.buffer) {
2583 let match_end = mat.end();
2584 // Only report if match ENDS in or after the valid zone
2585 // This ensures patterns spanning boundaries are found exactly once
2586 if match_end > chunk.valid_start {
2587 let absolute_pos = chunk.absolute_pos + mat.start();
2588 // Verify the match doesn't extend beyond our search range
2589 let match_len = mat.end() - mat.start();
2590 if absolute_pos + match_len <= end {
2591 return Some(absolute_pos);
2592 }
2593 }
2594 }
2595 }
2596
2597 None
2598 }
2599
2600 /// Replace a range with replacement text
2601 pub fn replace_range(&mut self, range: Range<usize>, replacement: &str) -> bool {
2602 if range.start >= self.len() {
2603 return false;
2604 }
2605
2606 let end = range.end.min(self.len());
2607 if end > range.start {
2608 self.delete_bytes(range.start, end - range.start);
2609 }
2610
2611 if !replacement.is_empty() {
2612 self.insert(range.start, replacement);
2613 }
2614
2615 true
2616 }
2617
2618 /// Find and replace the next occurrence of a pattern
2619 pub fn replace_next(
2620 &mut self,
2621 pattern: &str,
2622 replacement: &str,
2623 start_pos: usize,
2624 range: Option<Range<usize>>,
2625 ) -> Option<usize> {
2626 if let Some(pos) = self.find_next_in_range(pattern, start_pos, range.clone()) {
2627 self.replace_range(pos..pos + pattern.len(), replacement);
2628 Some(pos)
2629 } else {
2630 None
2631 }
2632 }
2633
2634 /// Replace all occurrences of a pattern with replacement text
2635 pub fn replace_all(&mut self, pattern: &str, replacement: &str) -> usize {
2636 if pattern.is_empty() {
2637 return 0;
2638 }
2639
2640 let mut count = 0;
2641 let mut pos = 0;
2642
2643 // Keep searching and replacing
2644 // Note: we search forward from last replacement to handle growth/shrinkage
2645 // Find next occurrence (no wrap-around for replace_all)
2646 while let Some(found_pos) = self.find_next_in_range(pattern, pos, Some(0..self.len())) {
2647 self.replace_range(found_pos..found_pos + pattern.len(), replacement);
2648 count += 1;
2649
2650 // Move past the replacement
2651 pos = found_pos + replacement.len();
2652
2653 // If we're at or past the end, stop
2654 if pos >= self.len() {
2655 break;
2656 }
2657 }
2658
2659 count
2660 }
2661
2662 /// Replace all occurrences of a regex pattern with replacement text
2663 pub fn replace_all_regex(&mut self, regex: &Regex, replacement: &str) -> Result<usize> {
2664 let mut count = 0;
2665 let mut pos = 0;
2666
2667 while let Some(found_pos) = self.find_next_regex_in_range(regex, pos, Some(0..self.len())) {
2668 // Get the match to find its length
2669 let text = self
2670 .get_text_range_mut(found_pos, self.len() - found_pos)
2671 .context("Failed to read text for regex match")?;
2672
2673 if let Some(mat) = regex.find(&text) {
2674 self.replace_range(found_pos..found_pos + mat.len(), replacement);
2675 count += 1;
2676 pos = found_pos + replacement.len();
2677
2678 if pos >= self.len() {
2679 break;
2680 }
2681 } else {
2682 break;
2683 }
2684 }
2685
2686 Ok(count)
2687 }
2688
2689 // LSP Support (UTF-16 conversions)
2690
2691 /// Convert byte position to (line, column) in bytes
2692 pub fn position_to_line_col(&self, byte_pos: usize) -> (usize, usize) {
2693 self.offset_to_position(byte_pos)
2694 .map(|pos| (pos.line, pos.column))
2695 .unwrap_or_else(|| (byte_pos / 80, 0)) // Estimate if metadata unavailable
2696 }
2697
2698 /// Convert (line, character) to byte position - 0-indexed
2699 /// character is in BYTES, not UTF-16 code units
2700 /// Optimized to use single line_range() call instead of two
2701 pub fn line_col_to_position(&self, line: usize, character: usize) -> usize {
2702 if let Some((start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2703 // Calculate line length from the range
2704 let line_len = if let Some(end_offset) = end {
2705 end_offset.saturating_sub(start)
2706 } else {
2707 self.total_bytes().saturating_sub(start)
2708 };
2709 let byte_offset = character.min(line_len);
2710 start + byte_offset
2711 } else {
2712 // Line doesn't exist, return end of buffer
2713 self.len()
2714 }
2715 }
2716
2717 /// Convert byte position to LSP position (line, UTF-16 code units)
2718 /// LSP protocol uses UTF-16 code units for character offsets
2719 pub fn position_to_lsp_position(&self, byte_pos: usize) -> (usize, usize) {
2720 let (line, column_bytes) = self
2721 .offset_to_position(byte_pos)
2722 .map(|pos| (pos.line, pos.column))
2723 .unwrap_or_else(|| (byte_pos / 80, 0)); // Estimate if metadata unavailable
2724
2725 // Get the line content
2726 if let Some(line_bytes) = self.get_line(line) {
2727 // Convert byte offset to UTF-16 code units
2728 let text_before = &line_bytes[..column_bytes.min(line_bytes.len())];
2729 let text_str = String::from_utf8_lossy(text_before);
2730 let utf16_offset = text_str.encode_utf16().count();
2731 (line, utf16_offset)
2732 } else {
2733 (line, 0)
2734 }
2735 }
2736
2737 /// Convert LSP position (line, UTF-16 code units) to byte position
2738 /// LSP uses UTF-16 code units for character offsets, not bytes
2739 /// Optimized to use single line_range() call instead of two
2740 pub fn lsp_position_to_byte(&self, line: usize, utf16_offset: usize) -> usize {
2741 if let Some((line_start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2742 // Calculate line length and get line content
2743 let line_len = if let Some(end_offset) = end {
2744 end_offset.saturating_sub(line_start)
2745 } else {
2746 self.total_bytes().saturating_sub(line_start)
2747 };
2748
2749 if line_len > 0 {
2750 // If data is unloaded, return line_start as fallback
2751 let Some(line_bytes) = self.get_text_range(line_start, line_len) else {
2752 return line_start;
2753 };
2754 let line_str = String::from_utf8_lossy(&line_bytes);
2755
2756 // Convert UTF-16 offset to byte offset
2757 let mut utf16_count = 0;
2758 let mut byte_offset = 0;
2759
2760 for ch in line_str.chars() {
2761 if utf16_count >= utf16_offset {
2762 break;
2763 }
2764 utf16_count += ch.len_utf16();
2765 byte_offset += ch.len_utf8();
2766 }
2767
2768 line_start + byte_offset
2769 } else {
2770 line_start
2771 }
2772 } else {
2773 // Line doesn't exist, return end of buffer
2774 self.len()
2775 }
2776 }
2777
2778 // Navigation helpers
2779
2780 /// Find the previous character boundary (UTF-8 aware)
2781 pub fn prev_char_boundary(&self, pos: usize) -> usize {
2782 if pos == 0 {
2783 return 0;
2784 }
2785
2786 // Get a few bytes before pos to find the character boundary
2787 let start = pos.saturating_sub(4);
2788 let Some(bytes) = self.get_text_range(start, pos - start) else {
2789 // Data unloaded, return pos as fallback
2790 return pos;
2791 };
2792
2793 // Walk backwards to find a UTF-8 leading byte
2794 for i in (0..bytes.len()).rev() {
2795 let byte = bytes[i];
2796 // Check if this is a UTF-8 leading byte (not a continuation byte)
2797 if (byte & 0b1100_0000) != 0b1000_0000 {
2798 return start + i;
2799 }
2800 }
2801
2802 // Fallback
2803 pos.saturating_sub(1)
2804 }
2805
2806 /// Find the next character boundary (UTF-8 aware)
2807 pub fn next_char_boundary(&self, pos: usize) -> usize {
2808 let len = self.len();
2809 if pos >= len {
2810 return len;
2811 }
2812
2813 // Get a few bytes after pos to find the character boundary
2814 let end = (pos + 5).min(len);
2815 let Some(bytes) = self.get_text_range(pos, end - pos) else {
2816 // Data unloaded, return pos as fallback
2817 return pos;
2818 };
2819
2820 // Start from index 1 (we want the NEXT boundary)
2821 for (i, &byte) in bytes.iter().enumerate().skip(1) {
2822 // Check if this is a UTF-8 leading byte (not a continuation byte)
2823 if (byte & 0b1100_0000) != 0b1000_0000 {
2824 return pos + i;
2825 }
2826 }
2827
2828 // If we got here, we're at the end or found no boundary in the range
2829 end
2830 }
2831
2832 /// Check if a byte is a UTF-8 continuation byte (not at a char boundary)
2833 /// UTF-8 continuation bytes have the pattern 10xxxxxx (0x80-0xBF)
2834 /// This is the same check that str::is_char_boundary uses internally.
2835 #[inline]
2836 fn is_utf8_continuation_byte(byte: u8) -> bool {
2837 (byte & 0b1100_0000) == 0b1000_0000
2838 }
2839
2840 /// Snap position to a valid UTF-8 character boundary
2841 /// If already at a boundary, returns the same position.
2842 /// Otherwise, moves to the previous valid boundary.
2843 pub fn snap_to_char_boundary(&self, pos: usize) -> usize {
2844 let len = self.len();
2845 if pos == 0 || pos >= len {
2846 return pos.min(len);
2847 }
2848
2849 // Get the byte at pos to check if we're at a character boundary
2850 let Some(bytes) = self.get_text_range(pos, 1) else {
2851 // Data unloaded, return pos as fallback
2852 return pos;
2853 };
2854
2855 // A position is at a char boundary if the byte there is NOT a continuation byte
2856 if !Self::is_utf8_continuation_byte(bytes[0]) {
2857 // Already at a character boundary
2858 return pos;
2859 }
2860
2861 // Not at a boundary, find the previous one
2862 self.prev_char_boundary(pos)
2863 }
2864
2865 /// Find the previous grapheme cluster boundary (for proper cursor movement with combining characters)
2866 ///
2867 /// This handles complex scripts like Thai where multiple Unicode code points
2868 /// form a single visual character (grapheme cluster). For example, Thai "ที่"
2869 /// is 3 code points but 1 grapheme cluster.
2870 ///
2871 /// The lookahead window starts at 32 bytes but grows whenever the
2872 /// returned boundary sits at the start of the chunk — that is, whenever
2873 /// the chunk might not contain the full grapheme. This matters for ZWJ
2874 /// emoji sequences and Zalgo strings with many combining marks, which
2875 /// can easily exceed 32 bytes.
2876 pub fn prev_grapheme_boundary(&self, pos: usize) -> usize {
2877 if pos == 0 {
2878 return 0;
2879 }
2880
2881 let mut lookback: usize = 32;
2882 loop {
2883 // IMPORTANT: Align start to a valid character boundary to avoid invalid UTF-8
2884 // when get_text_range starts mid-character
2885 let raw_start = pos.saturating_sub(lookback);
2886 let start = if raw_start == 0 {
2887 0
2888 } else {
2889 // Find the character boundary at or before raw_start
2890 self.prev_char_boundary(raw_start + 1)
2891 };
2892
2893 let Some(bytes) = self.get_text_range(start, pos - start) else {
2894 // Data unloaded, fall back to char boundary
2895 return self.prev_char_boundary(pos);
2896 };
2897
2898 let text = match std::str::from_utf8(&bytes) {
2899 Ok(s) => s,
2900 Err(e) => {
2901 // Still got invalid UTF-8 (shouldn't happen after alignment)
2902 // Try using just the valid portion
2903 let valid_bytes = &bytes[..e.valid_up_to()];
2904 match std::str::from_utf8(valid_bytes) {
2905 Ok(s) if !s.is_empty() => s,
2906 _ => return self.prev_char_boundary(pos),
2907 }
2908 }
2909 };
2910
2911 // Use shared grapheme utility with relative position
2912 let rel_pos = pos - start;
2913 let new_rel_pos = grapheme::prev_grapheme_boundary(text, rel_pos);
2914
2915 // If the returned boundary is at the start of our chunk, the
2916 // grapheme may extend further back. Only trust the answer when
2917 // either we already reached the beginning of the buffer or the
2918 // boundary sits strictly inside the chunk.
2919 if new_rel_pos > 0 || start == 0 {
2920 return start + new_rel_pos;
2921 }
2922
2923 // Expand the lookback window and retry. Cap at the full buffer.
2924 if lookback >= pos {
2925 return 0;
2926 }
2927 lookback = lookback.saturating_mul(2);
2928 }
2929 }
2930
2931 /// Find the next grapheme cluster boundary (for proper cursor movement with combining characters)
2932 ///
2933 /// This handles complex scripts like Thai where multiple Unicode code points
2934 /// form a single visual character (grapheme cluster). For example, Thai "ที่"
2935 /// is 3 code points but 1 grapheme cluster.
2936 ///
2937 /// The lookahead window grows whenever the first grapheme reaches the
2938 /// end of the chunk — otherwise ZWJ emoji and Zalgo strings whose byte
2939 /// length exceeds the initial 32-byte window would be split mid-cluster.
2940 pub fn next_grapheme_boundary(&self, pos: usize) -> usize {
2941 let len = self.len();
2942 if pos >= len {
2943 return len;
2944 }
2945
2946 let mut lookahead: usize = 32;
2947 loop {
2948 let end = (pos + lookahead).min(len);
2949 let Some(bytes) = self.get_text_range(pos, end - pos) else {
2950 // Data unloaded, fall back to char boundary
2951 return self.next_char_boundary(pos);
2952 };
2953
2954 // Convert to UTF-8 string, handling the case where we might have
2955 // grabbed bytes that end mid-character (truncate to valid UTF-8)
2956 let text = match std::str::from_utf8(&bytes) {
2957 Ok(s) => s,
2958 Err(e) => {
2959 // The bytes end in an incomplete UTF-8 sequence
2960 // Use only the valid portion (which includes at least the first grapheme)
2961 let valid_bytes = &bytes[..e.valid_up_to()];
2962 match std::str::from_utf8(valid_bytes) {
2963 Ok(s) if !s.is_empty() => s,
2964 _ => return self.next_char_boundary(pos),
2965 }
2966 }
2967 };
2968
2969 let new_rel_pos = grapheme::next_grapheme_boundary(text, 0);
2970
2971 // If the first grapheme reaches the end of our chunk and there
2972 // is more buffer left beyond it, the grapheme may extend further.
2973 // Expand the window and retry.
2974 if new_rel_pos == text.len() && end < len {
2975 if lookahead >= len - pos {
2976 return len;
2977 }
2978 lookahead = lookahead.saturating_mul(2);
2979 continue;
2980 }
2981
2982 return pos + new_rel_pos;
2983 }
2984 }
2985
2986 /// Find the previous word boundary
2987 pub fn prev_word_boundary(&self, pos: usize) -> usize {
2988 if pos == 0 {
2989 return 0;
2990 }
2991
2992 // Get some text before pos
2993 let start = pos.saturating_sub(256).max(0);
2994 let Some(bytes) = self.get_text_range(start, pos - start) else {
2995 // Data unloaded, return pos as fallback
2996 return pos;
2997 };
2998 let text = String::from_utf8_lossy(&bytes);
2999
3000 let mut found_word_char = false;
3001 let chars: Vec<char> = text.chars().collect();
3002
3003 for i in (0..chars.len()).rev() {
3004 let ch = chars[i];
3005 let is_word_char = ch.is_alphanumeric() || ch == '_';
3006
3007 if found_word_char && !is_word_char {
3008 // We've transitioned from word to non-word
3009 // Calculate the byte position
3010 let byte_offset: usize = chars[0..=i].iter().map(|c| c.len_utf8()).sum();
3011 return start + byte_offset;
3012 }
3013
3014 if is_word_char {
3015 found_word_char = true;
3016 }
3017 }
3018
3019 0
3020 }
3021
3022 /// Find the next word boundary
3023 pub fn next_word_boundary(&self, pos: usize) -> usize {
3024 let len = self.len();
3025 if pos >= len {
3026 return len;
3027 }
3028
3029 // Get some text after pos
3030 let end = (pos + 256).min(len);
3031 let Some(bytes) = self.get_text_range(pos, end - pos) else {
3032 // Data unloaded, return pos as fallback
3033 return pos;
3034 };
3035 let text = String::from_utf8_lossy(&bytes);
3036
3037 let mut found_word_char = false;
3038 let mut byte_offset = 0;
3039
3040 for ch in text.chars() {
3041 let is_word_char = ch.is_alphanumeric() || ch == '_';
3042
3043 if found_word_char && !is_word_char {
3044 // We've transitioned from word to non-word
3045 return pos + byte_offset;
3046 }
3047
3048 if is_word_char {
3049 found_word_char = true;
3050 }
3051
3052 byte_offset += ch.len_utf8();
3053 }
3054
3055 len
3056 }
3057
    /// Create a line iterator starting at the given byte position
    ///
    /// This iterator lazily loads chunks as needed, never scanning the entire file.
    /// For large files with unloaded buffers, chunks are loaded on-demand (1MB at a time).
    ///
    /// # Arguments
    ///
    /// * `byte_pos` - Byte offset handed to `LineIterator::new` as the starting position
    /// * `estimated_line_length` - Forwarded unchanged to `LineIterator::new`
    ///   (presumably a sizing hint for reads — confirm in `primitives::line_iterator`)
    ///
    /// The returned iterator borrows this buffer mutably for as long as it lives.
    pub fn line_iterator(
        &mut self,
        byte_pos: usize,
        estimated_line_length: usize,
    ) -> LineIterator<'_> {
        LineIterator::new(self, byte_pos, estimated_line_length)
    }
3069
    /// Iterate over lines starting from a given byte offset, with line numbers
    ///
    /// This is a more efficient alternative to using line_iterator() + offset_to_position()
    /// because the starting line number is resolved once and then incremented for every
    /// newline encountered while scanning forward.
    ///
    /// Returns an iterator yielding [`LineData`] values (`byte_offset`, `content`,
    /// `has_newline`, `line_number`):
    /// - `line_number` is `Some(n)` when the buffer has line metadata
    /// - `line_number` is `None` when the buffer has no line metadata
    ///
    /// At most `max_lines` entries are collected, eagerly, when the iterator is built.
    ///
    /// # Errors
    /// Propagates any error raised while reading buffer ranges during construction.
    ///
    /// # Performance
    /// - O(1) per line for line number calculation (vs O(log n) per line with offset_to_position)
    pub fn iter_lines_from(
        &mut self,
        byte_pos: usize,
        max_lines: usize,
    ) -> Result<TextBufferLineIterator> {
        TextBufferLineIterator::new(self, byte_pos, max_lines)
    }
3090
3091 // Legacy API methods for backwards compatibility
3092
3093 /// Get the line number for a given byte offset
3094 ///
3095 /// Returns exact line number if metadata available, otherwise estimates based on bytes.
3096 ///
3097 /// # Behavior by File Size:
3098 /// - **Small files (< 1MB)**: Returns exact line number from piece tree's `line_starts` metadata
3099 /// - **Large files (≥ 1MB)**: Returns estimated line number using `byte_offset / estimated_line_length`
3100 ///
3101 /// Large files don't maintain line metadata for performance reasons. The estimation
3102 /// uses the configured `estimated_line_length` (default 80 bytes).
3103 pub fn get_line_number(&self, byte_offset: usize) -> usize {
3104 self.offset_to_position(byte_offset)
3105 .map(|pos| pos.line)
3106 .unwrap_or_else(|| {
3107 // Estimate line number based on configured average line length
3108 byte_offset / self.config.estimated_line_length
3109 })
3110 }
3111
    /// Get the configured estimated line length for approximate line number calculations.
    ///
    /// This is the divisor `get_line_number` uses when it has to estimate a line
    /// number from a byte offset.
    pub fn estimated_line_length(&self) -> usize {
        self.config.estimated_line_length
    }
3116
    /// Get the starting line number at a byte offset (used for viewport rendering)
    ///
    /// # Line Cache Architecture (Post-Refactoring):
    ///
    /// The concept of a separate "line cache" is **now obsolete**. After the refactoring,
    /// line tracking is integrated directly into the piece tree via:
    /// ```text
    /// BufferData::Loaded {
    ///     data: Vec<u8>,
    ///     line_starts: Option<Vec<usize>>  // None = large file mode (no line metadata)
    /// }
    /// ```
    /// (The snippet is an illustration, not compilable code, hence the `text` fence.)
    ///
    /// ## Why This Method Still Exists:
    /// The rendering code needs to know what line number to display in the margin at the
    /// top of the viewport. This method returns that line number, handling both small
    /// and large file modes transparently.
    ///
    /// ## Small vs Large File Modes:
    /// - **Small files**: `line_starts = Some(vec)` → returns exact line number from metadata
    /// - **Large files**: `line_starts = None` → returns estimated line number (byte_offset / estimated_line_length)
    ///
    /// ## Legacy Line Cache Methods:
    /// These methods are now no-ops and can be removed in a future cleanup:
    /// - `invalidate_line_cache_from()` - No-op (piece tree updates automatically)
    /// - `handle_line_cache_insertion()` - No-op (piece tree updates automatically)
    /// - `handle_line_cache_deletion()` - No-op (piece tree updates automatically)
    /// - `clear_line_cache()` - No-op (can't clear piece tree metadata)
    ///
    /// ## Bug Fix (2025-11):
    /// Previously this method always returned `0`, causing line numbers in the margin
    /// to always show 1, 2, 3... regardless of scroll position. Now it correctly returns
    /// the actual line number at `start_byte`.
    ///
    /// ## Parameters
    /// - `start_byte`: byte offset whose line number is returned
    /// - `_line_count`: ignored; kept only so existing callers keep compiling
    pub fn populate_line_cache(&mut self, start_byte: usize, _line_count: usize) -> usize {
        // Nothing is cached here; the call simply reports the line number at
        // start_byte (exact or estimated, see get_line_number) for rendering.
        self.get_line_number(start_byte)
    }
3155
    /// Get cached byte offset for line (compatibility method)
    ///
    /// Despite the name nothing is cached: the call is forwarded to
    /// `line_start_offset(line_number)` and its result is returned unchanged.
    pub fn get_cached_byte_offset_for_line(&self, line_number: usize) -> Option<usize> {
        self.line_start_offset(line_number)
    }
3160
    /// Invalidate line cache from offset (no-op in new implementation)
    ///
    /// Kept for API compatibility; `_byte_offset` is ignored and no state changes.
    pub fn invalidate_line_cache_from(&mut self, _byte_offset: usize) {
        // No-op: there is no separate line cache to invalidate
    }
3165
    /// Handle line cache insertion (no-op in new implementation)
    ///
    /// Kept for API compatibility; both arguments are ignored and no state changes.
    pub fn handle_line_cache_insertion(&mut self, _byte_offset: usize, _bytes_inserted: usize) {
        // No-op: there is no separate line cache to adjust after an insert
    }
3170
    /// Handle line cache deletion (no-op in new implementation)
    ///
    /// Kept for API compatibility; both arguments are ignored and no state changes.
    pub fn handle_line_cache_deletion(&mut self, _byte_offset: usize, _bytes_deleted: usize) {
        // No-op: there is no separate line cache to adjust after a delete
    }
3175
    /// Clear line cache (no-op in new implementation)
    ///
    /// Kept for API compatibility; calling it changes no state.
    pub fn clear_line_cache(&mut self) {
        // No-op: there is no separate line cache to clear
    }
3180
3181 // Test helper methods
3182
3183 /// Create a buffer from a string for testing
3184 #[cfg(test)]
3185 pub fn from_str_test(s: &str) -> Self {
3186 Self::from_bytes(
3187 s.as_bytes().to_vec(),
3188 std::sync::Arc::new(crate::model::filesystem::StdFileSystem),
3189 )
3190 }
3191
3192 /// Create a new empty buffer for testing
3193 #[cfg(test)]
3194 pub fn new_test() -> Self {
3195 Self::empty(std::sync::Arc::new(crate::model::filesystem::StdFileSystem))
3196 }
3197}
3198
/// Type alias for backwards compatibility
///
/// `Buffer` and `TextBuffer` name the same type; the alias keeps code that
/// still uses the older name compiling.
pub type Buffer = TextBuffer;
3201
3202// Re-export LineIterator from the line_iterator module
3203pub use crate::primitives::line_iterator::LineIterator;
3204
3205// ============================================================================
3206// Overlapping Chunks Iterator for Efficient Search
3207// ============================================================================
3208
/// Information about a chunk of data for pattern matching
///
/// One value is produced per step of [`OverlappingChunks`]. Each value owns a
/// copy of the chunk bytes, so it stays valid after the iterator advances.
#[derive(Debug)]
pub struct ChunkInfo {
    /// The buffer containing this chunk's data (includes overlap from previous chunk)
    pub buffer: Vec<u8>,

    /// Absolute position in the document where this buffer starts
    /// (i.e. the document offset of `buffer[0]`)
    pub absolute_pos: usize,

    /// Offset within buffer where "new" data starts (valid match zone)
    /// Matches starting before this offset were already checked in the previous chunk
    /// Always 0 for the first chunk.
    pub valid_start: usize,
}
3222
/// Iterator that yields overlapping chunks for pattern matching
///
/// This iterator implements the VSCode/Sublime approach: pull overlapping chunks
/// from the underlying piece tree and use standard search algorithms on them.
///
/// # Algorithm
///
/// ```text
/// Chunk 1: [------------ valid -----------]
/// Chunk 2:                      [overlap][---- valid ----]
/// Chunk 3:                                        [overlap][-- valid --]
///
/// Only matches starting in the "valid" zone are reported to avoid duplicates.
/// ```
///
/// # Example
///
/// ```ignore
/// let chunks = OverlappingChunks::new(&text_buffer, start, end, 4096, pattern.len()-1);
/// for chunk in chunks {
///     // Search only starting from chunk.valid_start
///     if let Some(pos) = search(&chunk.buffer[chunk.valid_start..]) {
///         let absolute_pos = chunk.absolute_pos + chunk.valid_start + pos;
///         return Some(absolute_pos);
///     }
/// }
/// ```
///
/// # Limitations
///
/// Bytes are only read from pieces whose backing buffer exposes its data; the
/// stream of chunks ends at the first piece whose data is not available.
pub struct OverlappingChunks<'a> {
    // Pieces overlapping the requested document range, in document order
    piece_iter: PieceRangeIter,
    // Backing buffers, indexed by each piece's buffer id
    buffers: &'a [StringBuffer],

    // Reusable chunk buffer that we fill from pieces
    buffer: Vec<u8>,
    // Document offset of buffer[0]
    buffer_absolute_pos: usize,

    // Current state
    // Document offset of the next byte to read
    current_pos: usize,
    // End of the requested range (exclusive)
    end_pos: usize,

    // Configuration
    // Number of new bytes to add per chunk
    chunk_size: usize,
    // Number of trailing bytes carried over into the next chunk
    overlap: usize,

    // Track first chunk special case
    first_chunk: bool,

    // Cached piece data for incremental reading
    // Copy of the part of the current piece that lies inside the range
    current_piece_data: Option<Vec<u8>>,
    // Index of the next unread byte in current_piece_data
    current_piece_offset: usize,
}
3273
3274impl<'a> OverlappingChunks<'a> {
3275 /// Create a new overlapping chunks iterator
3276 ///
3277 /// # Arguments
3278 ///
3279 /// * `text_buffer` - The text buffer to iterate over
3280 /// * `start` - Start position in the document
3281 /// * `end` - End position in the document (exclusive)
3282 /// * `chunk_size` - Target size for each chunk (excluding overlap)
3283 /// * `overlap` - Number of bytes to overlap between chunks
3284 ///
3285 /// # Recommendations
3286 ///
3287 /// * For literal string search: `chunk_size=65536, overlap=pattern.len()-1`
3288 /// * For regex search: `chunk_size=1048576, overlap=4096`
3289 pub fn new(
3290 text_buffer: &'a TextBuffer,
3291 start: usize,
3292 end: usize,
3293 chunk_size: usize,
3294 overlap: usize,
3295 ) -> Self {
3296 let piece_iter = text_buffer.piece_tree.iter_pieces_in_range(start, end);
3297
3298 Self {
3299 piece_iter,
3300 buffers: &text_buffer.buffers,
3301 buffer: Vec::with_capacity(chunk_size + overlap),
3302 buffer_absolute_pos: start,
3303 current_pos: start,
3304 end_pos: end,
3305 chunk_size,
3306 overlap,
3307 first_chunk: true,
3308 current_piece_data: None,
3309 current_piece_offset: 0,
3310 }
3311 }
3312
3313 /// Read one byte from the piece iterator
3314 fn read_byte(&mut self) -> Option<u8> {
3315 loop {
3316 // If we have cached piece data, read from it
3317 if let Some(ref data) = self.current_piece_data {
3318 if self.current_piece_offset < data.len() {
3319 let byte = data[self.current_piece_offset];
3320 self.current_piece_offset += 1;
3321 self.current_pos += 1;
3322 return Some(byte);
3323 } else {
3324 // Exhausted current piece, move to next
3325 self.current_piece_data = None;
3326 self.current_piece_offset = 0;
3327 }
3328 }
3329
3330 // Get next piece
3331 if let Some(piece_view) = self.piece_iter.next() {
3332 let buffer_id = piece_view.location.buffer_id();
3333 if let Some(buffer) = self.buffers.get(buffer_id) {
3334 // Extract the relevant slice from this piece
3335 let piece_start_in_doc = piece_view.doc_offset;
3336 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
3337
3338 // Clip to our search range
3339 let read_start = self.current_pos.max(piece_start_in_doc);
3340 let read_end = self.end_pos.min(piece_end_in_doc);
3341
3342 if read_end > read_start {
3343 let offset_in_piece = read_start - piece_start_in_doc;
3344 let bytes_to_read = read_end - read_start;
3345
3346 let buffer_start = piece_view.buffer_offset + offset_in_piece;
3347 let buffer_end = buffer_start + bytes_to_read;
3348
3349 if let Some(data) = buffer.get_data() {
3350 if buffer_end <= data.len() {
3351 // Cache this piece's data
3352 self.current_piece_data =
3353 Some(data[buffer_start..buffer_end].to_vec());
3354 self.current_piece_offset = 0;
3355 continue;
3356 }
3357 }
3358 }
3359 }
3360 }
3361
3362 // No more data
3363 return None;
3364 }
3365 }
3366
3367 /// Fill the buffer with the next chunk of data
3368 fn fill_next_chunk(&mut self) -> bool {
3369 if self.first_chunk {
3370 // First chunk: fill up to chunk_size
3371 self.first_chunk = false;
3372 while self.buffer.len() < self.chunk_size && self.current_pos < self.end_pos {
3373 if let Some(byte) = self.read_byte() {
3374 self.buffer.push(byte);
3375 } else {
3376 break;
3377 }
3378 }
3379 !self.buffer.is_empty()
3380 } else {
3381 // Subsequent chunks: keep overlap, fill chunk_size NEW bytes
3382 if self.current_pos >= self.end_pos {
3383 return false;
3384 }
3385
3386 // Keep overlap bytes at the end
3387 if self.buffer.len() > self.overlap {
3388 let drain_amount = self.buffer.len() - self.overlap;
3389 self.buffer.drain(0..drain_amount);
3390 self.buffer_absolute_pos += drain_amount;
3391 }
3392
3393 // Fill chunk_size NEW bytes (in addition to overlap)
3394 let before_len = self.buffer.len();
3395 let target_len = self.overlap + self.chunk_size;
3396 while self.buffer.len() < target_len && self.current_pos < self.end_pos {
3397 if let Some(byte) = self.read_byte() {
3398 self.buffer.push(byte);
3399 } else {
3400 break;
3401 }
3402 }
3403
3404 // Return true if we added new data
3405 self.buffer.len() > before_len
3406 }
3407 }
3408}
3409
3410impl<'a> Iterator for OverlappingChunks<'a> {
3411 type Item = ChunkInfo;
3412
3413 fn next(&mut self) -> Option<Self::Item> {
3414 // Track if this is the first chunk before filling
3415 let is_first = self.buffer_absolute_pos == self.current_pos;
3416
3417 if !self.fill_next_chunk() {
3418 return None;
3419 }
3420
3421 // First chunk: all data is valid (no overlap from previous)
3422 // Subsequent chunks: overlap bytes are not valid (already checked)
3423 let valid_start = if is_first {
3424 0
3425 } else {
3426 self.overlap.min(self.buffer.len())
3427 };
3428
3429 Some(ChunkInfo {
3430 buffer: self.buffer.clone(),
3431 absolute_pos: self.buffer_absolute_pos,
3432 valid_start,
3433 })
3434 }
3435}
3436
3437#[cfg(test)]
3438mod tests;
3439
3440#[cfg(test)]
3441mod property_tests;
3442
/// Line data with optional line number
///
/// Produced by [`TextBufferLineIterator`]. `content` is decoded lossily, so
/// bytes that are not valid UTF-8 appear as U+FFFD replacement characters.
#[derive(Debug, Clone)]
pub struct LineData {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Line content (without trailing newline)
    /// Only a trailing `\n` is removed; a preceding `\r` is kept.
    pub content: String,
    /// Whether this line ends with a newline
    pub has_newline: bool,
    /// Line number (None for large files without line metadata)
    pub line_number: Option<usize>,
}
3455
/// Iterator over lines in a TextBuffer that efficiently tracks line numbers
/// using piece tree metadata (single source of truth)
///
/// All lines are collected when the iterator is constructed; iterating only
/// hands out clones of the collected entries.
pub struct TextBufferLineIterator {
    /// Collected lines (we collect all at once since we need mutable access to load chunks)
    lines: Vec<LineData>,
    /// Current index in the lines vector
    current_index: usize,
    /// Whether there are more lines after these
    /// (true when collection stopped before the end of the buffer)
    pub has_more: bool,
}
3466
3467impl TextBufferLineIterator {
3468 pub(crate) fn new(buffer: &mut TextBuffer, byte_pos: usize, max_lines: usize) -> Result<Self> {
3469 let buffer_len = buffer.len();
3470 if byte_pos >= buffer_len {
3471 return Ok(Self {
3472 lines: Vec::new(),
3473 current_index: 0,
3474 has_more: false,
3475 });
3476 }
3477
3478 // Check if buffer has line metadata (None for large files > 1MB)
3479 let has_line_metadata = buffer.line_count().is_some();
3480
3481 // Determine starting line number by querying piece tree once
3482 // (only if we have line metadata)
3483 let mut current_line = if has_line_metadata {
3484 buffer.offset_to_position(byte_pos).map(|pos| pos.line)
3485 } else {
3486 None
3487 };
3488
3489 let mut lines = Vec::with_capacity(max_lines);
3490 let mut current_offset = byte_pos;
3491 let estimated_line_length = 80; // Use default estimate
3492
3493 // Collect lines by scanning forward
3494 for _ in 0..max_lines {
3495 if current_offset >= buffer_len {
3496 break;
3497 }
3498
3499 let line_start = current_offset;
3500 let line_number = current_line;
3501
3502 // Estimate how many bytes to load for this line
3503 let estimated_max_line_length = estimated_line_length * 3;
3504 let bytes_to_scan = estimated_max_line_length.min(buffer_len - current_offset);
3505
3506 // Load chunk (this handles lazy loading)
3507 let chunk = buffer.get_text_range_mut(current_offset, bytes_to_scan)?;
3508
3509 // Scan for newline
3510 let mut line_len = 0;
3511 let mut found_newline = false;
3512 for &byte in chunk.iter() {
3513 line_len += 1;
3514 if byte == b'\n' {
3515 found_newline = true;
3516 break;
3517 }
3518 }
3519
3520 // Handle long lines (rare case)
3521 if !found_newline && current_offset + line_len < buffer_len {
3522 // Line is longer than expected, load more data
3523 let remaining = buffer_len - current_offset - line_len;
3524 let additional_bytes = estimated_max_line_length.min(remaining);
3525 let more_chunk =
3526 buffer.get_text_range_mut(current_offset + line_len, additional_bytes)?;
3527
3528 let mut extended_chunk = chunk;
3529 extended_chunk.extend_from_slice(&more_chunk);
3530
3531 for &byte in more_chunk.iter() {
3532 line_len += 1;
3533 if byte == b'\n' {
3534 found_newline = true;
3535 break;
3536 }
3537 }
3538
3539 let line_string = String::from_utf8_lossy(&extended_chunk[..line_len]).into_owned();
3540 let has_newline = line_string.ends_with('\n');
3541 let content = if has_newline {
3542 line_string[..line_string.len() - 1].to_string()
3543 } else {
3544 line_string
3545 };
3546
3547 lines.push(LineData {
3548 byte_offset: line_start,
3549 content,
3550 has_newline,
3551 line_number,
3552 });
3553
3554 current_offset += line_len;
3555 if has_line_metadata && found_newline {
3556 current_line = current_line.map(|n| n + 1);
3557 }
3558 continue;
3559 }
3560
3561 // Normal case
3562 let line_string = String::from_utf8_lossy(&chunk[..line_len]).into_owned();
3563 let has_newline = line_string.ends_with('\n');
3564 let content = if has_newline {
3565 line_string[..line_string.len() - 1].to_string()
3566 } else {
3567 line_string
3568 };
3569
3570 lines.push(LineData {
3571 byte_offset: line_start,
3572 content,
3573 has_newline,
3574 line_number,
3575 });
3576
3577 current_offset += line_len;
3578 // Increment line number if we have metadata and found a newline
3579 if has_line_metadata && found_newline {
3580 current_line = current_line.map(|n| n + 1);
3581 }
3582 }
3583
3584 // Check if there are more lines
3585 let has_more = current_offset < buffer_len;
3586
3587 Ok(Self {
3588 lines,
3589 current_index: 0,
3590 has_more,
3591 })
3592 }
3593}
3594
3595impl Iterator for TextBufferLineIterator {
3596 type Item = LineData;
3597
3598 fn next(&mut self) -> Option<Self::Item> {
3599 if self.current_index < self.lines.len() {
3600 let line = self.lines[self.current_index].clone();
3601 self.current_index += 1;
3602 Some(line)
3603 } else {
3604 None
3605 }
3606 }
3607}