fresh/model/buffer/mod.rs
//! Text buffer that uses a `PieceTree` with integrated line tracking.
//! In this architecture the tree is the single source of truth for text and line information.
3use crate::model::encoding;
4use crate::model::filesystem::{FileSearchOptions, FileSystem};
5use crate::model::piece_tree::{
6 BufferData, BufferLocation, Cursor, PieceInfo, PieceRangeIter, PieceTree, PieceView, Position,
7 StringBuffer, TreeStats,
8};
9use crate::model::piece_tree_diff::PieceTreeDiff;
10use crate::primitives::grapheme;
11use anyhow::{Context, Result};
12use regex::bytes::Regex;
13use std::io;
14
15use std::ops::Range;
16use std::path::{Path, PathBuf};
17use std::sync::Arc;
18
19// Re-export Encoding for backward compatibility
20pub use encoding::Encoding;
21
22pub mod file_kind;
23pub mod format;
24pub mod persistence;
25pub mod save;
26pub mod search;
27pub use file_kind::BufferFileKind;
28pub use format::{BufferFormat, LineEnding};
29pub use persistence::Persistence;
30pub use save::SudoSaveRequired;
31#[cfg(test)]
32pub(crate) use save::{RecipeAction, WriteRecipe};
33#[cfg(test)]
34use search::search_boundary_overlap;
35use search::SearchRegion;
36pub use search::{ChunkedSearchState, HybridSearchPlan};
37
38/// Error returned when a large file has a non-resynchronizable encoding
39/// and requires user confirmation before loading the entire file into memory.
40///
41/// Non-resynchronizable encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot
42/// determine character boundaries when jumping into the middle of a file.
43/// This means the entire file must be loaded and decoded sequentially.
44#[derive(Debug, Clone, PartialEq)]
45pub struct LargeFileEncodingConfirmation {
46 /// Path to the file
47 pub path: PathBuf,
48 /// Size of the file in bytes
49 pub file_size: usize,
50 /// The detected encoding that requires full loading
51 pub encoding: Encoding,
52}
53
54impl std::fmt::Display for LargeFileEncodingConfirmation {
55 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56 let size_mb = self.file_size as f64 / (1024.0 * 1024.0);
57 write!(
58 f,
59 "{} ({:.0} MB) requires full load. (l)oad, (e)ncoding, (C)ancel? ",
60 self.encoding.display_name(),
61 size_mb
62 )
63 }
64}
65
// Marker impl: lets the confirmation request travel as an error value
// (it is raised through `anyhow::bail!` when a large file needs confirmation).
impl std::error::Error for LargeFileEncodingConfirmation {}
67
/// One unit of work for incremental line-feed scanning; there is one chunk per leaf.
#[derive(Clone, Debug)]
pub struct LineScanChunk {
    /// Position of the leaf within the piece tree's leaf array.
    pub leaf_index: usize,
    /// Size of the leaf in bytes.
    pub byte_len: usize,
    /// `true` when the leaf's line_feed_cnt was already known, so no I/O is needed.
    pub already_known: bool,
}
78
79// Re-export SearchMatch from filesystem — same type is used by both
80// FileSystem::search_file (project grep on disk) and the piece-tree
81// search below (in-editor Ctrl+F and dirty buffers).
82pub use crate::model::filesystem::SearchMatch;
83
// Tunables for large-file handling.

/// Default size at which a file is considered "large" (100 MB = 100 * 2^20 bytes).
pub const DEFAULT_LARGE_FILE_THRESHOLD: usize = 100 << 20;

/// Size of a chunk loaded during lazy loading (1 MB = 2^20 bytes).
pub const LOAD_CHUNK_SIZE: usize = 1 << 20;

/// Alignment of lazily loaded chunks (64 KB = 64 * 2^10 bytes).
pub const CHUNK_ALIGNMENT: usize = 64 << 10;
93
/// Settings handed to `TextBuffer` constructors.
#[derive(Clone, Debug)]
pub struct BufferConfig {
    /// Estimated average length of a line, in bytes. Drives the approximate
    /// line numbers shown for large files and the byte-offset estimate used
    /// by goto-line.
    pub estimated_line_length: usize,
}

impl Default for BufferConfig {
    /// Default configuration: lines are assumed to average 80 bytes.
    fn default() -> Self {
        const DEFAULT_LINE_LENGTH: usize = 80;
        BufferConfig {
            estimated_line_length: DEFAULT_LINE_LENGTH,
        }
    }
}
109
// The line-ending format type (`LineEnding`) lives in `format.rs` and is
// re-exported from this module. This is a plain comment on purpose: as a `///`
// doc comment it would become the first line of `LineNumber`'s documentation.

/// Represents a line number (simplified for the new implementation).
///
/// Legacy enum kept for backwards compatibility; the `Relative` variant is
/// deprecated and carries the same meaning as `Absolute`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineNumber {
    /// Absolute line number - this is the actual line number in the file.
    Absolute(usize),
    /// Relative line number (deprecated - now same as `Absolute`).
    Relative {
        line: usize,
        from_cached_line: usize,
    },
}

impl LineNumber {
    /// Returns the stored line number, whichever variant holds it.
    pub fn value(&self) -> usize {
        match *self {
            Self::Absolute(line) | Self::Relative { line, .. } => line,
        }
    }

    /// Returns `true` for the `Absolute` variant.
    pub fn is_absolute(&self) -> bool {
        matches!(self, Self::Absolute(_))
    }

    /// Returns `true` for the `Relative` variant.
    pub fn is_relative(&self) -> bool {
        matches!(self, Self::Relative { .. })
    }

    /// Formats the line number for display.
    ///
    /// The displayed number is the stored value plus one; `Relative` values
    /// are additionally prefixed with `~`.
    pub fn format(&self) -> String {
        match self {
            Self::Absolute(line) => (line + 1).to_string(),
            Self::Relative { line, .. } => format!("~{}", line + 1),
        }
    }
}
151
/// A text buffer that manages document content using a piece table
/// with integrated line tracking.
///
/// The piece tree describes the document as a sequence of pieces, each of
/// which refers to a range inside one of the `buffers`.
pub struct TextBuffer {
    /// The piece tree for efficient text manipulation with integrated line tracking
    piece_tree: PieceTree,

    /// List of string buffers containing chunks of text data.
    /// Index 0 is typically the original/stored buffer.
    /// Additional buffers are added for modifications.
    buffers: Vec<StringBuffer>,

    /// Next buffer ID to assign.
    next_buffer_id: usize,

    /// Filesystem handle, optional file path, dirty/recovery flags,
    /// saved-root snapshot, and saved-file size — see
    /// `persistence.rs`.
    persistence: Persistence,

    /// File-kind flags (large_file, line_feeds_scanned, is_binary) —
    /// see `file_kind.rs`.
    file_kind: BufferFileKind,

    /// Encoding + line-ending state — see `format.rs`.
    format: BufferFormat,

    /// Version counter for change tracking; incremented with wrapping
    /// arithmetic by `bump_version`.
    version: u64,

    /// Buffer configuration (estimated line length, etc.)
    config: BufferConfig,
}
184
185/// Snapshot of a TextBuffer's piece tree and associated string buffers.
186///
187/// Used by BulkEdit undo/redo to capture the complete buffer state.
188/// Without this, consolidate_after_save() would destroy the string buffers
189/// that a BulkEdit's piece tree snapshot references, causing corruption on undo.
190#[derive(Debug, Clone)]
191pub struct BufferSnapshot {
192 pub piece_tree: PieceTree,
193 pub buffers: Vec<StringBuffer>,
194 pub next_buffer_id: usize,
195}
196
197impl TextBuffer {
198 /// Create a new text buffer with the given filesystem implementation.
199 /// Note: large_file_threshold is ignored in the new implementation
200 pub fn new(_large_file_threshold: usize, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
201 let piece_tree = PieceTree::empty();
202 let saved_root = piece_tree.root();
203 let line_ending = LineEnding::default();
204 let encoding = Encoding::default();
205 TextBuffer {
206 piece_tree,
207 buffers: vec![StringBuffer::new(0, Vec::new())],
208 next_buffer_id: 1,
209 persistence: Persistence::new(fs, None, saved_root, None),
210 file_kind: BufferFileKind::new(false, false),
211 format: BufferFormat::new(line_ending, encoding),
212 version: 0,
213 config: BufferConfig::default(),
214 }
215 }
216
217 /// Create an empty buffer associated with a file path.
218 /// Used for files that don't exist yet — the path is set so saving will create the file.
219 pub fn new_with_path(
220 large_file_threshold: usize,
221 fs: Arc<dyn FileSystem + Send + Sync>,
222 path: PathBuf,
223 ) -> Self {
224 let mut buffer = Self::new(large_file_threshold, fs);
225 buffer.persistence.set_file_path(path);
226 buffer
227 }
228
    /// Current buffer version.
    ///
    /// The counter starts at 0 for a freshly constructed buffer and is advanced
    /// by `bump_version` using wrapping arithmetic, so it wraps on overflow.
    pub fn version(&self) -> u64 {
        self.version
    }
233
    /// Get a reference to the filesystem implementation used by this buffer.
    ///
    /// The handle is owned by the buffer's `Persistence` state.
    pub fn filesystem(&self) -> &Arc<dyn FileSystem + Send + Sync> {
        self.persistence.fs()
    }
238
    /// Set the filesystem implementation for this buffer.
    ///
    /// Only the handle stored in the `Persistence` state is replaced here.
    pub fn set_filesystem(&mut self, fs: Arc<dyn FileSystem + Send + Sync>) {
        self.persistence.set_fs(fs);
    }
243
244 #[inline]
245 fn bump_version(&mut self) {
246 self.version = self.version.wrapping_add(1);
247 }
248
    /// Records a content change: marks the persistence state dirty and then
    /// bumps the version counter.
    #[inline]
    fn mark_content_modified(&mut self) {
        self.persistence.mark_dirty();
        self.bump_version();
    }
254
255 /// Create a text buffer from raw bytes WITHOUT encoding conversion.
256 /// Used for binary files where we want to preserve the exact bytes.
257 fn from_bytes_raw(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
258 let bytes = content.len();
259
260 // For binary files, detect line ending but don't convert encoding
261 let line_ending = format::detect_line_ending(&content);
262
263 // Create initial StringBuffer with ID 0
264 let buffer = StringBuffer::new(0, content);
265 let line_feed_cnt = buffer.line_feed_count();
266
267 let piece_tree = if bytes > 0 {
268 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
269 } else {
270 PieceTree::empty()
271 };
272
273 let saved_root = piece_tree.root();
274
275 TextBuffer {
276 piece_tree,
277 buffers: vec![buffer],
278 next_buffer_id: 1,
279 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
280 file_kind: BufferFileKind::new(false, true),
281 format: BufferFormat::new(line_ending, Encoding::Utf8),
282 version: 0,
283 config: BufferConfig::default(),
284 }
285 }
286
287 /// Create a text buffer from initial content with the given filesystem.
288 pub fn from_bytes(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
289 // Auto-detect encoding and convert to UTF-8 if needed
290 let (encoding, utf8_content) = format::detect_and_convert_encoding(&content);
291
292 let bytes = utf8_content.len();
293
294 // Auto-detect line ending format from content
295 let line_ending = format::detect_line_ending(&utf8_content);
296
297 // Create initial StringBuffer with ID 0
298 let buffer = StringBuffer::new(0, utf8_content);
299 let line_feed_cnt = buffer.line_feed_count();
300
301 let piece_tree = if bytes > 0 {
302 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
303 } else {
304 PieceTree::empty()
305 };
306
307 let saved_root = piece_tree.root();
308
309 TextBuffer {
310 piece_tree,
311 buffers: vec![buffer],
312 next_buffer_id: 1,
313 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
314 file_kind: BufferFileKind::new(false, false),
315 format: BufferFormat::new(line_ending, encoding),
316 version: 0,
317 config: BufferConfig::default(),
318 }
319 }
320
321 /// Create a text buffer from bytes with a specific encoding (no auto-detection).
322 pub fn from_bytes_with_encoding(
323 content: Vec<u8>,
324 encoding: Encoding,
325 fs: Arc<dyn FileSystem + Send + Sync>,
326 ) -> Self {
327 // Convert from specified encoding to UTF-8
328 let utf8_content = encoding::convert_to_utf8(&content, encoding);
329
330 let bytes = utf8_content.len();
331
332 // Auto-detect line ending format from content
333 let line_ending = format::detect_line_ending(&utf8_content);
334
335 // Create initial StringBuffer with ID 0
336 let buffer = StringBuffer::new(0, utf8_content);
337 let line_feed_cnt = buffer.line_feed_count();
338
339 let piece_tree = if bytes > 0 {
340 PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
341 } else {
342 PieceTree::empty()
343 };
344
345 let saved_root = piece_tree.root();
346
347 TextBuffer {
348 piece_tree,
349 buffers: vec![buffer],
350 next_buffer_id: 1,
351 persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
352 file_kind: BufferFileKind::new(false, false),
353 format: BufferFormat::new(line_ending, encoding),
354 version: 0,
355 config: BufferConfig::default(),
356 }
357 }
358
359 /// Create a text buffer from a string with the given filesystem.
360 pub fn from_str(
361 s: &str,
362 _large_file_threshold: usize,
363 fs: Arc<dyn FileSystem + Send + Sync>,
364 ) -> Self {
365 Self::from_bytes(s.as_bytes().to_vec(), fs)
366 }
367
368 /// Create an empty text buffer with the given filesystem.
369 pub fn empty(fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
370 let piece_tree = PieceTree::empty();
371 let saved_root = piece_tree.root();
372 let line_ending = LineEnding::default();
373 let encoding = Encoding::default();
374 TextBuffer {
375 piece_tree,
376 buffers: vec![StringBuffer::new(0, Vec::new())],
377 next_buffer_id: 1,
378 persistence: Persistence::new(fs, None, saved_root, None),
379 file_kind: BufferFileKind::new(false, false),
380 format: BufferFormat::new(line_ending, encoding),
381 version: 0,
382 config: BufferConfig::default(),
383 }
384 }
385
386 /// Load a text buffer from a file using the given filesystem.
387 pub fn load_from_file<P: AsRef<Path>>(
388 path: P,
389 large_file_threshold: usize,
390 fs: Arc<dyn FileSystem + Send + Sync>,
391 ) -> anyhow::Result<Self> {
392 let path = path.as_ref();
393
394 // Get file size to determine loading strategy
395 let metadata = fs.metadata(path)?;
396 let file_size = metadata.size as usize;
397
398 // Use threshold parameter or default
399 let threshold = if large_file_threshold > 0 {
400 large_file_threshold
401 } else {
402 DEFAULT_LARGE_FILE_THRESHOLD
403 };
404
405 // Choose loading strategy based on file size
406 if file_size >= threshold {
407 Self::load_large_file(path, file_size, fs)
408 } else {
409 Self::load_small_file(path, fs)
410 }
411 }
412
413 /// Load a text buffer from a file with a specific encoding (no auto-detection).
414 pub fn load_from_file_with_encoding<P: AsRef<Path>>(
415 path: P,
416 encoding: Encoding,
417 fs: Arc<dyn FileSystem + Send + Sync>,
418 config: BufferConfig,
419 ) -> anyhow::Result<Self> {
420 let path = path.as_ref();
421 let contents = fs.read_file(path)?;
422
423 let mut buffer = Self::from_bytes_with_encoding(contents, encoding, fs);
424 buffer.persistence.set_file_path(path.to_path_buf());
425 buffer.persistence.clear_modified();
426 buffer.config = config;
427 Ok(buffer)
428 }
429
430 /// Load a small file with full eager loading and line indexing
431 fn load_small_file(path: &Path, fs: Arc<dyn FileSystem + Send + Sync>) -> anyhow::Result<Self> {
432 let contents = fs.read_file(path)?;
433
434 // Use unified encoding/binary detection
435 let (encoding, is_binary) = format::detect_encoding_or_binary(&contents, false);
436
437 // For binary files, skip encoding conversion to preserve raw bytes
438 let mut buffer = if is_binary {
439 Self::from_bytes_raw(contents, fs)
440 } else {
441 // from_bytes handles encoding detection/conversion and line ending detection
442 Self::from_bytes(contents, fs)
443 };
444 buffer.persistence.set_file_path(path.to_path_buf());
445 buffer.persistence.clear_modified();
446 buffer.file_kind.set_large_file(false);
447 buffer.file_kind.set_binary(is_binary);
448 // For binary files, ensure encoding matches detection
449 if is_binary {
450 buffer.format.set_default_encoding(encoding);
451 }
452 // Note: line_ending and encoding are already set by from_bytes/from_bytes_raw
453 Ok(buffer)
454 }
455
456 /// Check if loading a large file requires user confirmation due to encoding.
457 ///
458 /// Some encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot be "resynchronized" -
459 /// meaning you cannot determine character boundaries when jumping into the middle
460 /// of a file. These encodings require loading the entire file into memory.
461 ///
462 /// Returns `Some(confirmation)` if user confirmation is needed, `None` if the file
463 /// can be loaded with lazy/streaming loading.
464 pub fn check_large_file_encoding(
465 path: impl AsRef<Path>,
466 fs: Arc<dyn FileSystem + Send + Sync>,
467 ) -> anyhow::Result<Option<LargeFileEncodingConfirmation>> {
468 let path = path.as_ref();
469 let metadata = fs.metadata(path)?;
470 let file_size = metadata.size as usize;
471
472 // Only check for large files
473 if file_size < DEFAULT_LARGE_FILE_THRESHOLD {
474 return Ok(None);
475 }
476
477 // Read a sample to detect encoding
478 let sample_size = file_size.min(8 * 1024);
479 let sample = fs.read_range(path, 0, sample_size)?;
480 let (encoding, is_binary) =
481 format::detect_encoding_or_binary(&sample, file_size > sample_size);
482
483 // Binary files don't need confirmation (loaded as-is)
484 if is_binary {
485 return Ok(None);
486 }
487
488 // Check if the encoding requires full file loading
489 if encoding.requires_full_file_load() {
490 return Ok(Some(LargeFileEncodingConfirmation {
491 path: path.to_path_buf(),
492 file_size,
493 encoding,
494 }));
495 }
496
497 Ok(None)
498 }
499
500 /// Load a large file with unloaded buffer (no line indexing, lazy loading)
501 ///
502 /// If `force_full_load` is true, loads the entire file regardless of encoding.
503 /// This should be set to true after user confirms loading a non-resynchronizable encoding.
504 fn load_large_file(
505 path: &Path,
506 file_size: usize,
507 fs: Arc<dyn FileSystem + Send + Sync>,
508 ) -> anyhow::Result<Self> {
509 Self::load_large_file_internal(path, file_size, fs, false)
510 }
511
512 /// Load a large file, optionally forcing full load for non-resynchronizable encodings.
513 ///
514 /// Called with `force_full_load=true` after user confirms the warning about
515 /// non-resynchronizable encodings requiring full file loading.
516 pub fn load_large_file_confirmed(
517 path: impl AsRef<Path>,
518 fs: Arc<dyn FileSystem + Send + Sync>,
519 ) -> anyhow::Result<Self> {
520 let path = path.as_ref();
521 let metadata = fs.metadata(path)?;
522 let file_size = metadata.size as usize;
523 Self::load_large_file_internal(path, file_size, fs, true)
524 }
525
526 /// Internal implementation for loading large files.
527 fn load_large_file_internal(
528 path: &Path,
529 file_size: usize,
530 fs: Arc<dyn FileSystem + Send + Sync>,
531 force_full_load: bool,
532 ) -> anyhow::Result<Self> {
533 use crate::model::piece_tree::{BufferData, BufferLocation};
534
535 // Read a sample of the file to detect encoding and whether it's binary
536 // We read the first 8KB for detection
537 let sample_size = file_size.min(8 * 1024);
538 let sample = fs.read_range(path, 0, sample_size)?;
539
540 // Use unified encoding/binary detection
541 let (encoding, is_binary) =
542 format::detect_encoding_or_binary(&sample, file_size > sample_size);
543
544 // Binary files skip encoding conversion to preserve raw bytes
545 if is_binary {
546 tracing::info!("Large binary file detected, loading without encoding conversion");
547 let contents = fs.read_file(path)?;
548 let mut buffer = Self::from_bytes_raw(contents, fs);
549 buffer.persistence.set_file_path(path.to_path_buf());
550 buffer.persistence.clear_modified();
551 buffer.file_kind.set_large_file(true);
552 buffer.format.set_default_encoding(encoding);
553 return Ok(buffer);
554 }
555
556 // Check if encoding requires full file loading
557 let requires_full_load = encoding.requires_full_file_load();
558
559 // For non-resynchronizable encodings, require confirmation unless forced
560 if requires_full_load && !force_full_load {
561 anyhow::bail!(LargeFileEncodingConfirmation {
562 path: path.to_path_buf(),
563 file_size,
564 encoding,
565 });
566 }
567
568 // For encodings that require full load (non-resynchronizable or non-UTF-8),
569 // load the entire file and convert
570 if !matches!(encoding, Encoding::Utf8 | Encoding::Ascii) {
571 tracing::info!(
572 "Large file with non-UTF-8 encoding ({:?}), loading fully for conversion",
573 encoding
574 );
575 let contents = fs.read_file(path)?;
576 let mut buffer = Self::from_bytes(contents, fs);
577 buffer.persistence.set_file_path(path.to_path_buf());
578 buffer.persistence.clear_modified();
579 buffer.file_kind.set_large_file(true); // Still mark as large file for UI purposes
580 buffer.file_kind.set_binary(is_binary);
581 return Ok(buffer);
582 }
583
584 // UTF-8/ASCII files can use lazy loading
585 let line_ending = format::detect_line_ending(&sample);
586
587 // Create an unloaded buffer that references the entire file
588 let buffer = StringBuffer {
589 id: 0,
590 data: BufferData::Unloaded {
591 file_path: path.to_path_buf(),
592 file_offset: 0,
593 bytes: file_size,
594 },
595 stored_file_offset: None,
596 };
597
598 // Create piece tree with a single piece covering the whole file
599 // No line feed count (None) since we're not computing line indexing
600 let piece_tree = if file_size > 0 {
601 PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
602 } else {
603 PieceTree::empty()
604 };
605 let saved_root = piece_tree.root();
606
607 tracing::debug!(
608 "Buffer::load_from_file: loaded {} bytes, saved_file_size={}",
609 file_size,
610 file_size
611 );
612
613 Ok(TextBuffer {
614 piece_tree,
615 buffers: vec![buffer],
616 next_buffer_id: 1,
617 persistence: Persistence::new(
618 fs,
619 Some(path.to_path_buf()),
620 saved_root,
621 Some(file_size),
622 ),
623 file_kind: BufferFileKind::new(true, is_binary),
624 format: BufferFormat::new(line_ending, encoding),
625 version: 0,
626 config: BufferConfig::default(),
627 })
628 }
629
630 /// Save the buffer to its associated file
631 pub fn save(&mut self) -> anyhow::Result<()> {
632 if let Some(path) = self.persistence.file_path_owned() {
633 self.save_to_file(path)
634 } else {
635 anyhow::bail!(io::Error::new(
636 io::ErrorKind::NotFound,
637 "No file path associated with buffer",
638 ))
639 }
640 }
641
642 /// Build a write recipe from the piece tree for saving.
643 ///
644 /// Delegates to `save::build_write_recipe`.
645 #[cfg(test)]
646 pub(crate) fn build_write_recipe(&self) -> io::Result<WriteRecipe> {
647 save::build_write_recipe(
648 &self.piece_tree,
649 &self.buffers,
650 &self.format,
651 &self.file_kind,
652 &self.persistence,
653 )
654 }
655
    /// Save the buffer to a specific file
    ///
    /// Uses the write recipe approach for both local and remote filesystems:
    /// - Copy ops reference unchanged regions in the source file
    /// - Insert ops contain new/modified data
    ///
    /// For remote filesystems, the recipe is sent to the agent which reconstructs
    /// the file server-side, avoiding transfer of unchanged content.
    ///
    /// For local filesystems with ownership concerns (file owned by another user),
    /// uses in-place writing to preserve ownership. Otherwise uses atomic writes.
    ///
    /// If the line ending format has been changed (via set_line_ending), all content
    /// will be converted to the new format during save.
    ///
    /// # Errors
    /// Propagates recipe-building and write failures. On a local filesystem, a
    /// `PermissionDenied` write error is turned into the error built by
    /// `save::make_sudo_error`, after the content has been written to a
    /// temporary file.
    pub fn save_to_file<P: AsRef<Path>>(&mut self, path: P) -> anyhow::Result<()> {
        let dest_path = path.as_ref();
        let total = self.total_bytes();

        // Handle empty files: nothing to build a recipe from, write zero bytes.
        // NOTE(review): this path has no PermissionDenied/sudo fallback, unlike
        // the local write path below — confirm whether that is intended.
        if total == 0 {
            self.persistence.fs().write_file(dest_path, &[])?;
            self.finalize_save(dest_path)?;
            return Ok(());
        }

        // Build the write recipe (unified for all filesystem types)
        let recipe = save::build_write_recipe(
            &self.piece_tree,
            &self.buffers,
            &self.format,
            &self.file_kind,
            &self.persistence,
        )?;
        let ops = recipe.to_write_ops();

        // Check if we need in-place writing to preserve file ownership (local only)
        // Remote filesystems handle this differently
        let fs = self.persistence.fs();
        // A filesystem without remote connection info is treated as local.
        let is_local = fs.remote_connection_info().is_none();
        let use_inplace = is_local && save::should_use_inplace_write(fs, dest_path);

        if use_inplace {
            // In-place write: write directly to preserve ownership
            save::save_with_inplace_write(fs, dest_path, &recipe)?;
        } else if !recipe.has_copy_ops() && !is_local {
            // Remote with no Copy ops: use write_file directly (more efficient)
            let data = recipe.flatten_inserts();
            fs.write_file(dest_path, &data)?;
        } else if is_local {
            // Local: use write_file or write_patched with sudo fallback
            let write_result = if !recipe.has_copy_ops() {
                let data = recipe.flatten_inserts();
                fs.write_file(dest_path, &data)
            } else {
                // Copy ops read from the recipe's source file; fall back to the
                // destination when the recipe names no source.
                let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
                fs.write_patched(src_for_patch, dest_path, &ops)
            };

            if let Err(e) = write_result {
                if e.kind() == io::ErrorKind::PermissionDenied {
                    // Create temp file and return sudo error
                    // Metadata is captured before the temp file is created so
                    // the error can carry the destination's original metadata.
                    let original_metadata = fs.metadata_if_exists(dest_path);
                    let (temp_path, mut temp_file) = save::create_temp_file(fs, dest_path)?;
                    save::write_recipe_to_file(fs, &mut temp_file, &recipe)?;
                    temp_file.sync_all()?;
                    drop(temp_file);
                    return Err(save::make_sudo_error(
                        temp_path,
                        dest_path,
                        original_metadata,
                    ));
                }
                return Err(e.into());
            }
        } else {
            // Remote with Copy ops: use write_patched
            let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
            fs.write_patched(src_for_patch, dest_path, &ops)?;
        }

        // Only reached after a successful write: sync saved state with disk.
        self.finalize_save(dest_path)?;
        Ok(())
    }
739
740 /// Finalize save state after successful write.
741 fn finalize_save(&mut self, dest_path: &Path) -> anyhow::Result<()> {
742 let new_size = self.persistence.fs().metadata(dest_path)?.size as usize;
743 tracing::debug!(
744 "Buffer::save: updating saved_file_size from {:?} to {}",
745 self.persistence.saved_file_size(),
746 new_size
747 );
748 self.persistence.set_saved_file_size(Some(new_size));
749 self.persistence.set_file_path(dest_path.to_path_buf());
750
751 // Consolidate the piece tree to synchronize with disk (for large files)
752 // or to simplify structure (for small files).
753 self.consolidate_after_save(dest_path, new_size);
754
755 self.mark_saved_snapshot();
756 self.format.promote_current_to_original();
757 Ok(())
758 }
759
760 /// Finalize buffer state after an external save operation (e.g., via sudo).
761 ///
762 /// This updates the saved snapshot and file size to match the new state on disk.
763 pub fn finalize_external_save(&mut self, dest_path: PathBuf) -> anyhow::Result<()> {
764 let new_size = self.persistence.fs().metadata(&dest_path)?.size as usize;
765 self.persistence.set_saved_file_size(Some(new_size));
766 self.persistence.set_file_path(dest_path.clone());
767
768 // Consolidate the piece tree to synchronize with disk or simplify structure.
769 self.consolidate_after_save(&dest_path, new_size);
770
771 self.mark_saved_snapshot();
772 self.format.promote_current_to_original();
773 Ok(())
774 }
775
776 /// Consolidate the piece tree into a single piece.
777 /// For large files, this creates a reference to the disk file to save memory and sync offsets.
778 /// For small files, this flattens all edits into a single in-memory buffer.
779 fn consolidate_after_save(&mut self, path: &Path, file_size: usize) {
780 if self.file_kind.is_large_file() {
781 self.consolidate_large_file(path, file_size);
782 } else {
783 self.consolidate_small_file();
784 }
785 }
786
787 /// Consolidate large file piece tree into a single piece pointing to the new file.
788 /// This ensures that subsequent operations correctly reference the new content and offsets.
789 /// Preserves total line feed count from the old tree if a scan was previously done.
790 fn consolidate_large_file(&mut self, path: &Path, file_size: usize) {
791 // Preserve line feed count from the old tree if we had scanned it
792 let preserved_lf = if self.file_kind.has_line_feed_scan() {
793 self.piece_tree.line_count().map(|c| c.saturating_sub(1))
794 } else {
795 None
796 };
797
798 let buffer = StringBuffer {
799 id: 0,
800 data: BufferData::Unloaded {
801 file_path: path.to_path_buf(),
802 file_offset: 0,
803 bytes: file_size,
804 },
805 stored_file_offset: None,
806 };
807
808 self.piece_tree = if file_size > 0 {
809 PieceTree::new(BufferLocation::Stored(0), 0, file_size, preserved_lf)
810 } else {
811 PieceTree::empty()
812 };
813
814 self.buffers = vec![buffer];
815 self.next_buffer_id = 1;
816
817 tracing::debug!(
818 "Buffer::consolidate_large_file: consolidated into single piece of {} bytes",
819 file_size
820 );
821 }
822
823 /// Consolidate small file edits into a single in-memory buffer and re-index lines.
824 fn consolidate_small_file(&mut self) {
825 if let Some(bytes) = self.get_all_text() {
826 let line_feed_cnt = bytes.iter().filter(|&&b| b == b'\n').count();
827 let len = bytes.len();
828
829 // Create a single loaded buffer with line indexing
830 let buffer = StringBuffer::new_loaded(0, bytes, true);
831
832 self.piece_tree = if len > 0 {
833 PieceTree::new(BufferLocation::Stored(0), 0, len, Some(line_feed_cnt))
834 } else {
835 PieceTree::empty()
836 };
837
838 self.buffers = vec![buffer];
839 self.next_buffer_id = 1;
840
841 tracing::debug!(
842 "Buffer::consolidate_small_file: consolidated into single loaded buffer of {} bytes",
843 len
844 );
845 }
846 }
847
    /// Get the total number of bytes in the document, as tracked by the piece tree.
    pub fn total_bytes(&self) -> usize {
        self.piece_tree.total_bytes()
    }
852
    /// Get the total number of lines in the document.
    ///
    /// Uses the piece tree's integrated line tracking.
    /// Returns `None` if the line count is unknown (e.g., for large files
    /// without line indexing).
    pub fn line_count(&self) -> Option<usize> {
        self.piece_tree.line_count()
    }
859
    /// Snapshot the current tree as the saved baseline.
    ///
    /// Delegates to `Persistence::mark_saved_snapshot` with the current piece tree.
    pub fn mark_saved_snapshot(&mut self) {
        self.persistence.mark_saved_snapshot(&self.piece_tree);
    }
864
    /// Refresh the saved root to match the current tree structure without
    /// clearing the modified flag. Call this after structural-only changes
    /// (e.g. chunk_split_and_load during search scan) so that
    /// `diff_since_saved()` can take the fast `Arc::ptr_eq` path.
    ///
    /// Delegates to `Persistence::refresh_saved_root_if_unmodified`.
    pub fn refresh_saved_root_if_unmodified(&mut self) {
        self.persistence
            .refresh_saved_root_if_unmodified(&self.piece_tree);
    }
873
874 /// Diff the current piece tree against the last saved snapshot.
875 ///
876 /// See `Persistence::diff_since_saved` for the algorithm.
877 pub fn diff_since_saved(&self) -> PieceTreeDiff {
878 let _span = tracing::info_span!(
879 "diff_since_saved",
880 large_file = self.file_kind.is_large_file(),
881 modified = self.persistence.is_modified(),
882 lf_scanned = self.file_kind.has_line_feed_scan()
883 )
884 .entered();
885
886 self.persistence
887 .diff_since_saved(&self.piece_tree, &self.buffers)
888 }
889
890 /// Convert a byte offset to a line/column position
891 pub fn offset_to_position(&self, offset: usize) -> Option<Position> {
892 self.piece_tree
893 .offset_to_position(offset, &self.buffers)
894 .map(|(line, column)| Position { line, column })
895 }
896
897 /// Convert a line/column position to a byte offset
898 pub fn position_to_offset(&self, position: Position) -> usize {
899 self.piece_tree
900 .position_to_offset(position.line, position.column, &self.buffers)
901 }
902
903 /// Insert text at the given byte offset
904 pub fn insert_bytes(&mut self, offset: usize, text: Vec<u8>) -> Cursor {
905 if text.is_empty() {
906 return self.piece_tree.cursor_at_offset(offset);
907 }
908
909 // Mark as modified (updates version)
910 self.mark_content_modified();
911
912 // Count line feeds in the text to insert
913 let line_feed_cnt = Some(text.iter().filter(|&&b| b == b'\n').count());
914
915 // Optimization: try to append to existing buffer if insertion is at piece boundary
916 let (buffer_location, buffer_offset, text_len) =
917 if let Some(append_info) = self.try_append_to_existing_buffer(offset, &text) {
918 append_info
919 } else {
920 // Create a new StringBuffer for this insertion
921 let buffer_id = self.next_buffer_id;
922 self.next_buffer_id += 1;
923 let buffer = StringBuffer::new(buffer_id, text.clone());
924 self.buffers.push(buffer);
925 (BufferLocation::Added(buffer_id), 0, text.len())
926 };
927
928 // When line feeds have been scanned, ensure the chunk at the insertion
929 // point is loaded so compute_line_feeds_static can recount during splits.
930 if self.file_kind.has_line_feed_scan() {
931 self.ensure_chunk_loaded_at(offset);
932 }
933
934 // Update piece tree (need to pass buffers reference)
935 self.piece_tree.insert(
936 offset,
937 buffer_location,
938 buffer_offset,
939 text_len,
940 line_feed_cnt,
941 &self.buffers,
942 )
943 }
944
945 /// Try to append to an existing buffer if insertion point aligns with buffer end
946 /// Returns (BufferLocation, buffer_offset, text_len) if append succeeds, None otherwise
947 fn try_append_to_existing_buffer(
948 &mut self,
949 offset: usize,
950 text: &[u8],
951 ) -> Option<(BufferLocation, usize, usize)> {
952 // Only optimize for non-empty insertions after existing content
953 if text.is_empty() || offset == 0 {
954 return None;
955 }
956
957 // Find the piece containing the byte just before the insertion point
958 // This avoids the saturating_sub issue
959 let piece_info = self.piece_tree.find_by_offset(offset - 1)?;
960
961 // Check if insertion is exactly at the end of this piece
962 // offset_in_piece tells us where (offset-1) is within the piece
963 // For insertion to be at piece end, (offset-1) must be the last byte
964 let offset_in_piece = piece_info.offset_in_piece?;
965 if offset_in_piece + 1 != piece_info.bytes {
966 return None; // Not at the end of the piece
967 }
968
969 // Only append to "Added" buffers (not original Stored buffers)
970 if !matches!(piece_info.location, BufferLocation::Added(_)) {
971 return None;
972 }
973
974 let buffer_id = piece_info.location.buffer_id();
975 let buffer = self.buffers.get_mut(buffer_id)?;
976
977 // Check if buffer is loaded
978 let buffer_len = buffer.get_data()?.len();
979
980 // Check if this piece ends exactly at the end of its buffer
981 if piece_info.offset + piece_info.bytes != buffer_len {
982 return None;
983 }
984
985 // Perfect! Append to this buffer
986 let append_offset = buffer.append(text);
987
988 Some((piece_info.location, append_offset, text.len()))
989 }
990
991 /// Insert text (from &str) at the given byte offset
992 pub fn insert(&mut self, offset: usize, text: &str) {
993 self.insert_bytes(offset, text.as_bytes().to_vec());
994 }
995
996 /// Insert text at a line/column position
997 /// This now uses the optimized piece_tree.insert_at_position() for a single traversal
998 pub fn insert_at_position(&mut self, position: Position, text: Vec<u8>) -> Cursor {
999 if text.is_empty() {
1000 let offset = self.position_to_offset(position);
1001 return self.piece_tree.cursor_at_offset(offset);
1002 }
1003
1004 self.mark_content_modified();
1005
1006 // Count line feeds in the text to insert
1007 let line_feed_cnt = text.iter().filter(|&&b| b == b'\n').count();
1008
1009 // Create a new StringBuffer for this insertion
1010 let buffer_id = self.next_buffer_id;
1011 self.next_buffer_id += 1;
1012 let buffer = StringBuffer::new(buffer_id, text.clone());
1013 self.buffers.push(buffer);
1014
1015 // Use the optimized position-based insertion (single traversal)
1016 self.piece_tree.insert_at_position(
1017 position.line,
1018 position.column,
1019 BufferLocation::Added(buffer_id),
1020 0,
1021 text.len(),
1022 line_feed_cnt,
1023 &self.buffers,
1024 )
1025 }
1026
1027 /// Delete text starting at the given byte offset
1028 pub fn delete_bytes(&mut self, offset: usize, bytes: usize) {
1029 if bytes == 0 || offset >= self.total_bytes() {
1030 return;
1031 }
1032
1033 // When line feeds have been scanned, ensure chunks at delete boundaries
1034 // are loaded so compute_line_feeds_static can recount during splits.
1035 if self.file_kind.has_line_feed_scan() {
1036 self.ensure_chunk_loaded_at(offset);
1037 let end = (offset + bytes).min(self.total_bytes());
1038 if end > offset {
1039 self.ensure_chunk_loaded_at(end.saturating_sub(1));
1040 }
1041 }
1042
1043 // Update piece tree
1044 self.piece_tree.delete(offset, bytes, &self.buffers);
1045
1046 self.mark_content_modified();
1047 }
1048
1049 /// Delete text in a range
1050 pub fn delete(&mut self, range: Range<usize>) {
1051 if range.end > range.start {
1052 self.delete_bytes(range.start, range.end - range.start);
1053 }
1054 }
1055
1056 /// Delete text in a line/column range
1057 /// This now uses the optimized piece_tree.delete_position_range() for a single traversal
1058 pub fn delete_range(&mut self, start: Position, end: Position) {
1059 // Use the optimized position-based deletion
1060 self.piece_tree.delete_position_range(
1061 start.line,
1062 start.column,
1063 end.line,
1064 end.column,
1065 &self.buffers,
1066 );
1067 self.mark_content_modified();
1068 }
1069
1070 /// Replace the entire buffer content with new content
1071 /// This is an O(n) operation that rebuilds the piece tree in a single pass,
1072 /// avoiding the O(n²) complexity of applying individual edits.
1073 ///
1074 /// This is used for bulk operations like "replace all" where applying
1075 /// individual edits would be prohibitively slow.
1076 pub fn replace_content(&mut self, new_content: &str) {
1077 let bytes = new_content.len();
1078 let content_bytes = new_content.as_bytes().to_vec();
1079
1080 // Count line feeds in the new content
1081 let line_feed_cnt = content_bytes.iter().filter(|&&b| b == b'\n').count();
1082
1083 // Create a new StringBuffer for the new content
1084 let buffer_id = self.next_buffer_id;
1085 self.next_buffer_id += 1;
1086 let buffer = StringBuffer::new(buffer_id, content_bytes);
1087 self.buffers.push(buffer);
1088
1089 // Rebuild the piece tree with a single piece containing all the new content
1090 if bytes > 0 {
1091 self.piece_tree = PieceTree::new(
1092 BufferLocation::Added(buffer_id),
1093 0,
1094 bytes,
1095 Some(line_feed_cnt),
1096 );
1097 } else {
1098 self.piece_tree = PieceTree::empty();
1099 }
1100
1101 self.mark_content_modified();
1102 }
1103
1104 /// Restore a previously saved buffer state (for undo/redo of BulkEdit).
1105 ///
1106 /// This restores the piece tree AND the buffers list, which is critical
1107 /// because consolidate_after_save() replaces self.buffers. Without restoring
1108 /// buffers, the piece tree would reference buffer IDs that no longer exist.
1109 pub fn restore_buffer_state(&mut self, snapshot: &BufferSnapshot) {
1110 self.piece_tree = snapshot.piece_tree.clone();
1111 self.buffers = snapshot.buffers.clone();
1112 self.next_buffer_id = snapshot.next_buffer_id;
1113 self.mark_content_modified();
1114 }
1115
1116 /// Snapshot the current buffer state (piece tree + buffers) for BulkEdit undo/redo.
1117 ///
1118 /// The snapshot includes buffers because consolidate_after_save() can replace
1119 /// self.buffers between the snapshot and restore, which would otherwise cause
1120 /// the restored piece tree to reference nonexistent buffer IDs.
1121 pub fn snapshot_buffer_state(&self) -> Arc<BufferSnapshot> {
1122 Arc::new(BufferSnapshot {
1123 piece_tree: self.piece_tree.clone(),
1124 buffers: self.buffers.clone(),
1125 next_buffer_id: self.next_buffer_id,
1126 })
1127 }
1128
1129 /// Apply bulk edits efficiently in a single pass
1130 /// Returns the net change in bytes
1131 pub fn apply_bulk_edits(&mut self, edits: &[(usize, usize, &str)]) -> isize {
1132 // Pre-allocate buffers for all insert texts (only non-empty texts)
1133 // This avoids the borrow conflict in the closure
1134 // IMPORTANT: Only add entries for non-empty texts because the closure
1135 // is only called for edits with non-empty insert text
1136 let mut buffer_info: Vec<(BufferLocation, usize, usize, Option<usize>)> = Vec::new();
1137
1138 for (_, _, text) in edits {
1139 if !text.is_empty() {
1140 let buffer_id = self.next_buffer_id;
1141 self.next_buffer_id += 1;
1142 let content = text.as_bytes().to_vec();
1143 let lf_cnt = content.iter().filter(|&&b| b == b'\n').count();
1144 let bytes = content.len();
1145 let buffer = StringBuffer::new(buffer_id, content);
1146 self.buffers.push(buffer);
1147 buffer_info.push((BufferLocation::Added(buffer_id), 0, bytes, Some(lf_cnt)));
1148 }
1149 // No placeholder for empty texts - the closure is only called for non-empty texts
1150 }
1151
1152 // Now call apply_bulk_edits with a simple index-based closure
1153 let mut idx = 0;
1154 let delta = self
1155 .piece_tree
1156 .apply_bulk_edits(edits, &self.buffers, |_text| {
1157 let info = buffer_info[idx];
1158 idx += 1;
1159 info
1160 });
1161
1162 self.mark_content_modified();
1163 delta
1164 }
1165
1166 /// Get text from a byte offset range
1167 /// This now uses the optimized piece_tree.iter_pieces_in_range() for a single traversal
1168 /// Get text from a byte offset range (read-only)
1169 /// Returns None if any buffer in the range is unloaded
1170 /// PRIVATE: External code should use get_text_range_mut() which handles lazy loading
1171 fn get_text_range(&self, offset: usize, bytes: usize) -> Option<Vec<u8>> {
1172 if bytes == 0 {
1173 return Some(Vec::new());
1174 }
1175
1176 let mut result = Vec::with_capacity(bytes);
1177 let end_offset = offset + bytes;
1178 let mut collected = 0;
1179
1180 // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1181 for piece_view in self.piece_tree.iter_pieces_in_range(offset, end_offset) {
1182 let buffer_id = piece_view.location.buffer_id();
1183 if let Some(buffer) = self.buffers.get(buffer_id) {
1184 // Calculate the range to read from this piece
1185 let piece_start_in_doc = piece_view.doc_offset;
1186 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1187
1188 // Clip to the requested range
1189 let read_start = offset.max(piece_start_in_doc);
1190 let read_end = end_offset.min(piece_end_in_doc);
1191
1192 if read_end > read_start {
1193 let offset_in_piece = read_start - piece_start_in_doc;
1194 let bytes_to_read = read_end - read_start;
1195
1196 let buffer_start = piece_view.buffer_offset + offset_in_piece;
1197 let buffer_end = buffer_start + bytes_to_read;
1198
1199 // Return None if buffer is unloaded (type-safe)
1200 let data = buffer.get_data()?;
1201
1202 if buffer_end <= data.len() {
1203 result.extend_from_slice(&data[buffer_start..buffer_end]);
1204 collected += bytes_to_read;
1205
1206 if collected >= bytes {
1207 break;
1208 }
1209 }
1210 }
1211 }
1212 }
1213
1214 Some(result)
1215 }
1216
1217 /// Get text from a byte offset range with lazy loading
1218 /// This will load unloaded chunks on-demand and always returns complete data
1219 ///
1220 /// Returns an error if loading fails or if data cannot be read for any reason.
1221 ///
1222 /// NOTE: Currently loads entire buffers on-demand. Future optimization would split
1223 /// large pieces and load only LOAD_CHUNK_SIZE chunks at a time.
1224 pub fn get_text_range_mut(&mut self, offset: usize, bytes: usize) -> Result<Vec<u8>> {
1225 let _span = tracing::info_span!("get_text_range_mut", offset, bytes).entered();
1226 if bytes == 0 {
1227 return Ok(Vec::new());
1228 }
1229
1230 let mut result = Vec::with_capacity(bytes);
1231 // Clamp end_offset to buffer length to handle reads beyond EOF
1232 let end_offset = (offset + bytes).min(self.len());
1233 let mut current_offset = offset;
1234 let mut iteration_count = 0u32;
1235
1236 // Keep iterating until we've collected all requested bytes
1237 while current_offset < end_offset {
1238 iteration_count += 1;
1239 let mut made_progress = false;
1240 let mut restarted_iteration = false;
1241
1242 // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1243 for piece_view in self
1244 .piece_tree
1245 .iter_pieces_in_range(current_offset, end_offset)
1246 {
1247 let buffer_id = piece_view.location.buffer_id();
1248
1249 // Check if buffer needs loading
1250 let needs_loading = self
1251 .buffers
1252 .get(buffer_id)
1253 .map(|b| !b.is_loaded())
1254 .unwrap_or(false);
1255
1256 if needs_loading && self.chunk_split_and_load(&piece_view, current_offset)? {
1257 restarted_iteration = true;
1258 break;
1259 }
1260
1261 // Calculate the range to read from this piece
1262 let piece_start_in_doc = piece_view.doc_offset;
1263 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1264
1265 // Clip to the requested range
1266 let read_start = current_offset.max(piece_start_in_doc);
1267 let read_end = end_offset.min(piece_end_in_doc);
1268
1269 if read_end > read_start {
1270 let offset_in_piece = read_start - piece_start_in_doc;
1271 let bytes_to_read = read_end - read_start;
1272
1273 let buffer_start = piece_view.buffer_offset + offset_in_piece;
1274 let buffer_end = buffer_start + bytes_to_read;
1275
1276 // Buffer should be loaded now
1277 let buffer = self.buffers.get(buffer_id).context("Buffer not found")?;
1278 let data = buffer
1279 .get_data()
1280 .context("Buffer data unavailable after load")?;
1281
1282 anyhow::ensure!(
1283 buffer_end <= data.len(),
1284 "Buffer range out of bounds: requested {}..{}, buffer size {}",
1285 buffer_start,
1286 buffer_end,
1287 data.len()
1288 );
1289
1290 result.extend_from_slice(&data[buffer_start..buffer_end]);
1291 current_offset = read_end;
1292 made_progress = true;
1293 }
1294 }
1295
1296 // If we didn't make progress and didn't restart iteration, this is an error
1297 if !made_progress && !restarted_iteration {
1298 tracing::error!(
1299 "get_text_range_mut: No progress at offset {} (requested range: {}..{}, buffer len: {})",
1300 current_offset,
1301 offset,
1302 end_offset,
1303 self.len()
1304 );
1305 tracing::error!(
1306 "Piece tree stats: {} total bytes",
1307 self.piece_tree.stats().total_bytes
1308 );
1309 anyhow::bail!(
1310 "Failed to read data at offset {}: no progress made (requested {}..{}, buffer len: {})",
1311 current_offset,
1312 offset,
1313 end_offset,
1314 self.len()
1315 );
1316 }
1317 }
1318
1319 if iteration_count > 1 {
1320 tracing::info!(
1321 iteration_count,
1322 result_len = result.len(),
1323 "get_text_range_mut: completed with multiple iterations"
1324 );
1325 }
1326
1327 Ok(result)
1328 }
1329
1330 /// Prepare a viewport for rendering
1331 ///
1332 /// This is called before rendering with &mut access to pre-load all data
1333 /// that will be needed for the viewport. It estimates the number of bytes
1334 /// needed based on the line count and pre-loads them.
1335 ///
1336 /// # Arguments
1337 /// * `start_offset` - The byte offset where the viewport starts
1338 /// * `line_count` - The number of lines to prepare (estimate)
1339 ///
1340 /// # Returns
1341 /// Ok(()) if preparation succeeded, Err if loading failed
1342 pub fn prepare_viewport(&mut self, start_offset: usize, line_count: usize) -> Result<()> {
1343 let _span = tracing::info_span!("prepare_viewport", start_offset, line_count).entered();
1344 // Estimate how many bytes we need (pessimistic assumption)
1345 // Average line length is typically 80-100 bytes, but we use 200 to be safe
1346 let estimated_bytes = line_count.saturating_mul(200);
1347
1348 // Cap the estimate at the remaining bytes in the document
1349 let remaining_bytes = self.total_bytes().saturating_sub(start_offset);
1350 let bytes_to_load = estimated_bytes.min(remaining_bytes);
1351 tracing::trace!(
1352 bytes_to_load,
1353 total_bytes = self.total_bytes(),
1354 "prepare_viewport loading"
1355 );
1356
1357 // Pre-load with full chunk-splitting support
1358 // This may load more than we need, but ensures all data is available
1359 self.get_text_range_mut(start_offset, bytes_to_load)?;
1360
1361 Ok(())
1362 }
1363
    /// Split a piece that references a large unloaded buffer, create a chunk
    /// buffer for the region around `current_offset`, and load it.
    ///
    /// Returns `true` if the piece tree was modified (caller must restart its
    /// iteration), `false` if the piece was small enough to load in-place.
    ///
    /// # Errors
    /// Fails when the referenced buffer (or the newly created chunk buffer)
    /// cannot be found, when the chunk buffer cannot be created, or when
    /// loading from the filesystem fails.
    fn chunk_split_and_load(
        &mut self,
        piece_view: &PieceView,
        current_offset: usize,
    ) -> Result<bool> {
        let buffer_id = piece_view.location.buffer_id();

        // The underlying buffer may be much larger than this piece (e.g. the
        // whole-file Stored buffer after rebuild_with_pristine_saved_root).
        // We must chunk-split if either the piece or its buffer exceeds
        // LOAD_CHUNK_SIZE, because `load()` loads the entire buffer.
        // A missing buffer or one without an unloaded size counts as 0 bytes.
        let buffer_bytes = self
            .buffers
            .get(buffer_id)
            .and_then(|b| b.unloaded_bytes())
            .unwrap_or(0);
        let needs_chunk_split =
            piece_view.bytes > LOAD_CHUNK_SIZE || buffer_bytes > piece_view.bytes;

        tracing::info!(
            buffer_id,
            piece_bytes = piece_view.bytes,
            buffer_bytes,
            needs_chunk_split,
            piece_doc_offset = piece_view.doc_offset,
            current_offset,
            "chunk_split_and_load: loading unloaded piece"
        );

        if !needs_chunk_split {
            // Piece is small enough and its buffer matches — load in-place.
            // The piece tree is untouched, hence the `false` return.
            let _span = tracing::info_span!(
                "load_small_buffer",
                piece_bytes = piece_view.bytes,
                buffer_id,
            )
            .entered();
            self.buffers
                .get_mut(buffer_id)
                .context("Buffer not found")?
                .load(&**self.persistence.fs())
                .context("Failed to load buffer")?;
            return Ok(false);
        }

        let _span = tracing::info_span!(
            "chunk_split_and_load",
            piece_bytes = piece_view.bytes,
            buffer_id,
        )
        .entered();

        let piece_start_in_doc = piece_view.doc_offset;
        let offset_in_piece = current_offset.saturating_sub(piece_start_in_doc);

        // When the piece already fits within LOAD_CHUNK_SIZE, create a chunk
        // buffer for the exact piece range (no alignment/splitting needed).
        // Alignment rounding is only useful when carving a sub-range out of a
        // piece larger than LOAD_CHUNK_SIZE.
        let (chunk_start_in_buffer, chunk_bytes) = if piece_view.bytes <= LOAD_CHUNK_SIZE {
            (piece_view.buffer_offset, piece_view.bytes)
        } else {
            // Round the start down to a CHUNK_ALIGNMENT boundary, then take at
            // most LOAD_CHUNK_SIZE bytes without running past the piece's end.
            let start =
                (piece_view.buffer_offset + offset_in_piece) / CHUNK_ALIGNMENT * CHUNK_ALIGNMENT;
            let bytes = LOAD_CHUNK_SIZE
                .min((piece_view.buffer_offset + piece_view.bytes).saturating_sub(start));
            (start, bytes)
        };

        // Calculate document offsets for splitting.
        // saturating_sub clamps to the piece start if the aligned chunk start
        // falls before the piece's own buffer offset.
        let chunk_start_offset_in_piece =
            chunk_start_in_buffer.saturating_sub(piece_view.buffer_offset);
        let split_start_in_doc = piece_start_in_doc + chunk_start_offset_in_piece;
        let split_end_in_doc = split_start_in_doc + chunk_bytes;

        // Split the piece to isolate the chunk; a split is skipped when the
        // chunk edge already coincides with the piece edge.
        if chunk_start_offset_in_piece > 0 {
            self.piece_tree
                .split_at_offset(split_start_in_doc, &self.buffers);
        }
        if split_end_in_doc < piece_start_in_doc + piece_view.bytes {
            self.piece_tree
                .split_at_offset(split_end_in_doc, &self.buffers);
        }

        // Create a new buffer for this chunk
        let chunk_buffer = self
            .buffers
            .get(buffer_id)
            .context("Buffer not found")?
            .create_chunk_buffer(self.next_buffer_id, chunk_start_in_buffer, chunk_bytes)
            .context("Failed to create chunk buffer")?;

        // Consume the id only after the chunk buffer was created successfully.
        self.next_buffer_id += 1;
        let new_buffer_id = chunk_buffer.id;
        self.buffers.push(chunk_buffer);

        // Update the piece to reference the new chunk buffer
        self.piece_tree.replace_buffer_reference(
            buffer_id,
            piece_view.buffer_offset + chunk_start_offset_in_piece,
            chunk_bytes,
            BufferLocation::Added(new_buffer_id),
        );

        // Load the chunk buffer
        self.buffers
            .get_mut(new_buffer_id)
            .context("Chunk buffer not found")?
            .load(&**self.persistence.fs())
            .context("Failed to load chunk")?;

        // split_at_offset uses compute_line_feeds_static which returns None
        // for unloaded buffers, destroying the scanned line feed counts.
        // Fix up: the loaded chunk is counted from memory, remaining unloaded
        // pieces use the filesystem's count_line_feeds_in_range.
        if self.file_kind.has_line_feed_scan() {
            let leaves = self.piece_tree.get_leaves();
            let mut fixups: Vec<(usize, usize)> = Vec::new();
            for (idx, leaf) in leaves.iter().enumerate() {
                if leaf.line_feed_cnt.is_none() {
                    // Best effort: a leaf whose scan fails is left without a count.
                    if let Ok(count) = self.scan_leaf(leaf) {
                        fixups.push((idx, count));
                    }
                }
            }
            if !fixups.is_empty() {
                self.piece_tree.update_leaf_line_feeds_path_copy(&fixups);
            }
        }

        // Keep saved_root in sync with viewport-loading tree restructures so
        // that diff_since_saved() can match by (location, offset) identity.
        //
        // When !modified the current tree IS the saved state, so just snapshot.
        // When modified, we must apply the same Stored→Added leaf replacement
        // to saved_root so the diff doesn't see loaded-but-unedited regions as
        // changed.
        if !self.persistence.is_modified() {
            self.persistence.set_saved_root(self.piece_tree.root());
        } else {
            self.persistence.apply_chunk_load_to_saved_root(
                buffer_id,
                chunk_start_in_buffer,
                chunk_bytes,
                new_buffer_id,
            );
        }

        Ok(true)
    }
1520
1521 /// Get all text as a single Vec<u8>
1522 /// Returns None if any buffers are unloaded (lazy loading)
1523 /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1524 pub(crate) fn get_all_text(&self) -> Option<Vec<u8>> {
1525 self.get_text_range(0, self.total_bytes())
1526 }
1527
1528 /// Get all text as a String
1529 /// Returns None if any buffers are unloaded (lazy loading)
1530 /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1531 pub(crate) fn get_all_text_string(&self) -> Option<String> {
1532 self.get_all_text()
1533 .map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
1534 }
1535
1536 /// Get text from a byte range as bytes
1537 /// CRATE-PRIVATE: Returns empty vector if any buffers are unloaded (silently fails!)
1538 /// Only use this when you KNOW the data is loaded (e.g., for syntax highlighting small regions)
1539 /// External code should use get_text_range_mut() or DocumentModel methods
1540 pub(crate) fn slice_bytes(&self, range: Range<usize>) -> Vec<u8> {
1541 self.get_text_range(range.start, range.end.saturating_sub(range.start))
1542 .unwrap_or_default()
1543 }
1544
    /// Get all text as a `String`.
    ///
    /// Thin public wrapper over `get_all_text_string`; returns `None` whenever
    /// that call does (unloaded buffers under lazy loading).
    pub fn to_string(&self) -> Option<String> {
        self.get_all_text_string()
    }
1550
    /// Get the total number of bytes in the buffer.
    ///
    /// Alias for `total_bytes()`.
    pub fn len(&self) -> usize {
        self.total_bytes()
    }
1555
1556 /// Check if the buffer is empty
1557 pub fn is_empty(&self) -> bool {
1558 self.total_bytes() == 0
1559 }
1560
    /// Get the file path associated with this buffer.
    ///
    /// Returns whatever `self.persistence` reports, which may be `None`.
    pub fn file_path(&self) -> Option<&Path> {
        self.persistence.file_path()
    }
1565
    /// Update the file path after a rename operation on disk.
    ///
    /// Only records the new `path` in `self.persistence`; this method itself
    /// performs no filesystem operation.
    pub fn rename_file_path(&mut self, path: PathBuf) {
        self.persistence.set_file_path(path);
    }
1570
    /// Clear the file path (make the buffer unnamed).
    ///
    /// Note: This does NOT affect the file paths stored in unloaded chunks for
    /// lazy loading. Those still point to the original source file.
    pub fn clear_file_path(&mut self) {
        self.persistence.clear_file_path();
    }
1577
1578 /// Extend buffer to include more bytes from a streaming source file.
1579 /// Used for stdin streaming where the temp file grows over time.
1580 /// Appends a new Unloaded chunk for the new bytes.
1581 pub fn extend_streaming(&mut self, source_path: &Path, new_size: usize) {
1582 let old_size = self.total_bytes();
1583 if new_size <= old_size {
1584 return;
1585 }
1586
1587 let additional_bytes = new_size - old_size;
1588
1589 // Create new Unloaded buffer for the appended region
1590 let buffer_id = self.next_buffer_id;
1591 self.next_buffer_id += 1;
1592
1593 let new_buffer = StringBuffer::new_unloaded(
1594 buffer_id,
1595 source_path.to_path_buf(),
1596 old_size, // file_offset - where this chunk starts in the file
1597 additional_bytes, // bytes - size of this chunk
1598 );
1599 self.buffers.push(new_buffer);
1600
1601 // Append piece at end of document (insert at offset == total_bytes)
1602 self.piece_tree.insert(
1603 old_size,
1604 BufferLocation::Stored(buffer_id),
1605 0,
1606 additional_bytes,
1607 None, // line_feed_cnt unknown for unloaded chunk
1608 &self.buffers,
1609 );
1610 }
1611
    /// Check if the buffer has been modified since the last save.
    ///
    /// Reads the flag tracked by `self.persistence`.
    pub fn is_modified(&self) -> bool {
        self.persistence.is_modified()
    }
1616
    /// Clear the modified flag (intended to be called after a save).
    pub fn clear_modified(&mut self) {
        self.persistence.clear_modified();
    }
1621
    /// Set the modified flag explicitly.
    ///
    /// Used by undo/redo to restore the correct modified state.
    pub fn set_modified(&mut self, modified: bool) {
        self.persistence.set_modified(modified);
    }
1627
    /// Check if the buffer has pending changes for recovery auto-save.
    ///
    /// Reads the flag tracked by `self.persistence`.
    pub fn is_recovery_pending(&self) -> bool {
        self.persistence.is_recovery_pending()
    }
1632
    /// Set whether the buffer needs a recovery auto-save.
    ///
    /// Callers pass `true` after edits; the flag is stored in `self.persistence`.
    pub fn set_recovery_pending(&mut self, pending: bool) {
        self.persistence.set_recovery_pending(pending);
    }
1637
1638 /// Ensure the buffer chunk at the given byte offset is loaded.
1639 ///
1640 /// When `line_feeds_scanned` is true, piece splits during insert/delete need
1641 /// the buffer data to be loaded so `compute_line_feeds_static` can accurately
1642 /// recount line feeds for each half. This method loads the chunk if needed.
1643 fn ensure_chunk_loaded_at(&mut self, offset: usize) {
1644 if let Some(piece_info) = self.piece_tree.find_by_offset(offset) {
1645 let buffer_id = piece_info.location.buffer_id();
1646 if let Some(buffer) = self.buffers.get_mut(buffer_id) {
1647 if !buffer.is_loaded() {
1648 let buf_bytes = buffer.unloaded_bytes().unwrap_or(0);
1649 tracing::info!(
1650 "ensure_chunk_loaded_at: loading buffer {} ({} bytes) for offset {}",
1651 buffer_id,
1652 buf_bytes,
1653 offset
1654 );
1655 if let Err(e) = buffer.load(&**self.persistence.fs()) {
1656 tracing::warn!("Failed to load chunk at offset {offset}: {e}");
1657 }
1658 }
1659 }
1660 }
1661 }
1662
    /// Check if this is a large file with lazy loading enabled.
    ///
    /// Reports the value held by `self.file_kind`.
    pub fn is_large_file(&self) -> bool {
        self.file_kind.is_large_file()
    }
1667
    /// Check if line feeds have been scanned for this large file.
    ///
    /// When true, `line_count()` returns exact values. The flag itself is
    /// tracked by `self.file_kind`.
    pub fn has_line_feed_scan(&self) -> bool {
        self.file_kind.has_line_feed_scan()
    }
1673
    /// Get the raw piece tree leaves (for storing alongside scan chunks).
    ///
    /// Returns the owned `Vec` produced by `piece_tree.get_leaves()`.
    pub fn piece_tree_leaves(&self) -> Vec<crate::model::piece_tree::LeafData> {
        self.piece_tree.get_leaves()
    }
1678
1679 /// Prepare work items for an incremental line scan.
1680 ///
1681 /// First splits any oversized leaves in the piece tree so every leaf is
1682 /// at most `LOAD_CHUNK_SIZE` bytes. Then returns one work item per leaf.
1683 /// After scanning, `get_text_range_mut` will never need to split a scanned
1684 /// leaf (it's already chunk-sized), so line-feed counts are preserved.
1685 ///
1686 /// Returns `(chunks, total_bytes)`.
1687 pub fn prepare_line_scan(&mut self) -> (Vec<LineScanChunk>, usize) {
1688 // Pre-split the tree so every leaf ≤ LOAD_CHUNK_SIZE.
1689 self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1690
1691 let leaves = self.piece_tree.get_leaves();
1692 let total_bytes: usize = leaves.iter().map(|l| l.bytes).sum();
1693 let mut chunks = Vec::new();
1694
1695 for (idx, leaf) in leaves.iter().enumerate() {
1696 chunks.push(LineScanChunk {
1697 leaf_index: idx,
1698 byte_len: leaf.bytes,
1699 already_known: leaf.line_feed_cnt.is_some(),
1700 });
1701 }
1702
1703 (chunks, total_bytes)
1704 }
1705
1706 /// Initialize a chunked search scan over this buffer's piece tree.
1707 ///
1708 /// Used for in-editor Ctrl+F (incremental, yields to the event loop
1709 /// between chunks) and for searching dirty buffers during project grep.
1710 /// For searching files on disk, use `FileSystem::search_file` instead.
1711 pub fn search_scan_init(
1712 &mut self,
1713 regex: regex::bytes::Regex,
1714 max_matches: usize,
1715 query_len: usize,
1716 ) -> ChunkedSearchState {
1717 let (chunks, total_bytes) = self.prepare_line_scan();
1718 ChunkedSearchState {
1719 chunks,
1720 next_chunk: 0,
1721 next_doc_offset: 0,
1722 total_bytes,
1723 scanned_bytes: 0,
1724 regex,
1725 matches: Vec::new(),
1726 overlap_tail: Vec::new(),
1727 overlap_doc_offset: 0,
1728 max_matches,
1729 capped: false,
1730 query_len,
1731 running_line: 1,
1732 }
1733 }
1734
1735 /// Process one chunk of a chunked search scan.
1736 ///
1737 /// Loads the next chunk via `get_text_range_mut`, prepends overlap from
1738 /// the previous chunk, runs the regex, and appends matches to `state`
1739 /// with line/column/context computed on the fly from the loaded bytes.
1740 ///
1741 /// Line numbers are tracked incrementally via `running_line` — each
1742 /// chunk counts newlines in its non-overlap portion to advance the
1743 /// counter for the next chunk, and matches use an incremental cursor
1744 /// so total line-counting work is O(chunk_size), not O(chunk × matches).
1745 ///
1746 /// Returns `Ok(true)` if there are more chunks to process, `Ok(false)`
1747 /// when the scan is complete.
1748 ///
1749 /// TODO: For concurrent/parallel search (searching multiple files at once),
1750 /// chunks would need to return chunk-relative line numbers and have them
1751 /// fixed up with each file's starting line offset after all chunks complete.
1752 pub fn search_scan_next_chunk(
1753 &mut self,
1754 state: &mut ChunkedSearchState,
1755 ) -> std::io::Result<bool> {
1756 if state.is_done() {
1757 return Ok(false);
1758 }
1759
1760 let chunk_info = state.chunks[state.next_chunk].clone();
1761 let doc_offset = state.next_doc_offset;
1762
1763 state.next_chunk += 1;
1764 state.scanned_bytes += chunk_info.byte_len;
1765 state.next_doc_offset += chunk_info.byte_len;
1766
1767 // Load the chunk bytes
1768 let chunk_bytes = self
1769 .get_text_range_mut(doc_offset, chunk_info.byte_len)
1770 .map_err(std::io::Error::other)?;
1771
1772 // Build search buffer: overlap tail + new chunk
1773 let overlap_len = state.overlap_tail.len();
1774 let mut search_buf = Vec::with_capacity(overlap_len + chunk_bytes.len());
1775 search_buf.extend_from_slice(&state.overlap_tail);
1776 search_buf.extend_from_slice(&chunk_bytes);
1777
1778 let buf_doc_offset = if overlap_len > 0 {
1779 state.overlap_doc_offset
1780 } else {
1781 doc_offset
1782 };
1783
1784 // Line number at buf_doc_offset: running_line tracks the line at
1785 // doc_offset (start of new chunk data). Count newlines in the overlap
1786 // prefix to get the line at the start of the full search_buf.
1787 let newlines_in_overlap = search_buf[..overlap_len]
1788 .iter()
1789 .filter(|&&b| b == b'\n')
1790 .count();
1791 let mut line_at = state.running_line - newlines_in_overlap;
1792 let mut counted_to = 0usize;
1793
1794 // Run regex on the combined buffer
1795 for m in state.regex.find_iter(&search_buf) {
1796 // Skip matches entirely within the overlap (already found)
1797 if overlap_len > 0 && m.end() <= overlap_len {
1798 continue;
1799 }
1800
1801 if state.matches.len() >= state.max_matches {
1802 state.capped = true;
1803 break;
1804 }
1805
1806 // Advance line counter incrementally to this match
1807 line_at += search_buf[counted_to..m.start()]
1808 .iter()
1809 .filter(|&&b| b == b'\n')
1810 .count();
1811 counted_to = m.start();
1812
1813 // Find line boundaries in search_buf for context
1814 let line_start = search_buf[..m.start()]
1815 .iter()
1816 .rposition(|&b| b == b'\n')
1817 .map(|p| p + 1)
1818 .unwrap_or(0);
1819 let line_end = search_buf[m.start()..]
1820 .iter()
1821 .position(|&b| b == b'\n')
1822 .map(|p| m.start() + p)
1823 .unwrap_or(search_buf.len());
1824
1825 let match_doc_offset = buf_doc_offset + m.start();
1826 let match_len = m.end() - m.start();
1827 let column = m.start() - line_start + 1;
1828 let context = String::from_utf8_lossy(&search_buf[line_start..line_end]).into_owned();
1829
1830 state.matches.push(SearchMatch {
1831 byte_offset: match_doc_offset,
1832 length: match_len,
1833 line: line_at,
1834 column,
1835 context,
1836 });
1837 }
1838
1839 // Advance running_line by newlines in the new (non-overlap) chunk data
1840 let newlines_in_chunk = chunk_bytes.iter().filter(|&&b| b == b'\n').count();
1841 state.running_line += newlines_in_chunk;
1842
1843 // Save overlap tail for next chunk
1844 let max_overlap = state.query_len.max(256).min(chunk_bytes.len());
1845 let tail_start = chunk_bytes.len().saturating_sub(max_overlap);
1846 state.overlap_tail = chunk_bytes[tail_start..].to_vec();
1847 state.overlap_doc_offset = doc_offset + tail_start;
1848
1849 Ok(!state.is_done())
1850 }
1851
1852 /// Run a complete chunked search over the piece tree (all chunks).
1853 ///
1854 /// Synchronous variant — used for dirty buffer snapshots in project
1855 /// grep and in tests. For on-disk files, use `FileSystem::search_file`.
1856 pub fn search_scan_all(
1857 &mut self,
1858 regex: regex::bytes::Regex,
1859 max_matches: usize,
1860 query_len: usize,
1861 ) -> std::io::Result<ChunkedSearchState> {
1862 let mut state = self.search_scan_init(regex, max_matches, query_len);
1863 while self.search_scan_next_chunk(&mut state)? {}
1864 Ok(state)
1865 }
1866
1867 /// Build a hybrid search plan from the piece tree.
1868 ///
1869 /// Extracts regions (unloaded file ranges + loaded in-memory data) that
1870 /// can be searched independently. The plan is `Send` so it can be
1871 /// executed on a background thread via `HybridSearchPlan::execute`.
1872 ///
1873 /// Returns `None` if the buffer has no file path (caller should fall
1874 /// back to `search_scan_all`).
1875 pub fn search_hybrid_plan(&mut self) -> Option<HybridSearchPlan> {
1876 let file_path = self.persistence.file_path_owned()?;
1877
1878 self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1879 let leaves = self.piece_tree.get_leaves();
1880
1881 let mut regions: Vec<SearchRegion> = Vec::new();
1882 let mut doc_offset = 0usize;
1883
1884 for leaf in &leaves {
1885 let buf = self.buffers.get(leaf.location.buffer_id());
1886 let is_unloaded_stored = matches!(
1887 (&leaf.location, buf),
1888 (
1889 BufferLocation::Stored(_),
1890 Some(StringBuffer {
1891 data: BufferData::Unloaded { .. },
1892 ..
1893 }),
1894 )
1895 );
1896
1897 if is_unloaded_stored {
1898 let file_offset = match buf.unwrap().data {
1899 BufferData::Unloaded {
1900 file_offset: fo, ..
1901 } => fo + leaf.offset,
1902 _ => unreachable!(),
1903 };
1904
1905 // Merge with previous unloaded region if contiguous
1906 if let Some(SearchRegion::Unloaded {
1907 file_offset: prev_fo,
1908 bytes: prev_bytes,
1909 ..
1910 }) = regions.last_mut()
1911 {
1912 if *prev_fo + *prev_bytes == file_offset {
1913 *prev_bytes += leaf.bytes;
1914 doc_offset += leaf.bytes;
1915 continue;
1916 }
1917 }
1918 regions.push(SearchRegion::Unloaded {
1919 file_offset,
1920 bytes: leaf.bytes,
1921 doc_offset,
1922 });
1923 } else {
1924 let data = match buf.and_then(|b| b.get_data()) {
1925 Some(full) => {
1926 let end = (leaf.offset + leaf.bytes).min(full.len());
1927 full[leaf.offset..end].to_vec()
1928 }
1929 None => match self.get_text_range_mut(doc_offset, leaf.bytes) {
1930 Ok(d) => d,
1931 Err(_) => {
1932 doc_offset += leaf.bytes;
1933 continue;
1934 }
1935 },
1936 };
1937
1938 // Merge with previous loaded region
1939 if let Some(SearchRegion::Loaded {
1940 data: prev_data, ..
1941 }) = regions.last_mut()
1942 {
1943 prev_data.extend_from_slice(&data);
1944 doc_offset += leaf.bytes;
1945 continue;
1946 }
1947 regions.push(SearchRegion::Loaded { data, doc_offset });
1948 }
1949
1950 doc_offset += leaf.bytes;
1951 }
1952
1953 Some(HybridSearchPlan { file_path, regions })
1954 }
1955
1956 /// Hybrid search: uses `fs.search_file` for unloaded piece-tree regions
1957 /// (searches where the data lives, no network transfer) and in-memory regex
1958 /// for loaded/edited regions. Handles overlap at region boundaries.
1959 ///
1960 /// For a huge remote file with a small local edit, this avoids transferring
1961 /// the entire file — only match metadata crosses the network.
1962 ///
1963 /// Falls back to `search_scan_all` when the buffer has no file path or is
1964 /// fully loaded.
1965 pub fn search_hybrid(
1966 &mut self,
1967 pattern: &str,
1968 opts: &FileSearchOptions,
1969 regex: Regex,
1970 max_matches: usize,
1971 query_len: usize,
1972 ) -> io::Result<Vec<SearchMatch>> {
1973 let plan = match self.search_hybrid_plan() {
1974 Some(p) => p,
1975 None => {
1976 let state = self.search_scan_all(regex, max_matches, query_len)?;
1977 return Ok(state.matches);
1978 }
1979 };
1980 plan.execute(
1981 &**self.persistence.fs(),
1982 pattern,
1983 opts,
1984 ®ex,
1985 max_matches,
1986 query_len,
1987 )
1988 }
1989
1990 /// Count `\n` bytes in a single leaf.
1991 ///
1992 /// Uses `count_line_feeds_in_range` for unloaded buffers, which remote
1993 /// filesystem implementations can override to count server-side.
1994 pub fn scan_leaf(&self, leaf: &crate::model::piece_tree::LeafData) -> std::io::Result<usize> {
1995 let buffer_id = leaf.location.buffer_id();
1996 let buffer = self
1997 .buffers
1998 .get(buffer_id)
1999 .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::NotFound, "buffer not found"))?;
2000
2001 let count = match &buffer.data {
2002 crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2003 let end = (leaf.offset + leaf.bytes).min(data.len());
2004 data[leaf.offset..end]
2005 .iter()
2006 .filter(|&&b| b == b'\n')
2007 .count()
2008 }
2009 crate::model::piece_tree::BufferData::Unloaded {
2010 file_path,
2011 file_offset,
2012 ..
2013 } => {
2014 let read_offset = *file_offset as u64 + leaf.offset as u64;
2015 self.persistence.fs().count_line_feeds_in_range(
2016 file_path,
2017 read_offset,
2018 leaf.bytes,
2019 )?
2020 }
2021 };
2022 Ok(count)
2023 }
2024
2025 /// Return the I/O parameters for an unloaded leaf, or `None` if loaded.
2026 ///
2027 /// Used by the incremental scan to distinguish leaves that can be counted
2028 /// in-memory (via `scan_leaf`) from those that need filesystem I/O.
2029 pub fn leaf_io_params(
2030 &self,
2031 leaf: &crate::model::piece_tree::LeafData,
2032 ) -> Option<(std::path::PathBuf, u64, usize)> {
2033 let buffer_id = leaf.location.buffer_id();
2034 let buffer = self.buffers.get(buffer_id)?;
2035 match &buffer.data {
2036 crate::model::piece_tree::BufferData::Loaded { .. } => None,
2037 crate::model::piece_tree::BufferData::Unloaded {
2038 file_path,
2039 file_offset,
2040 ..
2041 } => {
2042 let read_offset = *file_offset as u64 + leaf.offset as u64;
2043 Some((file_path.clone(), read_offset, leaf.bytes))
2044 }
2045 }
2046 }
2047
2048 /// Get a reference to the string buffers (for parallel scanning).
2049 pub fn buffer_slice(&self) -> &[StringBuffer] {
2050 &self.buffers
2051 }
2052
2053 /// Apply the results of an incremental line scan.
2054 pub fn apply_scan_updates(&mut self, updates: &[(usize, usize)]) {
2055 self.piece_tree.update_leaf_line_feeds(updates);
2056 self.file_kind.mark_line_feed_scan_complete();
2057 }
2058
/// After an incremental line-feed scan completes, rebuild the tree so that
/// `saved_root` and the current tree share `Arc` pointers for unedited
/// subtrees. This makes `diff_since_saved()` O(edited regions) instead of
/// O(file size).
///
/// Outline:
/// 1. Walk the current tree and express it as edits against the original
///    file: a list of deleted file ranges and a list of inserted pieces.
/// 2. Build a "pristine" tree covering the whole saved file, split to
///    `LOAD_CHUNK_SIZE` and annotated with `scan_updates`, and store its
///    root as the saved root.
/// 3. Replay the deletions and insertions on that tree and install the
///    result as the current tree.
///
/// `scan_updates` is passed straight to `update_leaf_line_feeds`;
/// presumably it holds (leaf index, line-feed count) pairs like the
/// `fixups` built at the end of this function — TODO confirm.
///
/// When no saved file size is known the function degrades to
/// `apply_scan_updates`.
pub fn rebuild_with_pristine_saved_root(&mut self, scan_updates: &[(usize, usize)]) {
    let file_size = match self.persistence.saved_file_size() {
        Some(s) => s,
        None => {
            // Fallback: no saved file size means we can't build a pristine
            // tree. Just apply updates the old way.
            self.apply_scan_updates(scan_updates);
            return;
        }
    };

    // --- Walk the current tree to extract deletions and insertions ---
    let total = self.total_bytes();
    // Deletions: gaps in Stored coverage (orig_offset, len).
    let mut deletions: Vec<(usize, usize)> = Vec::new();
    // Insertions: (post_delete_offset, location, buf_offset, bytes, lf_cnt).
    // post_delete_offset = cumulative surviving Stored bytes before this point.
    let mut insertions: Vec<(usize, BufferLocation, usize, usize, Option<usize>)> = Vec::new();
    // Position in the ORIGINAL file just past the last surviving stored piece.
    let mut orig_cursor: usize = 0;
    // Running total of original-file bytes still present in the document.
    let mut stored_bytes_in_doc: usize = 0;

    for piece in self.piece_tree.iter_pieces_in_range(0, total) {
        match piece.location {
            BufferLocation::Stored(_) => {
                // For Stored pieces `buffer_offset` is treated as the
                // position in the original file; a jump forward means the
                // bytes in between were deleted.
                if piece.buffer_offset > orig_cursor {
                    deletions.push((orig_cursor, piece.buffer_offset - orig_cursor));
                }
                orig_cursor = piece.buffer_offset + piece.bytes;
                stored_bytes_in_doc += piece.bytes;
            }
            BufferLocation::Added(id) => {
                // Check if this Added buffer was created by loading a chunk
                // from the stored file (via get_text_range_mut chunk loading).
                // If so, treat it as stored content, not a user edit.
                //
                // NOTE(review): this branch positions the piece at
                // `file_off` and ignores `piece.buffer_offset`. That is only
                // exact when the piece starts at the beginning of the chunk
                // buffer; if a loaded chunk has been split by an edit, the
                // later fragment would presumably need
                // `file_off + piece.buffer_offset` — confirm.
                if let Some(file_off) = self.buffers.get(id).and_then(|b| b.stored_file_offset)
                {
                    if file_off > orig_cursor {
                        deletions.push((orig_cursor, file_off - orig_cursor));
                    }
                    orig_cursor = file_off + piece.bytes;
                    stored_bytes_in_doc += piece.bytes;
                } else {
                    // Genuine user insertion: remember where it sits relative
                    // to the surviving original bytes.
                    insertions.push((
                        stored_bytes_in_doc,
                        piece.location,
                        piece.buffer_offset,
                        piece.bytes,
                        piece.line_feed_cnt,
                    ));
                }
            }
        }
    }
    // Trailing deletion.
    if orig_cursor < file_size {
        deletions.push((orig_cursor, file_size - orig_cursor));
    }

    // --- Build pristine tree (full original file, pre-split, with lf counts) ---
    let mut pristine = if file_size > 0 {
        PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
    } else {
        PieceTree::empty()
    };
    pristine.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
    pristine.update_leaf_line_feeds(scan_updates);

    // Snapshot the pristine tree as saved_root.
    self.persistence.set_saved_root(pristine.root());

    // If no edits, the pristine tree IS the current tree.
    if deletions.is_empty() && insertions.is_empty() {
        self.piece_tree = pristine;
        self.file_kind.mark_line_feed_scan_complete();
        return;
    }

    // --- Replay edits onto the pristine tree ---
    // The tree value is moved here, not cloned; the saved root was already
    // captured above via `pristine.root()`.
    let mut tree = pristine;

    // Apply deletions from HIGH to LOW offset so earlier offsets stay valid.
    deletions.sort_by(|a, b| b.0.cmp(&a.0));
    for &(offset, len) in &deletions {
        tree.delete(offset, len, &self.buffers);
    }

    // Apply insertions from LOW to HIGH. Each insertion shifts subsequent
    // offsets by its byte count, tracked via insert_delta.
    let mut insert_delta: usize = 0;
    for &(offset, location, buf_offset, bytes, lf_cnt) in &insertions {
        tree.insert(
            offset + insert_delta,
            location,
            buf_offset,
            bytes,
            lf_cnt,
            &self.buffers,
        );
        insert_delta += bytes;
    }

    // Path-copy insert/delete may split Stored leaves whose data is
    // Unloaded, producing fragments with line_feed_cnt = None
    // (compute_line_feeds_static can't read unloaded data). Fix them up
    // by scanning any remaining None leaves.
    let leaves = tree.get_leaves();
    let mut fixups: Vec<(usize, usize)> = Vec::new();
    for (idx, leaf) in leaves.iter().enumerate() {
        if leaf.line_feed_cnt.is_none() {
            // Best effort: a leaf whose scan fails keeps its `None` count.
            if let Ok(count) = self.scan_leaf(leaf) {
                fixups.push((idx, count));
            }
        }
    }
    if !fixups.is_empty() {
        tree.update_leaf_line_feeds_path_copy(&fixups);
    }

    self.piece_tree = tree;
    self.file_kind.mark_line_feed_scan_complete();
}
2184
2185 /// Resolve the exact byte offset for a given line number (0-indexed).
2186 ///
2187 /// Uses the tree's line feed counts to find the piece containing the target line,
2188 /// then loads/reads that piece's data to find the exact newline position.
2189 /// This works even when buffers are unloaded (large file with scanned line index).
2190 pub fn resolve_line_byte_offset(&mut self, target_line: usize) -> Option<usize> {
2191 if target_line == 0 {
2192 return Some(0);
2193 }
2194
2195 // Use tree metadata to find the piece containing the target line
2196 let (doc_offset, buffer_id, piece_offset, piece_bytes, lines_before) =
2197 self.piece_tree.piece_info_for_line(target_line)?;
2198
2199 // We need to find the (target_line - lines_before)-th newline within this piece
2200 let lines_to_skip = target_line - lines_before;
2201
2202 // Get the piece data — either from loaded buffer or read from disk
2203 let buffer = self.buffers.get(buffer_id)?;
2204 let piece_data: Vec<u8> = match &buffer.data {
2205 crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2206 let end = (piece_offset + piece_bytes).min(data.len());
2207 data[piece_offset..end].to_vec()
2208 }
2209 crate::model::piece_tree::BufferData::Unloaded {
2210 file_path,
2211 file_offset,
2212 ..
2213 } => {
2214 let read_offset = *file_offset as u64 + piece_offset as u64;
2215 self.persistence
2216 .fs()
2217 .read_range(file_path, read_offset, piece_bytes)
2218 .ok()?
2219 }
2220 };
2221
2222 // Count newlines to find the target line start
2223 let mut newlines_found = 0;
2224 for (i, &byte) in piece_data.iter().enumerate() {
2225 if byte == b'\n' {
2226 newlines_found += 1;
2227 if newlines_found == lines_to_skip {
2228 // The target line starts right after this newline
2229 return Some(doc_offset + i + 1);
2230 }
2231 }
2232 }
2233
2234 // If we didn't find enough newlines, the line starts in the next piece
2235 // Return the end of this piece as an approximation
2236 Some(doc_offset + piece_bytes)
2237 }
2238
2239 /// Get the saved file size (size of the file on disk after last load/save)
2240 /// For large files, this is used during recovery to know the expected original file size.
2241 /// Returns None for new unsaved buffers.
2242 pub fn original_file_size(&self) -> Option<usize> {
2243 // Return the tracked saved file size - this is updated when the file is
2244 // loaded or saved, so it always reflects the current file on disk.
2245 self.persistence.saved_file_size()
2246 }
2247
2248 /// Get recovery chunks for this buffer (only modified portions)
2249 ///
2250 /// For large files, this returns only the pieces that come from Added buffers
2251 /// (i.e., the modifications), not the original file content. This allows
2252 /// efficient incremental recovery without reading/writing the entire file.
2253 ///
2254 /// Returns: Vec of (original_file_offset, data) for each modified chunk
2255 /// The offset is the position in the ORIGINAL file where this chunk should be inserted.
2256 pub fn get_recovery_chunks(&self) -> Vec<(usize, Vec<u8>)> {
2257 use crate::model::piece_tree::BufferLocation;
2258
2259 let mut chunks = Vec::new();
2260 let total = self.total_bytes();
2261
2262 // Track cumulative bytes from Stored pieces as we iterate.
2263 // This gives us the original file offset for Added pieces.
2264 // The key insight: Added pieces should be inserted at the position
2265 // corresponding to where they appear relative to Stored content,
2266 // not their position in the current document.
2267 let mut stored_bytes_before = 0;
2268
2269 for piece in self.piece_tree.iter_pieces_in_range(0, total) {
2270 match piece.location {
2271 BufferLocation::Stored(_) => {
2272 // Accumulate stored bytes to track position in original file
2273 stored_bytes_before += piece.bytes;
2274 }
2275 BufferLocation::Added(buffer_id) => {
2276 if let Some(buffer) = self.buffers.iter().find(|b| b.id == buffer_id) {
2277 // Skip buffers that originate from the original file
2278 // (loaded by chunk_split_and_load for viewport display).
2279 // These have stored_file_offset set and are not user edits.
2280 //
2281 // Why Added and not Stored? The piece tree only has two
2282 // variants: Stored and Added. chunk_split_and_load marks
2283 // loaded chunks as Added(new_id) because
2284 // rebuild_with_pristine_saved_root interprets Stored
2285 // pieces' buffer_offset as a position in the original
2286 // file — but a chunk buffer starts at offset 0, so using
2287 // Stored would corrupt the rebuild logic. We rely on
2288 // stored_file_offset instead to distinguish "loaded from
2289 // disk" from "user edit". A third BufferLocation variant
2290 // (e.g. LoadedChunk) would make this distinction explicit
2291 // in the type system rather than requiring this runtime
2292 // check.
2293 if buffer.stored_file_offset.is_some() {
2294 stored_bytes_before += piece.bytes;
2295 continue;
2296 }
2297 // Get the data from the buffer if loaded
2298 if let Some(data) = buffer.get_data() {
2299 // Extract just the portion this piece references
2300 let start = piece.buffer_offset;
2301 let end = start + piece.bytes;
2302 if end <= data.len() {
2303 // Use stored_bytes_before as the original file offset.
2304 // This is where this insertion should go relative to
2305 // the original file content.
2306 chunks.push((stored_bytes_before, data[start..end].to_vec()));
2307 }
2308 }
2309 }
2310 }
2311 }
2312 }
2313
2314 chunks
2315 }
2316
2317 /// Check if this buffer contains binary content
2318 pub fn is_binary(&self) -> bool {
2319 self.file_kind.is_binary()
2320 }
2321
2322 /// Get the line ending format for this buffer
2323 pub fn line_ending(&self) -> LineEnding {
2324 self.format.line_ending()
2325 }
2326
2327 /// Set the line ending format for this buffer
2328 ///
2329 /// This marks the buffer as modified since the line ending format has changed.
2330 /// On save, the buffer content will be converted to the new format.
2331 pub fn set_line_ending(&mut self, line_ending: LineEnding) {
2332 self.format.set_line_ending(line_ending);
2333 self.mark_content_modified();
2334 }
2335
2336 /// Set the default line ending format for a new/empty buffer
2337 ///
2338 /// Unlike `set_line_ending`, this does NOT mark the buffer as modified.
2339 /// This should be used when initializing a new buffer with a configured default.
2340 pub fn set_default_line_ending(&mut self, line_ending: LineEnding) {
2341 self.format.set_default_line_ending(line_ending);
2342 }
2343
2344 /// Get the encoding format for this buffer
2345 pub fn encoding(&self) -> Encoding {
2346 self.format.encoding()
2347 }
2348
2349 /// Set the encoding format for this buffer
2350 ///
2351 /// This marks the buffer as modified since the encoding format has changed.
2352 /// On save, the buffer content will be converted to the new encoding.
2353 pub fn set_encoding(&mut self, encoding: Encoding) {
2354 self.format.set_encoding(encoding);
2355 self.mark_content_modified();
2356 }
2357
2358 /// Set the default encoding format for a new/empty buffer
2359 ///
2360 /// Unlike `set_encoding`, this does NOT mark the buffer as modified.
2361 /// This should be used when initializing a new buffer with a configured default.
2362 pub fn set_default_encoding(&mut self, encoding: Encoding) {
2363 self.format.set_default_encoding(encoding);
2364 }
2365
2366 /// Get the first line of the buffer as a lossy UTF-8 string, suitable
2367 /// for shebang / first-line grammar detection. Returns `None` for an
2368 /// empty buffer. Non-UTF-8 bytes are replaced with U+FFFD.
2369 pub fn first_line_lossy(&self) -> Option<String> {
2370 let bytes = self.get_line(0)?;
2371 if bytes.is_empty() {
2372 return None;
2373 }
2374 Some(String::from_utf8_lossy(&bytes).into_owned())
2375 }
2376
2377 /// Get text for a specific line
2378 pub fn get_line(&self, line: usize) -> Option<Vec<u8>> {
2379 let (start, end) = self.piece_tree.line_range(line, &self.buffers)?;
2380
2381 let bytes = if let Some(end_offset) = end {
2382 end_offset.saturating_sub(start)
2383 } else {
2384 self.total_bytes().saturating_sub(start)
2385 };
2386
2387 self.get_text_range(start, bytes)
2388 }
2389
2390 /// Get the byte offset where a line starts
2391 pub fn line_start_offset(&self, line: usize) -> Option<usize> {
2392 let (start, _) = self.piece_tree.line_range(line, &self.buffers)?;
2393 Some(start)
2394 }
2395
2396 /// Get piece information at a byte offset
2397 pub fn piece_info_at_offset(&self, offset: usize) -> Option<PieceInfo> {
2398 self.piece_tree.find_by_offset(offset)
2399 }
2400
2401 /// Get tree statistics for debugging
2402 pub fn stats(&self) -> TreeStats {
2403 self.piece_tree.stats()
2404 }
2405
2406 // Search and Replace Operations
2407
2408 /// Find the next occurrence of a pattern, with wrap-around
2409 pub fn find_next(&self, pattern: &str, start_pos: usize) -> Option<usize> {
2410 if pattern.is_empty() {
2411 return None;
2412 }
2413
2414 let pattern_bytes = pattern.as_bytes();
2415 let buffer_len = self.len();
2416
2417 // Search from start_pos to end
2418 if start_pos < buffer_len {
2419 if let Some(offset) = self.find_pattern(start_pos, buffer_len, pattern_bytes) {
2420 return Some(offset);
2421 }
2422 }
2423
2424 // Wrap around: search from beginning to start_pos
2425 if start_pos > 0 {
2426 if let Some(offset) = self.find_pattern(0, start_pos, pattern_bytes) {
2427 return Some(offset);
2428 }
2429 }
2430
2431 None
2432 }
2433
2434 /// Find the next occurrence of a pattern within an optional range
2435 /// If range is None, searches the entire buffer with wrap-around (same as find_next)
2436 /// If range is Some, searches only within that range without wrap-around
2437 pub fn find_next_in_range(
2438 &self,
2439 pattern: &str,
2440 start_pos: usize,
2441 range: Option<Range<usize>>,
2442 ) -> Option<usize> {
2443 if pattern.is_empty() {
2444 return None;
2445 }
2446
2447 if let Some(search_range) = range {
2448 // Search within range only, no wrap-around
2449 let pattern_bytes = pattern.as_bytes();
2450 let search_start = start_pos.max(search_range.start);
2451 let search_end = search_range.end.min(self.len());
2452
2453 if search_start < search_end {
2454 self.find_pattern(search_start, search_end, pattern_bytes)
2455 } else {
2456 None
2457 }
2458 } else {
2459 // No range specified, use normal find_next with wrap-around
2460 self.find_next(pattern, start_pos)
2461 }
2462 }
2463
2464 /// Find pattern in a byte range using overlapping chunks
2465 fn find_pattern(&self, start: usize, end: usize, pattern: &[u8]) -> Option<usize> {
2466 if pattern.is_empty() || start >= end {
2467 return None;
2468 }
2469
2470 const CHUNK_SIZE: usize = 65536; // 64KB chunks
2471 let overlap = pattern.len().saturating_sub(1).max(1);
2472
2473 // Use the overlapping chunks iterator for efficient streaming search
2474 let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, overlap);
2475
2476 for chunk in chunks {
2477 // Search the entire chunk buffer
2478 if let Some(pos) = Self::find_in_bytes(&chunk.buffer, pattern) {
2479 let match_end = pos + pattern.len();
2480 // Only report if match ENDS in or after the valid zone
2481 // This ensures patterns spanning boundaries are found exactly once
2482 if match_end > chunk.valid_start {
2483 let absolute_pos = chunk.absolute_pos + pos;
2484 // Verify the match doesn't extend beyond our search range
2485 if absolute_pos + pattern.len() <= end {
2486 return Some(absolute_pos);
2487 }
2488 }
2489 }
2490 }
2491
2492 None
2493 }
2494
/// Naive substring search over raw bytes.
///
/// Returns the index of the first occurrence of `needle` in `haystack`,
/// or `None` if `needle` is empty, longer than `haystack`, or absent.
fn find_in_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    let width = needle.len();
    if width == 0 || width > haystack.len() {
        return None;
    }

    // Slide a needle-sized window over the haystack.
    haystack.windows(width).position(|window| window == needle)
}
2503
2504 /// Find the next occurrence of a regex pattern, with wrap-around
2505 pub fn find_next_regex(&self, regex: &Regex, start_pos: usize) -> Option<usize> {
2506 let buffer_len = self.len();
2507
2508 // Search from start_pos to end
2509 if start_pos < buffer_len {
2510 if let Some(offset) = self.find_regex(start_pos, buffer_len, regex) {
2511 return Some(offset);
2512 }
2513 }
2514
2515 // Wrap around: search from beginning to start_pos
2516 if start_pos > 0 {
2517 if let Some(offset) = self.find_regex(0, start_pos, regex) {
2518 return Some(offset);
2519 }
2520 }
2521
2522 None
2523 }
2524
2525 /// Find the next occurrence of a regex pattern within an optional range
2526 pub fn find_next_regex_in_range(
2527 &self,
2528 regex: &Regex,
2529 start_pos: usize,
2530 range: Option<Range<usize>>,
2531 ) -> Option<usize> {
2532 if let Some(search_range) = range {
2533 let search_start = start_pos.max(search_range.start);
2534 let search_end = search_range.end.min(self.len());
2535
2536 if search_start < search_end {
2537 self.find_regex(search_start, search_end, regex)
2538 } else {
2539 None
2540 }
2541 } else {
2542 self.find_next_regex(regex, start_pos)
2543 }
2544 }
2545
2546 /// Find regex pattern in a byte range using overlapping chunks
2547 fn find_regex(&self, start: usize, end: usize, regex: &Regex) -> Option<usize> {
2548 if start >= end {
2549 return None;
2550 }
2551
2552 const CHUNK_SIZE: usize = 1048576; // 1MB chunks
2553 const OVERLAP: usize = 4096; // 4KB overlap for regex
2554
2555 // Use the overlapping chunks iterator for efficient streaming search
2556 // This fixes the critical bug where regex patterns spanning chunk boundaries were missed
2557 let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, OVERLAP);
2558
2559 for chunk in chunks {
2560 // Search the entire chunk buffer
2561 if let Some(mat) = regex.find(&chunk.buffer) {
2562 let match_end = mat.end();
2563 // Only report if match ENDS in or after the valid zone
2564 // This ensures patterns spanning boundaries are found exactly once
2565 if match_end > chunk.valid_start {
2566 let absolute_pos = chunk.absolute_pos + mat.start();
2567 // Verify the match doesn't extend beyond our search range
2568 let match_len = mat.end() - mat.start();
2569 if absolute_pos + match_len <= end {
2570 return Some(absolute_pos);
2571 }
2572 }
2573 }
2574 }
2575
2576 None
2577 }
2578
2579 /// Replace a range with replacement text
2580 pub fn replace_range(&mut self, range: Range<usize>, replacement: &str) -> bool {
2581 if range.start >= self.len() {
2582 return false;
2583 }
2584
2585 let end = range.end.min(self.len());
2586 if end > range.start {
2587 self.delete_bytes(range.start, end - range.start);
2588 }
2589
2590 if !replacement.is_empty() {
2591 self.insert(range.start, replacement);
2592 }
2593
2594 true
2595 }
2596
2597 /// Find and replace the next occurrence of a pattern
2598 pub fn replace_next(
2599 &mut self,
2600 pattern: &str,
2601 replacement: &str,
2602 start_pos: usize,
2603 range: Option<Range<usize>>,
2604 ) -> Option<usize> {
2605 if let Some(pos) = self.find_next_in_range(pattern, start_pos, range.clone()) {
2606 self.replace_range(pos..pos + pattern.len(), replacement);
2607 Some(pos)
2608 } else {
2609 None
2610 }
2611 }
2612
2613 /// Replace all occurrences of a pattern with replacement text
2614 pub fn replace_all(&mut self, pattern: &str, replacement: &str) -> usize {
2615 if pattern.is_empty() {
2616 return 0;
2617 }
2618
2619 let mut count = 0;
2620 let mut pos = 0;
2621
2622 // Keep searching and replacing
2623 // Note: we search forward from last replacement to handle growth/shrinkage
2624 // Find next occurrence (no wrap-around for replace_all)
2625 while let Some(found_pos) = self.find_next_in_range(pattern, pos, Some(0..self.len())) {
2626 self.replace_range(found_pos..found_pos + pattern.len(), replacement);
2627 count += 1;
2628
2629 // Move past the replacement
2630 pos = found_pos + replacement.len();
2631
2632 // If we're at or past the end, stop
2633 if pos >= self.len() {
2634 break;
2635 }
2636 }
2637
2638 count
2639 }
2640
2641 /// Replace all occurrences of a regex pattern with replacement text
2642 pub fn replace_all_regex(&mut self, regex: &Regex, replacement: &str) -> Result<usize> {
2643 let mut count = 0;
2644 let mut pos = 0;
2645
2646 while let Some(found_pos) = self.find_next_regex_in_range(regex, pos, Some(0..self.len())) {
2647 // Get the match to find its length
2648 let text = self
2649 .get_text_range_mut(found_pos, self.len() - found_pos)
2650 .context("Failed to read text for regex match")?;
2651
2652 if let Some(mat) = regex.find(&text) {
2653 self.replace_range(found_pos..found_pos + mat.len(), replacement);
2654 count += 1;
2655 pos = found_pos + replacement.len();
2656
2657 if pos >= self.len() {
2658 break;
2659 }
2660 } else {
2661 break;
2662 }
2663 }
2664
2665 Ok(count)
2666 }
2667
2668 // LSP Support (UTF-16 conversions)
2669
2670 /// Convert byte position to (line, column) in bytes
2671 pub fn position_to_line_col(&self, byte_pos: usize) -> (usize, usize) {
2672 self.offset_to_position(byte_pos)
2673 .map(|pos| (pos.line, pos.column))
2674 .unwrap_or_else(|| (byte_pos / 80, 0)) // Estimate if metadata unavailable
2675 }
2676
2677 /// Convert (line, character) to byte position - 0-indexed
2678 /// character is in BYTES, not UTF-16 code units
2679 /// Optimized to use single line_range() call instead of two
2680 pub fn line_col_to_position(&self, line: usize, character: usize) -> usize {
2681 if let Some((start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2682 // Calculate line length from the range
2683 let line_len = if let Some(end_offset) = end {
2684 end_offset.saturating_sub(start)
2685 } else {
2686 self.total_bytes().saturating_sub(start)
2687 };
2688 let byte_offset = character.min(line_len);
2689 start + byte_offset
2690 } else {
2691 // Line doesn't exist, return end of buffer
2692 self.len()
2693 }
2694 }
2695
2696 /// Convert byte position to LSP position (line, UTF-16 code units)
2697 /// LSP protocol uses UTF-16 code units for character offsets
2698 pub fn position_to_lsp_position(&self, byte_pos: usize) -> (usize, usize) {
2699 let (line, column_bytes) = self
2700 .offset_to_position(byte_pos)
2701 .map(|pos| (pos.line, pos.column))
2702 .unwrap_or_else(|| (byte_pos / 80, 0)); // Estimate if metadata unavailable
2703
2704 // Get the line content
2705 if let Some(line_bytes) = self.get_line(line) {
2706 // Convert byte offset to UTF-16 code units
2707 let text_before = &line_bytes[..column_bytes.min(line_bytes.len())];
2708 let text_str = String::from_utf8_lossy(text_before);
2709 let utf16_offset = text_str.encode_utf16().count();
2710 (line, utf16_offset)
2711 } else {
2712 (line, 0)
2713 }
2714 }
2715
2716 /// Convert LSP position (line, UTF-16 code units) to byte position
2717 /// LSP uses UTF-16 code units for character offsets, not bytes
2718 /// Optimized to use single line_range() call instead of two
2719 pub fn lsp_position_to_byte(&self, line: usize, utf16_offset: usize) -> usize {
2720 if let Some((line_start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2721 // Calculate line length and get line content
2722 let line_len = if let Some(end_offset) = end {
2723 end_offset.saturating_sub(line_start)
2724 } else {
2725 self.total_bytes().saturating_sub(line_start)
2726 };
2727
2728 if line_len > 0 {
2729 // If data is unloaded, return line_start as fallback
2730 let Some(line_bytes) = self.get_text_range(line_start, line_len) else {
2731 return line_start;
2732 };
2733 let line_str = String::from_utf8_lossy(&line_bytes);
2734
2735 // Convert UTF-16 offset to byte offset
2736 let mut utf16_count = 0;
2737 let mut byte_offset = 0;
2738
2739 for ch in line_str.chars() {
2740 if utf16_count >= utf16_offset {
2741 break;
2742 }
2743 utf16_count += ch.len_utf16();
2744 byte_offset += ch.len_utf8();
2745 }
2746
2747 line_start + byte_offset
2748 } else {
2749 line_start
2750 }
2751 } else {
2752 // Line doesn't exist, return end of buffer
2753 self.len()
2754 }
2755 }
2756
2757 // Navigation helpers
2758
2759 /// Find the previous character boundary (UTF-8 aware)
2760 pub fn prev_char_boundary(&self, pos: usize) -> usize {
2761 if pos == 0 {
2762 return 0;
2763 }
2764
2765 // Get a few bytes before pos to find the character boundary
2766 let start = pos.saturating_sub(4);
2767 let Some(bytes) = self.get_text_range(start, pos - start) else {
2768 // Data unloaded, return pos as fallback
2769 return pos;
2770 };
2771
2772 // Walk backwards to find a UTF-8 leading byte
2773 for i in (0..bytes.len()).rev() {
2774 let byte = bytes[i];
2775 // Check if this is a UTF-8 leading byte (not a continuation byte)
2776 if (byte & 0b1100_0000) != 0b1000_0000 {
2777 return start + i;
2778 }
2779 }
2780
2781 // Fallback
2782 pos.saturating_sub(1)
2783 }
2784
2785 /// Find the next character boundary (UTF-8 aware)
2786 pub fn next_char_boundary(&self, pos: usize) -> usize {
2787 let len = self.len();
2788 if pos >= len {
2789 return len;
2790 }
2791
2792 // Get a few bytes after pos to find the character boundary
2793 let end = (pos + 5).min(len);
2794 let Some(bytes) = self.get_text_range(pos, end - pos) else {
2795 // Data unloaded, return pos as fallback
2796 return pos;
2797 };
2798
2799 // Start from index 1 (we want the NEXT boundary)
2800 for (i, &byte) in bytes.iter().enumerate().skip(1) {
2801 // Check if this is a UTF-8 leading byte (not a continuation byte)
2802 if (byte & 0b1100_0000) != 0b1000_0000 {
2803 return pos + i;
2804 }
2805 }
2806
2807 // If we got here, we're at the end or found no boundary in the range
2808 end
2809 }
2810
2811 /// Check if a byte is a UTF-8 continuation byte (not at a char boundary)
2812 /// UTF-8 continuation bytes have the pattern 10xxxxxx (0x80-0xBF)
2813 /// This is the same check that str::is_char_boundary uses internally.
2814 #[inline]
2815 fn is_utf8_continuation_byte(byte: u8) -> bool {
2816 (byte & 0b1100_0000) == 0b1000_0000
2817 }
2818
2819 /// Snap position to a valid UTF-8 character boundary
2820 /// If already at a boundary, returns the same position.
2821 /// Otherwise, moves to the previous valid boundary.
2822 pub fn snap_to_char_boundary(&self, pos: usize) -> usize {
2823 let len = self.len();
2824 if pos == 0 || pos >= len {
2825 return pos.min(len);
2826 }
2827
2828 // Get the byte at pos to check if we're at a character boundary
2829 let Some(bytes) = self.get_text_range(pos, 1) else {
2830 // Data unloaded, return pos as fallback
2831 return pos;
2832 };
2833
2834 // A position is at a char boundary if the byte there is NOT a continuation byte
2835 if !Self::is_utf8_continuation_byte(bytes[0]) {
2836 // Already at a character boundary
2837 return pos;
2838 }
2839
2840 // Not at a boundary, find the previous one
2841 self.prev_char_boundary(pos)
2842 }
2843
    /// Find the previous grapheme cluster boundary (for proper cursor movement with combining characters)
    ///
    /// This handles complex scripts like Thai where multiple Unicode code points
    /// form a single visual character (grapheme cluster). For example, Thai "ที่"
    /// is 3 code points but 1 grapheme cluster.
    ///
    /// The lookback window starts at 32 bytes and doubles whenever the
    /// returned boundary sits at the start of the chunk — that is, whenever
    /// the chunk might not contain the full grapheme. This matters for ZWJ
    /// emoji sequences and Zalgo strings with many combining marks, which
    /// can easily exceed 32 bytes.
    ///
    /// Returns 0 for `pos == 0`. Falls back to [`Self::prev_char_boundary`]
    /// when the bytes are not loaded or no valid UTF-8 prefix is available.
    pub fn prev_grapheme_boundary(&self, pos: usize) -> usize {
        if pos == 0 {
            return 0;
        }

        let mut lookback: usize = 32;
        loop {
            // IMPORTANT: Align start to a valid character boundary to avoid invalid UTF-8
            // when get_text_range starts mid-character
            let raw_start = pos.saturating_sub(lookback);
            let start = if raw_start == 0 {
                0
            } else {
                // Find the character boundary at or before raw_start
                // (`prev_char_boundary` searches strictly before its argument,
                // hence the `+ 1`).
                self.prev_char_boundary(raw_start + 1)
            };

            let Some(bytes) = self.get_text_range(start, pos - start) else {
                // Data unloaded, fall back to char boundary
                return self.prev_char_boundary(pos);
            };

            let text = match std::str::from_utf8(&bytes) {
                Ok(s) => s,
                Err(e) => {
                    // Still got invalid UTF-8 (shouldn't happen after alignment)
                    // Try using just the valid portion
                    let valid_bytes = &bytes[..e.valid_up_to()];
                    match std::str::from_utf8(valid_bytes) {
                        Ok(s) if !s.is_empty() => s,
                        _ => return self.prev_char_boundary(pos),
                    }
                }
            };

            // Use shared grapheme utility with relative position
            // NOTE(review): when `text` was truncated to its valid prefix above,
            // `rel_pos` is larger than `text.len()`; confirm that
            // `grapheme::prev_grapheme_boundary` tolerates an out-of-range position.
            let rel_pos = pos - start;
            let new_rel_pos = grapheme::prev_grapheme_boundary(text, rel_pos);

            // If the returned boundary is at the start of our chunk, the
            // grapheme may extend further back. Only trust the answer when
            // either we already reached the beginning of the buffer or the
            // boundary sits strictly inside the chunk.
            if new_rel_pos > 0 || start == 0 {
                return start + new_rel_pos;
            }

            // Expand the lookback window and retry. Cap at the full buffer.
            // (Defensive: `lookback >= pos` implies `raw_start == 0`, which was
            // already handled by the `start == 0` return above.)
            if lookback >= pos {
                return 0;
            }
            lookback = lookback.saturating_mul(2);
        }
    }
2909
2910 /// Find the next grapheme cluster boundary (for proper cursor movement with combining characters)
2911 ///
2912 /// This handles complex scripts like Thai where multiple Unicode code points
2913 /// form a single visual character (grapheme cluster). For example, Thai "ที่"
2914 /// is 3 code points but 1 grapheme cluster.
2915 ///
2916 /// The lookahead window grows whenever the first grapheme reaches the
2917 /// end of the chunk — otherwise ZWJ emoji and Zalgo strings whose byte
2918 /// length exceeds the initial 32-byte window would be split mid-cluster.
2919 pub fn next_grapheme_boundary(&self, pos: usize) -> usize {
2920 let len = self.len();
2921 if pos >= len {
2922 return len;
2923 }
2924
2925 let mut lookahead: usize = 32;
2926 loop {
2927 let end = (pos + lookahead).min(len);
2928 let Some(bytes) = self.get_text_range(pos, end - pos) else {
2929 // Data unloaded, fall back to char boundary
2930 return self.next_char_boundary(pos);
2931 };
2932
2933 // Convert to UTF-8 string, handling the case where we might have
2934 // grabbed bytes that end mid-character (truncate to valid UTF-8)
2935 let text = match std::str::from_utf8(&bytes) {
2936 Ok(s) => s,
2937 Err(e) => {
2938 // The bytes end in an incomplete UTF-8 sequence
2939 // Use only the valid portion (which includes at least the first grapheme)
2940 let valid_bytes = &bytes[..e.valid_up_to()];
2941 match std::str::from_utf8(valid_bytes) {
2942 Ok(s) if !s.is_empty() => s,
2943 _ => return self.next_char_boundary(pos),
2944 }
2945 }
2946 };
2947
2948 let new_rel_pos = grapheme::next_grapheme_boundary(text, 0);
2949
2950 // If the first grapheme reaches the end of our chunk and there
2951 // is more buffer left beyond it, the grapheme may extend further.
2952 // Expand the window and retry.
2953 if new_rel_pos == text.len() && end < len {
2954 if lookahead >= len - pos {
2955 return len;
2956 }
2957 lookahead = lookahead.saturating_mul(2);
2958 continue;
2959 }
2960
2961 return pos + new_rel_pos;
2962 }
2963 }
2964
2965 /// Find the previous word boundary
2966 pub fn prev_word_boundary(&self, pos: usize) -> usize {
2967 if pos == 0 {
2968 return 0;
2969 }
2970
2971 // Get some text before pos
2972 let start = pos.saturating_sub(256).max(0);
2973 let Some(bytes) = self.get_text_range(start, pos - start) else {
2974 // Data unloaded, return pos as fallback
2975 return pos;
2976 };
2977 let text = String::from_utf8_lossy(&bytes);
2978
2979 let mut found_word_char = false;
2980 let chars: Vec<char> = text.chars().collect();
2981
2982 for i in (0..chars.len()).rev() {
2983 let ch = chars[i];
2984 let is_word_char = ch.is_alphanumeric() || ch == '_';
2985
2986 if found_word_char && !is_word_char {
2987 // We've transitioned from word to non-word
2988 // Calculate the byte position
2989 let byte_offset: usize = chars[0..=i].iter().map(|c| c.len_utf8()).sum();
2990 return start + byte_offset;
2991 }
2992
2993 if is_word_char {
2994 found_word_char = true;
2995 }
2996 }
2997
2998 0
2999 }
3000
3001 /// Find the next word boundary
3002 pub fn next_word_boundary(&self, pos: usize) -> usize {
3003 let len = self.len();
3004 if pos >= len {
3005 return len;
3006 }
3007
3008 // Get some text after pos
3009 let end = (pos + 256).min(len);
3010 let Some(bytes) = self.get_text_range(pos, end - pos) else {
3011 // Data unloaded, return pos as fallback
3012 return pos;
3013 };
3014 let text = String::from_utf8_lossy(&bytes);
3015
3016 let mut found_word_char = false;
3017 let mut byte_offset = 0;
3018
3019 for ch in text.chars() {
3020 let is_word_char = ch.is_alphanumeric() || ch == '_';
3021
3022 if found_word_char && !is_word_char {
3023 // We've transitioned from word to non-word
3024 return pos + byte_offset;
3025 }
3026
3027 if is_word_char {
3028 found_word_char = true;
3029 }
3030
3031 byte_offset += ch.len_utf8();
3032 }
3033
3034 len
3035 }
3036
    /// Create a line iterator starting at the given byte position
    ///
    /// This iterator lazily loads chunks as needed, never scanning the entire file.
    /// For large files with unloaded buffers, chunks are loaded on-demand (1MB at a time).
    ///
    /// `byte_pos` and `estimated_line_length` are forwarded unchanged to
    /// `LineIterator::new`; the iterator borrows the buffer mutably for its
    /// whole lifetime.
    pub fn line_iterator(
        &mut self,
        byte_pos: usize,
        estimated_line_length: usize,
    ) -> LineIterator<'_> {
        LineIterator::new(self, byte_pos, estimated_line_length)
    }
3048
    /// Iterate over lines starting from a given byte offset, with line numbers
    ///
    /// This is a more efficient alternative to using line_iterator() + offset_to_position()
    /// because the starting line number is looked up once and then advanced
    /// incrementally as newlines are encountered.
    ///
    /// Returns: Iterator yielding `LineData` (byte offset, content, newline
    /// flag and `line_number: Option<usize>`)
    /// - line_number is Some(n) for small files with line metadata
    /// - line_number is None for large files without line metadata
    ///
    /// At most `max_lines` entries are collected up front.
    ///
    /// # Errors
    /// Propagates any error returned while constructing the
    /// `TextBufferLineIterator` (i.e. while reading the underlying text).
    pub fn iter_lines_from(
        &mut self,
        byte_pos: usize,
        max_lines: usize,
    ) -> Result<TextBufferLineIterator> {
        TextBufferLineIterator::new(self, byte_pos, max_lines)
    }
3069
3070 // Legacy API methods for backwards compatibility
3071
3072 /// Get the line number for a given byte offset
3073 ///
3074 /// Returns exact line number if metadata available, otherwise estimates based on bytes.
3075 ///
3076 /// # Behavior by File Size:
3077 /// - **Small files (< 1MB)**: Returns exact line number from piece tree's `line_starts` metadata
3078 /// - **Large files (≥ 1MB)**: Returns estimated line number using `byte_offset / estimated_line_length`
3079 ///
3080 /// Large files don't maintain line metadata for performance reasons. The estimation
3081 /// uses the configured `estimated_line_length` (default 80 bytes).
3082 pub fn get_line_number(&self, byte_offset: usize) -> usize {
3083 self.offset_to_position(byte_offset)
3084 .map(|pos| pos.line)
3085 .unwrap_or_else(|| {
3086 // Estimate line number based on configured average line length
3087 byte_offset / self.config.estimated_line_length
3088 })
3089 }
3090
    /// Get the configured estimated line length for approximate line number calculations.
    ///
    /// This is the divisor `get_line_number` uses when exact line metadata is
    /// unavailable.
    pub fn estimated_line_length(&self) -> usize {
        self.config.estimated_line_length
    }
3095
    /// Get the starting line number at a byte offset (used for viewport rendering)
    ///
    /// # Line Cache Architecture (Post-Refactoring):
    ///
    /// The concept of a separate "line cache" is **now obsolete**. After the refactoring,
    /// line tracking is integrated directly into the piece tree via:
    /// ```ignore
    /// BufferData::Loaded {
    ///     data: Vec<u8>,
    ///     line_starts: Option<Vec<usize>>  // None = large file mode (no line metadata)
    /// }
    /// ```
    ///
    /// ## Why This Method Still Exists:
    /// The rendering code needs to know what line number to display in the margin at the
    /// top of the viewport. This method returns that line number, handling both small
    /// and large file modes transparently.
    ///
    /// ## Small vs Large File Modes:
    /// - **Small files**: `line_starts = Some(vec)` → returns exact line number from metadata
    /// - **Large files**: `line_starts = None` → returns estimated line number (byte_offset / estimated_line_length)
    ///
    /// ## Legacy Line Cache Methods:
    /// These methods are now no-ops and can be removed in a future cleanup:
    /// - `invalidate_line_cache_from()` - No-op (piece tree updates automatically)
    /// - `handle_line_cache_insertion()` - No-op (piece tree updates automatically)
    /// - `handle_line_cache_deletion()` - No-op (piece tree updates automatically)
    /// - `clear_line_cache()` - No-op (can't clear piece tree metadata)
    ///
    /// ## Bug Fix (2025-11):
    /// Previously this method always returned `0`, causing line numbers in the margin
    /// to always show 1, 2, 3... regardless of scroll position. Now it correctly returns
    /// the actual line number at `start_byte`.
    ///
    /// `_line_count` is accepted for API compatibility and is not used.
    pub fn populate_line_cache(&mut self, start_byte: usize, _line_count: usize) -> usize {
        // No-op for cache population: LineIndex maintains all line starts automatically
        // But we need to return the actual line number at start_byte for rendering
        self.get_line_number(start_byte)
    }
3134
    /// Get cached byte offset for line (compatibility method)
    ///
    /// Delegates directly to `line_start_offset`; there is no separate cache
    /// behind this call.
    pub fn get_cached_byte_offset_for_line(&self, line_number: usize) -> Option<usize> {
        self.line_start_offset(line_number)
    }
3139
    /// Invalidate line cache from offset (no-op in new implementation)
    ///
    /// Kept so existing callers keep compiling; the argument is ignored.
    pub fn invalidate_line_cache_from(&mut self, _byte_offset: usize) {
        // No-op: LineIndex updates automatically
    }
3144
    /// Handle line cache insertion (no-op in new implementation)
    ///
    /// Kept so existing callers keep compiling; both arguments are ignored.
    pub fn handle_line_cache_insertion(&mut self, _byte_offset: usize, _bytes_inserted: usize) {
        // No-op: LineIndex updates automatically during insert
    }
3149
    /// Handle line cache deletion (no-op in new implementation)
    ///
    /// Kept so existing callers keep compiling; both arguments are ignored.
    pub fn handle_line_cache_deletion(&mut self, _byte_offset: usize, _bytes_deleted: usize) {
        // No-op: LineIndex updates automatically during delete
    }
3154
    /// Clear line cache (no-op in new implementation)
    ///
    /// Kept so existing callers keep compiling; nothing is cleared.
    pub fn clear_line_cache(&mut self) {
        // No-op: LineIndex can't be cleared
    }
3159
3160 // Test helper methods
3161
3162 /// Create a buffer from a string for testing
3163 #[cfg(test)]
3164 pub fn from_str_test(s: &str) -> Self {
3165 Self::from_bytes(
3166 s.as_bytes().to_vec(),
3167 std::sync::Arc::new(crate::model::filesystem::StdFileSystem),
3168 )
3169 }
3170
3171 /// Create a new empty buffer for testing
3172 #[cfg(test)]
3173 pub fn new_test() -> Self {
3174 Self::empty(std::sync::Arc::new(crate::model::filesystem::StdFileSystem))
3175 }
3176}
3177
/// Type alias for backwards compatibility
///
/// `Buffer` and `TextBuffer` name the same type; new code can use either.
pub type Buffer = TextBuffer;
3180
3181// Re-export LineIterator from the line_iterator module
3182pub use crate::primitives::line_iterator::LineIterator;
3183
3184// ============================================================================
3185// Overlapping Chunks Iterator for Efficient Search
3186// ============================================================================
3187
/// Information about a chunk of data for pattern matching
///
/// A plain value type: it owns a copy of the chunk bytes, so it can be
/// cloned and compared freely (useful when asserting on chunk sequences).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkInfo {
    /// The buffer containing this chunk's data (includes overlap from previous chunk)
    pub buffer: Vec<u8>,

    /// Absolute position in the document where this buffer starts
    pub absolute_pos: usize,

    /// Offset within buffer where "new" data starts (valid match zone)
    /// Matches starting before this offset were already checked in the previous chunk
    pub valid_start: usize,
}
3201
/// Iterator that yields overlapping chunks for pattern matching
///
/// This iterator implements the VSCode/Sublime approach: pull overlapping chunks
/// from the underlying piece tree and use standard search algorithms on them.
///
/// # Algorithm
///
/// ```text
/// Chunk 1: [------------ valid -----------]
/// Chunk 2:                      [overlap][---- valid ----]
/// Chunk 3:                                       [overlap][-- valid --]
///
/// Only matches starting in the "valid" zone are reported to avoid duplicates.
/// ```
///
/// # Example
///
/// ```ignore
/// let chunks = OverlappingChunks::new(&text_buffer, start, end, 4096, pattern.len()-1);
/// for chunk in chunks {
///     // Search only starting from chunk.valid_start
///     if let Some(pos) = search(&chunk.buffer[chunk.valid_start..]) {
///         let absolute_pos = chunk.absolute_pos + chunk.valid_start + pos;
///         return Some(absolute_pos);
///     }
/// }
/// ```
pub struct OverlappingChunks<'a> {
    /// Source of pieces covering the requested document range.
    piece_iter: PieceRangeIter,
    /// Backing buffers that pieces refer to by buffer id.
    buffers: &'a [StringBuffer],

    /// Reusable chunk buffer that we fill from pieces
    buffer: Vec<u8>,
    /// Document offset of `buffer[0]`.
    buffer_absolute_pos: usize,

    /// Document offset of the next byte to be read.
    current_pos: usize,
    /// Exclusive end of the range being iterated.
    end_pos: usize,

    /// Number of new bytes to add per chunk.
    chunk_size: usize,
    /// Number of trailing bytes carried over from the previous chunk.
    overlap: usize,

    /// Track first chunk special case
    first_chunk: bool,

    /// Cached piece data for incremental reading
    current_piece_data: Option<Vec<u8>>,
    /// Read cursor into `current_piece_data`.
    current_piece_offset: usize,
}
3252
3253impl<'a> OverlappingChunks<'a> {
3254 /// Create a new overlapping chunks iterator
3255 ///
3256 /// # Arguments
3257 ///
3258 /// * `text_buffer` - The text buffer to iterate over
3259 /// * `start` - Start position in the document
3260 /// * `end` - End position in the document (exclusive)
3261 /// * `chunk_size` - Target size for each chunk (excluding overlap)
3262 /// * `overlap` - Number of bytes to overlap between chunks
3263 ///
3264 /// # Recommendations
3265 ///
3266 /// * For literal string search: `chunk_size=65536, overlap=pattern.len()-1`
3267 /// * For regex search: `chunk_size=1048576, overlap=4096`
3268 pub fn new(
3269 text_buffer: &'a TextBuffer,
3270 start: usize,
3271 end: usize,
3272 chunk_size: usize,
3273 overlap: usize,
3274 ) -> Self {
3275 let piece_iter = text_buffer.piece_tree.iter_pieces_in_range(start, end);
3276
3277 Self {
3278 piece_iter,
3279 buffers: &text_buffer.buffers,
3280 buffer: Vec::with_capacity(chunk_size + overlap),
3281 buffer_absolute_pos: start,
3282 current_pos: start,
3283 end_pos: end,
3284 chunk_size,
3285 overlap,
3286 first_chunk: true,
3287 current_piece_data: None,
3288 current_piece_offset: 0,
3289 }
3290 }
3291
3292 /// Read one byte from the piece iterator
3293 fn read_byte(&mut self) -> Option<u8> {
3294 loop {
3295 // If we have cached piece data, read from it
3296 if let Some(ref data) = self.current_piece_data {
3297 if self.current_piece_offset < data.len() {
3298 let byte = data[self.current_piece_offset];
3299 self.current_piece_offset += 1;
3300 self.current_pos += 1;
3301 return Some(byte);
3302 } else {
3303 // Exhausted current piece, move to next
3304 self.current_piece_data = None;
3305 self.current_piece_offset = 0;
3306 }
3307 }
3308
3309 // Get next piece
3310 if let Some(piece_view) = self.piece_iter.next() {
3311 let buffer_id = piece_view.location.buffer_id();
3312 if let Some(buffer) = self.buffers.get(buffer_id) {
3313 // Extract the relevant slice from this piece
3314 let piece_start_in_doc = piece_view.doc_offset;
3315 let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
3316
3317 // Clip to our search range
3318 let read_start = self.current_pos.max(piece_start_in_doc);
3319 let read_end = self.end_pos.min(piece_end_in_doc);
3320
3321 if read_end > read_start {
3322 let offset_in_piece = read_start - piece_start_in_doc;
3323 let bytes_to_read = read_end - read_start;
3324
3325 let buffer_start = piece_view.buffer_offset + offset_in_piece;
3326 let buffer_end = buffer_start + bytes_to_read;
3327
3328 if let Some(data) = buffer.get_data() {
3329 if buffer_end <= data.len() {
3330 // Cache this piece's data
3331 self.current_piece_data =
3332 Some(data[buffer_start..buffer_end].to_vec());
3333 self.current_piece_offset = 0;
3334 continue;
3335 }
3336 }
3337 }
3338 }
3339 }
3340
3341 // No more data
3342 return None;
3343 }
3344 }
3345
3346 /// Fill the buffer with the next chunk of data
3347 fn fill_next_chunk(&mut self) -> bool {
3348 if self.first_chunk {
3349 // First chunk: fill up to chunk_size
3350 self.first_chunk = false;
3351 while self.buffer.len() < self.chunk_size && self.current_pos < self.end_pos {
3352 if let Some(byte) = self.read_byte() {
3353 self.buffer.push(byte);
3354 } else {
3355 break;
3356 }
3357 }
3358 !self.buffer.is_empty()
3359 } else {
3360 // Subsequent chunks: keep overlap, fill chunk_size NEW bytes
3361 if self.current_pos >= self.end_pos {
3362 return false;
3363 }
3364
3365 // Keep overlap bytes at the end
3366 if self.buffer.len() > self.overlap {
3367 let drain_amount = self.buffer.len() - self.overlap;
3368 self.buffer.drain(0..drain_amount);
3369 self.buffer_absolute_pos += drain_amount;
3370 }
3371
3372 // Fill chunk_size NEW bytes (in addition to overlap)
3373 let before_len = self.buffer.len();
3374 let target_len = self.overlap + self.chunk_size;
3375 while self.buffer.len() < target_len && self.current_pos < self.end_pos {
3376 if let Some(byte) = self.read_byte() {
3377 self.buffer.push(byte);
3378 } else {
3379 break;
3380 }
3381 }
3382
3383 // Return true if we added new data
3384 self.buffer.len() > before_len
3385 }
3386 }
3387}
3388
3389impl<'a> Iterator for OverlappingChunks<'a> {
3390 type Item = ChunkInfo;
3391
3392 fn next(&mut self) -> Option<Self::Item> {
3393 // Track if this is the first chunk before filling
3394 let is_first = self.buffer_absolute_pos == self.current_pos;
3395
3396 if !self.fill_next_chunk() {
3397 return None;
3398 }
3399
3400 // First chunk: all data is valid (no overlap from previous)
3401 // Subsequent chunks: overlap bytes are not valid (already checked)
3402 let valid_start = if is_first {
3403 0
3404 } else {
3405 self.overlap.min(self.buffer.len())
3406 };
3407
3408 Some(ChunkInfo {
3409 buffer: self.buffer.clone(),
3410 absolute_pos: self.buffer_absolute_pos,
3411 valid_start,
3412 })
3413 }
3414}
3415
3416#[cfg(test)]
3417mod tests;
3418
3419#[cfg(test)]
3420mod property_tests;
3421
/// Line data with optional line number
///
/// A plain value type; equality compares all four fields, which makes it
/// straightforward to assert on iterator output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LineData {
    /// Byte offset where this line starts in the document
    pub byte_offset: usize,
    /// Line content (without trailing newline)
    pub content: String,
    /// Whether this line ends with a newline
    pub has_newline: bool,
    /// Line number (None for large files without line metadata)
    pub line_number: Option<usize>,
}
3434
/// Iterator over lines in a TextBuffer that efficiently tracks line numbers
/// using piece tree metadata (single source of truth)
///
/// All lines are gathered when the iterator is constructed; iterating only
/// walks the pre-collected vector.
pub struct TextBufferLineIterator {
    /// Collected lines (we collect all at once since we need mutable access to load chunks)
    lines: Vec<LineData>,
    /// Current index in the lines vector
    current_index: usize,
    /// Whether there are more lines after these
    pub has_more: bool,
}
3445
3446impl TextBufferLineIterator {
3447 pub(crate) fn new(buffer: &mut TextBuffer, byte_pos: usize, max_lines: usize) -> Result<Self> {
3448 let buffer_len = buffer.len();
3449 if byte_pos >= buffer_len {
3450 return Ok(Self {
3451 lines: Vec::new(),
3452 current_index: 0,
3453 has_more: false,
3454 });
3455 }
3456
3457 // Check if buffer has line metadata (None for large files > 1MB)
3458 let has_line_metadata = buffer.line_count().is_some();
3459
3460 // Determine starting line number by querying piece tree once
3461 // (only if we have line metadata)
3462 let mut current_line = if has_line_metadata {
3463 buffer.offset_to_position(byte_pos).map(|pos| pos.line)
3464 } else {
3465 None
3466 };
3467
3468 let mut lines = Vec::with_capacity(max_lines);
3469 let mut current_offset = byte_pos;
3470 let estimated_line_length = 80; // Use default estimate
3471
3472 // Collect lines by scanning forward
3473 for _ in 0..max_lines {
3474 if current_offset >= buffer_len {
3475 break;
3476 }
3477
3478 let line_start = current_offset;
3479 let line_number = current_line;
3480
3481 // Estimate how many bytes to load for this line
3482 let estimated_max_line_length = estimated_line_length * 3;
3483 let bytes_to_scan = estimated_max_line_length.min(buffer_len - current_offset);
3484
3485 // Load chunk (this handles lazy loading)
3486 let chunk = buffer.get_text_range_mut(current_offset, bytes_to_scan)?;
3487
3488 // Scan for newline
3489 let mut line_len = 0;
3490 let mut found_newline = false;
3491 for &byte in chunk.iter() {
3492 line_len += 1;
3493 if byte == b'\n' {
3494 found_newline = true;
3495 break;
3496 }
3497 }
3498
3499 // Handle long lines (rare case)
3500 if !found_newline && current_offset + line_len < buffer_len {
3501 // Line is longer than expected, load more data
3502 let remaining = buffer_len - current_offset - line_len;
3503 let additional_bytes = estimated_max_line_length.min(remaining);
3504 let more_chunk =
3505 buffer.get_text_range_mut(current_offset + line_len, additional_bytes)?;
3506
3507 let mut extended_chunk = chunk;
3508 extended_chunk.extend_from_slice(&more_chunk);
3509
3510 for &byte in more_chunk.iter() {
3511 line_len += 1;
3512 if byte == b'\n' {
3513 found_newline = true;
3514 break;
3515 }
3516 }
3517
3518 let line_string = String::from_utf8_lossy(&extended_chunk[..line_len]).into_owned();
3519 let has_newline = line_string.ends_with('\n');
3520 let content = if has_newline {
3521 line_string[..line_string.len() - 1].to_string()
3522 } else {
3523 line_string
3524 };
3525
3526 lines.push(LineData {
3527 byte_offset: line_start,
3528 content,
3529 has_newline,
3530 line_number,
3531 });
3532
3533 current_offset += line_len;
3534 if has_line_metadata && found_newline {
3535 current_line = current_line.map(|n| n + 1);
3536 }
3537 continue;
3538 }
3539
3540 // Normal case
3541 let line_string = String::from_utf8_lossy(&chunk[..line_len]).into_owned();
3542 let has_newline = line_string.ends_with('\n');
3543 let content = if has_newline {
3544 line_string[..line_string.len() - 1].to_string()
3545 } else {
3546 line_string
3547 };
3548
3549 lines.push(LineData {
3550 byte_offset: line_start,
3551 content,
3552 has_newline,
3553 line_number,
3554 });
3555
3556 current_offset += line_len;
3557 // Increment line number if we have metadata and found a newline
3558 if has_line_metadata && found_newline {
3559 current_line = current_line.map(|n| n + 1);
3560 }
3561 }
3562
3563 // Check if there are more lines
3564 let has_more = current_offset < buffer_len;
3565
3566 Ok(Self {
3567 lines,
3568 current_index: 0,
3569 has_more,
3570 })
3571 }
3572}
3573
3574impl Iterator for TextBufferLineIterator {
3575 type Item = LineData;
3576
3577 fn next(&mut self) -> Option<Self::Item> {
3578 if self.current_index < self.lines.len() {
3579 let line = self.lines[self.current_index].clone();
3580 self.current_index += 1;
3581 Some(line)
3582 } else {
3583 None
3584 }
3585 }
3586}