Skip to main content

fresh/model/buffer/
mod.rs

1/// Text buffer that uses PieceTree with integrated line tracking
2/// Architecture where the tree is the single source of truth for text and line information
3use crate::model::encoding;
4use crate::model::filesystem::{FileSearchOptions, FileSystem};
5use crate::model::piece_tree::{
6    BufferData, BufferLocation, Cursor, PieceInfo, PieceRangeIter, PieceTree, PieceView, Position,
7    StringBuffer, TreeStats,
8};
9use crate::model::piece_tree_diff::PieceTreeDiff;
10use crate::primitives::grapheme;
11use anyhow::{Context, Result};
12use regex::bytes::Regex;
13use std::io;
14
15use std::ops::Range;
16use std::path::{Path, PathBuf};
17use std::sync::Arc;
18
19// Re-export Encoding for backward compatibility
20pub use encoding::Encoding;
21
22pub mod file_kind;
23pub mod format;
24pub mod persistence;
25pub mod save;
26pub mod search;
27pub use file_kind::BufferFileKind;
28pub use format::{BufferFormat, LineEnding};
29pub use persistence::Persistence;
30pub use save::SudoSaveRequired;
31#[cfg(test)]
32pub(crate) use save::{RecipeAction, WriteRecipe};
33#[cfg(test)]
34use search::search_boundary_overlap;
35use search::SearchRegion;
36pub use search::{ChunkedSearchState, HybridSearchPlan};
37
/// Error returned when a large file has a non-resynchronizable encoding
/// and requires user confirmation before loading the entire file into memory.
///
/// Non-resynchronizable encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot
/// determine character boundaries when jumping into the middle of a file.
/// This means the entire file must be loaded and decoded sequentially.
///
/// The `Display` implementation renders this value as a one-line confirmation
/// prompt rather than as a conventional error message.
#[derive(Debug, Clone, PartialEq)]
pub struct LargeFileEncodingConfirmation {
    /// Path to the file
    pub path: PathBuf,
    /// Size of the file in bytes
    pub file_size: usize,
    /// The detected encoding that requires full loading
    pub encoding: Encoding,
}
53
54impl std::fmt::Display for LargeFileEncodingConfirmation {
55    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56        let size_mb = self.file_size as f64 / (1024.0 * 1024.0);
57        write!(
58            f,
59            "{} ({:.0} MB) requires full load. (l)oad, (e)ncoding, (C)ancel? ",
60            self.encoding.display_name(),
61            size_mb
62        )
63    }
64}
65
// Marker impl (no methods overridden) so the confirmation can be used wherever
// a `std::error::Error` is expected, e.g. when it is raised through `anyhow`
// in `load_large_file_internal`.
impl std::error::Error for LargeFileEncodingConfirmation {}
67
/// A work item for incremental line-feed scanning (one per leaf).
///
/// Plain data: it identifies a leaf, records its size, and says whether the
/// leaf still needs to be scanned.
#[derive(Debug, Clone)]
pub struct LineScanChunk {
    /// Index of the leaf in the piece tree's leaf array.
    pub leaf_index: usize,
    /// Number of bytes in this leaf.
    pub byte_len: usize,
    /// True if the leaf already had a known line_feed_cnt (no I/O needed).
    pub already_known: bool,
}
78
79// Re-export SearchMatch from filesystem — same type is used by both
80// FileSystem::search_file (project grep on disk) and the piece-tree
81// search below (in-editor Ctrl+F and dirty buffers).
82pub use crate::model::filesystem::SearchMatch;
83
// Large file support configuration
/// Default threshold, in bytes, for considering a file "large" (100 MB).
///
/// Files whose size is at or above the effective threshold take the
/// large-file loading path; this default applies when a caller passes `0`
/// as the threshold to `load_from_file`.
pub const DEFAULT_LARGE_FILE_THRESHOLD: usize = 100 * 1024 * 1024;

/// Chunk size, in bytes, to load when lazy loading (1 MB)
pub const LOAD_CHUNK_SIZE: usize = 1024 * 1024;

/// Chunk alignment, in bytes, for lazy loading (64 KB)
pub const CHUNK_ALIGNMENT: usize = 64 * 1024;
93
/// Configuration stored on a `TextBuffer`.
///
/// Constructors either accept a `BufferConfig` or fall back to
/// `BufferConfig::default()`.
#[derive(Debug, Clone)]
pub struct BufferConfig {
    /// Estimated average line length in bytes. Used for approximate line number
    /// display in large files and for goto-line byte offset estimation.
    pub estimated_line_length: usize,
}

impl Default for BufferConfig {
    /// Assumes an average line length of 80 bytes.
    fn default() -> Self {
        const TYPICAL_LINE_BYTES: usize = 80;

        BufferConfig {
            estimated_line_length: TYPICAL_LINE_BYTES,
        }
    }
}
109
// NOTE: the line-ending type lives in `format.rs` and is re-exported above as
// `LineEnding`; this used to be a stray doc comment that rustdoc attached to
// `LineNumber`.

/// Represents a line number (simplified for new implementation)
/// Legacy enum kept for backwards compatibility - always Absolute now
///
/// The stored value is zero-based; [`LineNumber::format`] renders it one-based.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineNumber {
    /// Absolute line number - this is the actual line number in the file
    Absolute(usize),
    /// Relative line number (deprecated - now same as Absolute)
    Relative {
        /// The line number value.
        line: usize,
        /// Line the relative value was derived from; not read by any method
        /// of this type.
        from_cached_line: usize,
    },
}

impl LineNumber {
    /// Get the line number value, regardless of variant.
    pub fn value(&self) -> usize {
        match *self {
            Self::Absolute(line) | Self::Relative { line, .. } => line,
        }
    }

    /// Check if this is an absolute line number
    pub fn is_absolute(&self) -> bool {
        matches!(self, Self::Absolute(_))
    }

    /// Check if this is a relative line number
    pub fn is_relative(&self) -> bool {
        matches!(self, Self::Relative { .. })
    }

    /// Format the line number for display.
    ///
    /// The value is shown one-based (stored value + 1). Relative numbers are
    /// prefixed with `~`.
    pub fn format(&self) -> String {
        let one_based = self.value() + 1;
        match self {
            Self::Absolute(_) => one_based.to_string(),
            Self::Relative { .. } => format!("~{}", one_based),
        }
    }
}
151
/// A text buffer that manages document content using a piece table
/// with integrated line tracking
///
/// Content is always addressed through `piece_tree`; the pieces refer to
/// entries of `buffers` by buffer location.
pub struct TextBuffer {
    /// The piece tree for efficient text manipulation with integrated line tracking
    piece_tree: PieceTree,

    /// List of string buffers containing chunks of text data.
    /// Index 0 is typically the original/stored buffer.
    /// Additional buffers are added for modifications.
    buffers: Vec<StringBuffer>,

    /// Next buffer ID to assign.
    /// Every constructor in this file starts it at 1 because buffer 0 is
    /// created up front.
    next_buffer_id: usize,

    /// Filesystem handle, optional file path, dirty/recovery flags,
    /// saved-root snapshot, and saved-file size — see
    /// `persistence.rs`.
    persistence: Persistence,

    /// File-kind flags (large_file, line_feeds_scanned, is_binary) —
    /// see `file_kind.rs`.
    file_kind: BufferFileKind,

    /// Encoding + line-ending state — see `format.rs`.
    format: BufferFormat,

    /// Monotonic version counter for change tracking.
    /// Incremented with wrapping arithmetic by `bump_version`.
    version: u64,

    /// Buffer configuration (estimated line length, etc.)
    config: BufferConfig,
}
184
/// Snapshot of a TextBuffer's piece tree and associated string buffers.
///
/// Used by BulkEdit undo/redo to capture the complete buffer state.
/// Without this, consolidate_after_save() would destroy the string buffers
/// that a BulkEdit's piece tree snapshot references, causing corruption on undo.
#[derive(Debug, Clone)]
pub struct BufferSnapshot {
    /// The piece tree at the time of the snapshot.
    pub piece_tree: PieceTree,
    /// The string buffers the snapshotted piece tree refers to.
    pub buffers: Vec<StringBuffer>,
    /// The buffer-ID counter at the time of the snapshot.
    pub next_buffer_id: usize,
}
196
197impl TextBuffer {
198    /// Create a new text buffer with the given filesystem implementation.
199    /// Note: large_file_threshold is ignored in the new implementation
200    pub fn new(_large_file_threshold: usize, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
201        let piece_tree = PieceTree::empty();
202        let saved_root = piece_tree.root();
203        let line_ending = LineEnding::default();
204        let encoding = Encoding::default();
205        TextBuffer {
206            piece_tree,
207            buffers: vec![StringBuffer::new(0, Vec::new())],
208            next_buffer_id: 1,
209            persistence: Persistence::new(fs, None, saved_root, None),
210            file_kind: BufferFileKind::new(false, false),
211            format: BufferFormat::new(line_ending, encoding),
212            version: 0,
213            config: BufferConfig::default(),
214        }
215    }
216
217    /// Create an empty buffer associated with a file path.
218    /// Used for files that don't exist yet — the path is set so saving will create the file.
219    pub fn new_with_path(
220        large_file_threshold: usize,
221        fs: Arc<dyn FileSystem + Send + Sync>,
222        path: PathBuf,
223    ) -> Self {
224        let mut buffer = Self::new(large_file_threshold, fs);
225        buffer.persistence.set_file_path(path);
226        buffer
227    }
228
    /// Current buffer version (monotonic, wraps on overflow).
    ///
    /// Starts at 0 for every constructor in this file and is advanced by
    /// `bump_version`.
    pub fn version(&self) -> u64 {
        self.version
    }
233
    /// Get a reference to the filesystem implementation used by this buffer.
    ///
    /// The handle is owned by the buffer's persistence state.
    pub fn filesystem(&self) -> &Arc<dyn FileSystem + Send + Sync> {
        self.persistence.fs()
    }
238
    /// Set the filesystem implementation for this buffer.
    ///
    /// Only the handle held by the persistence state is replaced; nothing
    /// else on the buffer is touched here.
    pub fn set_filesystem(&mut self, fs: Arc<dyn FileSystem + Send + Sync>) {
        self.persistence.set_fs(fs);
    }
243
    /// Advance the version counter by one.
    ///
    /// Uses wrapping arithmetic, so reaching `u64::MAX` rolls over to 0
    /// instead of panicking.
    #[inline]
    fn bump_version(&mut self) {
        self.version = self.version.wrapping_add(1);
    }
248
    /// Record that the buffer content changed: marks the persistence state
    /// dirty and bumps the version counter.
    #[inline]
    fn mark_content_modified(&mut self) {
        self.persistence.mark_dirty();
        self.bump_version();
    }
254
255    /// Create a text buffer from raw bytes WITHOUT encoding conversion.
256    /// Used for binary files where we want to preserve the exact bytes.
257    fn from_bytes_raw(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
258        let bytes = content.len();
259
260        // For binary files, detect line ending but don't convert encoding
261        let line_ending = format::detect_line_ending(&content);
262
263        // Create initial StringBuffer with ID 0
264        let buffer = StringBuffer::new(0, content);
265        let line_feed_cnt = buffer.line_feed_count();
266
267        let piece_tree = if bytes > 0 {
268            PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
269        } else {
270            PieceTree::empty()
271        };
272
273        let saved_root = piece_tree.root();
274
275        TextBuffer {
276            piece_tree,
277            buffers: vec![buffer],
278            next_buffer_id: 1,
279            persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
280            file_kind: BufferFileKind::new(false, true),
281            format: BufferFormat::new(line_ending, Encoding::Utf8),
282            version: 0,
283            config: BufferConfig::default(),
284        }
285    }
286
287    /// Create a text buffer from initial content with the given filesystem.
288    pub fn from_bytes(content: Vec<u8>, fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
289        // Auto-detect encoding and convert to UTF-8 if needed
290        let (encoding, utf8_content) = format::detect_and_convert_encoding(&content);
291
292        let bytes = utf8_content.len();
293
294        // Auto-detect line ending format from content
295        let line_ending = format::detect_line_ending(&utf8_content);
296
297        // Create initial StringBuffer with ID 0
298        let buffer = StringBuffer::new(0, utf8_content);
299        let line_feed_cnt = buffer.line_feed_count();
300
301        let piece_tree = if bytes > 0 {
302            PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
303        } else {
304            PieceTree::empty()
305        };
306
307        let saved_root = piece_tree.root();
308
309        TextBuffer {
310            piece_tree,
311            buffers: vec![buffer],
312            next_buffer_id: 1,
313            persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
314            file_kind: BufferFileKind::new(false, false),
315            format: BufferFormat::new(line_ending, encoding),
316            version: 0,
317            config: BufferConfig::default(),
318        }
319    }
320
321    /// Create a text buffer from bytes with a specific encoding (no auto-detection).
322    pub fn from_bytes_with_encoding(
323        content: Vec<u8>,
324        encoding: Encoding,
325        fs: Arc<dyn FileSystem + Send + Sync>,
326    ) -> Self {
327        // Convert from specified encoding to UTF-8
328        let utf8_content = encoding::convert_to_utf8(&content, encoding);
329
330        let bytes = utf8_content.len();
331
332        // Auto-detect line ending format from content
333        let line_ending = format::detect_line_ending(&utf8_content);
334
335        // Create initial StringBuffer with ID 0
336        let buffer = StringBuffer::new(0, utf8_content);
337        let line_feed_cnt = buffer.line_feed_count();
338
339        let piece_tree = if bytes > 0 {
340            PieceTree::new(BufferLocation::Stored(0), 0, bytes, line_feed_cnt)
341        } else {
342            PieceTree::empty()
343        };
344
345        let saved_root = piece_tree.root();
346
347        TextBuffer {
348            piece_tree,
349            buffers: vec![buffer],
350            next_buffer_id: 1,
351            persistence: Persistence::new(fs, None, saved_root, Some(bytes)),
352            file_kind: BufferFileKind::new(false, false),
353            format: BufferFormat::new(line_ending, encoding),
354            version: 0,
355            config: BufferConfig::default(),
356        }
357    }
358
359    /// Create a text buffer from a string with the given filesystem.
360    pub fn from_str(
361        s: &str,
362        _large_file_threshold: usize,
363        fs: Arc<dyn FileSystem + Send + Sync>,
364    ) -> Self {
365        Self::from_bytes(s.as_bytes().to_vec(), fs)
366    }
367
368    /// Create an empty text buffer with the given filesystem.
369    pub fn empty(fs: Arc<dyn FileSystem + Send + Sync>) -> Self {
370        let piece_tree = PieceTree::empty();
371        let saved_root = piece_tree.root();
372        let line_ending = LineEnding::default();
373        let encoding = Encoding::default();
374        TextBuffer {
375            piece_tree,
376            buffers: vec![StringBuffer::new(0, Vec::new())],
377            next_buffer_id: 1,
378            persistence: Persistence::new(fs, None, saved_root, None),
379            file_kind: BufferFileKind::new(false, false),
380            format: BufferFormat::new(line_ending, encoding),
381            version: 0,
382            config: BufferConfig::default(),
383        }
384    }
385
386    /// Load a text buffer from a file using the given filesystem.
387    pub fn load_from_file<P: AsRef<Path>>(
388        path: P,
389        large_file_threshold: usize,
390        fs: Arc<dyn FileSystem + Send + Sync>,
391    ) -> anyhow::Result<Self> {
392        Self::load_from_file_internal(path, large_file_threshold, fs, false)
393    }
394
395    /// Load a text buffer from a file, forcing it to be treated as text.
396    ///
397    /// Identical to [`load_from_file`](Self::load_from_file) but skips binary
398    /// detection entirely — the buffer is always loaded through the text path
399    /// and `is_binary` stays `false`. Used for the terminal scrollback backing
400    /// file, whose raw PTY output can contain control bytes that would
401    /// otherwise trip binary detection and suppress ANSI-color rendering in
402    /// scrollback mode (issue #2449).
403    pub fn load_from_file_force_text<P: AsRef<Path>>(
404        path: P,
405        large_file_threshold: usize,
406        fs: Arc<dyn FileSystem + Send + Sync>,
407    ) -> anyhow::Result<Self> {
408        Self::load_from_file_internal(path, large_file_threshold, fs, true)
409    }
410
411    fn load_from_file_internal<P: AsRef<Path>>(
412        path: P,
413        large_file_threshold: usize,
414        fs: Arc<dyn FileSystem + Send + Sync>,
415        force_text: bool,
416    ) -> anyhow::Result<Self> {
417        let path = path.as_ref();
418
419        // Get file size to determine loading strategy
420        let metadata = fs.metadata(path)?;
421        let file_size = metadata.size as usize;
422
423        // Use threshold parameter or default
424        let threshold = if large_file_threshold > 0 {
425            large_file_threshold
426        } else {
427            DEFAULT_LARGE_FILE_THRESHOLD
428        };
429
430        // Choose loading strategy based on file size
431        if file_size >= threshold {
432            Self::load_large_file_internal(path, file_size, fs, false, force_text)
433        } else {
434            Self::load_small_file(path, fs, force_text)
435        }
436    }
437
438    /// Load a text buffer from a file with a specific encoding (no auto-detection).
439    pub fn load_from_file_with_encoding<P: AsRef<Path>>(
440        path: P,
441        encoding: Encoding,
442        fs: Arc<dyn FileSystem + Send + Sync>,
443        config: BufferConfig,
444    ) -> anyhow::Result<Self> {
445        let path = path.as_ref();
446        let contents = fs.read_file(path)?;
447
448        let mut buffer = Self::from_bytes_with_encoding(contents, encoding, fs);
449        buffer.persistence.set_file_path(path.to_path_buf());
450        buffer.persistence.clear_modified();
451        buffer.config = config;
452        Ok(buffer)
453    }
454
455    /// Load a small file with full eager loading and line indexing
456    ///
457    /// When `force_text` is true, binary detection is ignored and the file is
458    /// always loaded through the text path (see `load_from_file_force_text`).
459    fn load_small_file(
460        path: &Path,
461        fs: Arc<dyn FileSystem + Send + Sync>,
462        force_text: bool,
463    ) -> anyhow::Result<Self> {
464        let contents = fs.read_file(path)?;
465
466        // Use unified encoding/binary detection
467        let (encoding, detected_binary) = format::detect_encoding_or_binary(&contents, false);
468        let is_binary = detected_binary && !force_text;
469
470        // For binary files, skip encoding conversion to preserve raw bytes
471        let mut buffer = if is_binary {
472            Self::from_bytes_raw(contents, fs)
473        } else {
474            // from_bytes handles encoding detection/conversion and line ending detection
475            Self::from_bytes(contents, fs)
476        };
477        buffer.persistence.set_file_path(path.to_path_buf());
478        buffer.persistence.clear_modified();
479        buffer.file_kind.set_large_file(false);
480        buffer.file_kind.set_binary(is_binary);
481        // For binary files, ensure encoding matches detection
482        if is_binary {
483            buffer.format.set_default_encoding(encoding);
484        }
485        // Note: line_ending and encoding are already set by from_bytes/from_bytes_raw
486        Ok(buffer)
487    }
488
489    /// Check if loading a large file requires user confirmation due to encoding.
490    ///
491    /// Some encodings (like Shift-JIS, GB18030, GBK, EUC-KR) cannot be "resynchronized" -
492    /// meaning you cannot determine character boundaries when jumping into the middle
493    /// of a file. These encodings require loading the entire file into memory.
494    ///
495    /// Returns `Some(confirmation)` if user confirmation is needed, `None` if the file
496    /// can be loaded with lazy/streaming loading.
497    pub fn check_large_file_encoding(
498        path: impl AsRef<Path>,
499        fs: Arc<dyn FileSystem + Send + Sync>,
500    ) -> anyhow::Result<Option<LargeFileEncodingConfirmation>> {
501        let path = path.as_ref();
502        let metadata = fs.metadata(path)?;
503        let file_size = metadata.size as usize;
504
505        // Only check for large files
506        if file_size < DEFAULT_LARGE_FILE_THRESHOLD {
507            return Ok(None);
508        }
509
510        // Read a sample to detect encoding
511        let sample_size = file_size.min(8 * 1024);
512        let sample = fs.read_range(path, 0, sample_size)?;
513        let (encoding, is_binary) =
514            format::detect_encoding_or_binary(&sample, file_size > sample_size);
515
516        // Binary files don't need confirmation (loaded as-is)
517        if is_binary {
518            return Ok(None);
519        }
520
521        // Check if the encoding requires full file loading
522        if encoding.requires_full_file_load() {
523            return Ok(Some(LargeFileEncodingConfirmation {
524                path: path.to_path_buf(),
525                file_size,
526                encoding,
527            }));
528        }
529
530        Ok(None)
531    }
532
533    /// Load a large file, optionally forcing full load for non-resynchronizable encodings.
534    ///
535    /// Called with `force_full_load=true` after user confirms the warning about
536    /// non-resynchronizable encodings requiring full file loading.
537    pub fn load_large_file_confirmed(
538        path: impl AsRef<Path>,
539        fs: Arc<dyn FileSystem + Send + Sync>,
540    ) -> anyhow::Result<Self> {
541        let path = path.as_ref();
542        let metadata = fs.metadata(path)?;
543        let file_size = metadata.size as usize;
544        Self::load_large_file_internal(path, file_size, fs, true, false)
545    }
546
547    /// Internal implementation for loading large files.
548    ///
549    /// When `force_text` is true, binary detection is ignored and the file is
550    /// always loaded through the text path (see `load_from_file_force_text`).
551    fn load_large_file_internal(
552        path: &Path,
553        file_size: usize,
554        fs: Arc<dyn FileSystem + Send + Sync>,
555        force_full_load: bool,
556        force_text: bool,
557    ) -> anyhow::Result<Self> {
558        use crate::model::piece_tree::{BufferData, BufferLocation};
559
560        // Read a sample of the file to detect encoding and whether it's binary
561        // We read the first 8KB for detection
562        let sample_size = file_size.min(8 * 1024);
563        let sample = fs.read_range(path, 0, sample_size)?;
564
565        // Use unified encoding/binary detection
566        let (encoding, detected_binary) =
567            format::detect_encoding_or_binary(&sample, file_size > sample_size);
568        let is_binary = detected_binary && !force_text;
569
570        // Binary files skip encoding conversion to preserve raw bytes
571        if is_binary {
572            tracing::info!("Large binary file detected, loading without encoding conversion");
573            let contents = fs.read_file(path)?;
574            let mut buffer = Self::from_bytes_raw(contents, fs);
575            buffer.persistence.set_file_path(path.to_path_buf());
576            buffer.persistence.clear_modified();
577            buffer.file_kind.set_large_file(true);
578            buffer.format.set_default_encoding(encoding);
579            return Ok(buffer);
580        }
581
582        // Check if encoding requires full file loading
583        let requires_full_load = encoding.requires_full_file_load();
584
585        // For non-resynchronizable encodings, require confirmation unless forced
586        if requires_full_load && !force_full_load {
587            anyhow::bail!(LargeFileEncodingConfirmation {
588                path: path.to_path_buf(),
589                file_size,
590                encoding,
591            });
592        }
593
594        // For encodings that require full load (non-resynchronizable or non-UTF-8),
595        // load the entire file and convert
596        if !matches!(encoding, Encoding::Utf8 | Encoding::Ascii) {
597            tracing::info!(
598                "Large file with non-UTF-8 encoding ({:?}), loading fully for conversion",
599                encoding
600            );
601            let contents = fs.read_file(path)?;
602            let mut buffer = Self::from_bytes(contents, fs);
603            buffer.persistence.set_file_path(path.to_path_buf());
604            buffer.persistence.clear_modified();
605            buffer.file_kind.set_large_file(true); // Still mark as large file for UI purposes
606            buffer.file_kind.set_binary(is_binary);
607            return Ok(buffer);
608        }
609
610        // UTF-8/ASCII files can use lazy loading
611        let line_ending = format::detect_line_ending(&sample);
612
613        // Create an unloaded buffer that references the entire file
614        let buffer = StringBuffer {
615            id: 0,
616            data: BufferData::Unloaded {
617                file_path: path.to_path_buf(),
618                file_offset: 0,
619                bytes: file_size,
620            },
621            stored_file_offset: None,
622        };
623
624        // Create piece tree with a single piece covering the whole file
625        // No line feed count (None) since we're not computing line indexing
626        let piece_tree = if file_size > 0 {
627            PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
628        } else {
629            PieceTree::empty()
630        };
631        let saved_root = piece_tree.root();
632
633        tracing::debug!(
634            "Buffer::load_from_file: loaded {} bytes, saved_file_size={}",
635            file_size,
636            file_size
637        );
638
639        Ok(TextBuffer {
640            piece_tree,
641            buffers: vec![buffer],
642            next_buffer_id: 1,
643            persistence: Persistence::new(
644                fs,
645                Some(path.to_path_buf()),
646                saved_root,
647                Some(file_size),
648            ),
649            file_kind: BufferFileKind::new(true, is_binary),
650            format: BufferFormat::new(line_ending, encoding),
651            version: 0,
652            config: BufferConfig::default(),
653        })
654    }
655
656    /// Save the buffer to its associated file
657    pub fn save(&mut self) -> anyhow::Result<()> {
658        if let Some(path) = self.persistence.file_path_owned() {
659            self.save_to_file(path)
660        } else {
661            anyhow::bail!(io::Error::new(
662                io::ErrorKind::NotFound,
663                "No file path associated with buffer",
664            ))
665        }
666    }
667
    /// Build a write recipe from the piece tree for saving.
    ///
    /// Test-only convenience wrapper: delegates to `save::build_write_recipe`
    /// with this buffer's piece tree, string buffers, format, file-kind and
    /// persistence state — the same arguments `save_to_file` passes.
    #[cfg(test)]
    pub(crate) fn build_write_recipe(&self) -> io::Result<WriteRecipe> {
        save::build_write_recipe(
            &self.piece_tree,
            &self.buffers,
            &self.format,
            &self.file_kind,
            &self.persistence,
        )
    }
681
682    /// Save the buffer to a specific file
683    ///
684    /// Uses the write recipe approach for both local and remote filesystems:
685    /// - Copy ops reference unchanged regions in the source file
686    /// - Insert ops contain new/modified data
687    ///
688    /// For remote filesystems, the recipe is sent to the agent which reconstructs
689    /// the file server-side, avoiding transfer of unchanged content.
690    ///
691    /// For local filesystems with ownership concerns (file owned by another user),
692    /// uses in-place writing to preserve ownership. Otherwise uses atomic writes.
693    ///
694    /// If the line ending format has been changed (via set_line_ending), all content
695    /// will be converted to the new format during save.
696    pub fn save_to_file<P: AsRef<Path>>(&mut self, path: P) -> anyhow::Result<()> {
697        let dest_path = path.as_ref();
698        let total = self.total_bytes();
699
700        // Handle empty files
701        if total == 0 {
702            self.persistence.fs().write_file(dest_path, &[])?;
703            self.finalize_save(dest_path)?;
704            return Ok(());
705        }
706
707        // Build the write recipe (unified for all filesystem types)
708        let recipe = save::build_write_recipe(
709            &self.piece_tree,
710            &self.buffers,
711            &self.format,
712            &self.file_kind,
713            &self.persistence,
714        )?;
715        let ops = recipe.to_write_ops();
716
717        // Check if we need in-place writing to preserve file ownership (local only)
718        // Remote filesystems handle this differently
719        let fs = self.persistence.fs();
720        let is_local = fs.remote_connection_info().is_none();
721        let use_inplace = is_local && save::should_use_inplace_write(fs, dest_path);
722
723        if use_inplace {
724            // In-place write: write directly to preserve ownership
725            save::save_with_inplace_write(fs, dest_path, &recipe)?;
726        } else if !recipe.has_copy_ops() && !is_local {
727            // Remote with no Copy ops: use write_file directly (more efficient)
728            let data = recipe.flatten_inserts();
729            fs.write_file(dest_path, &data)?;
730        } else if is_local {
731            // Local: use write_file or write_patched with sudo fallback
732            let write_result = if !recipe.has_copy_ops() {
733                let data = recipe.flatten_inserts();
734                fs.write_file(dest_path, &data)
735            } else {
736                let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
737                fs.write_patched(src_for_patch, dest_path, &ops)
738            };
739
740            if let Err(e) = write_result {
741                if e.kind() == io::ErrorKind::PermissionDenied {
742                    // Create temp file and return sudo error
743                    let original_metadata = fs.metadata_if_exists(dest_path);
744                    let (temp_path, mut temp_file) = save::create_temp_file(fs, dest_path)?;
745                    save::write_recipe_to_file(fs, &mut temp_file, &recipe)?;
746                    temp_file.sync_all()?;
747                    drop(temp_file);
748                    return Err(save::make_sudo_error(
749                        temp_path,
750                        dest_path,
751                        original_metadata,
752                    ));
753                }
754                return Err(e.into());
755            }
756        } else {
757            // Remote with Copy ops: use write_patched
758            let src_for_patch = recipe.src_path.as_deref().unwrap_or(dest_path);
759            fs.write_patched(src_for_patch, dest_path, &ops)?;
760        }
761
762        self.finalize_save(dest_path)?;
763        Ok(())
764    }
765
766    /// Finalize save state after successful write.
767    fn finalize_save(&mut self, dest_path: &Path) -> anyhow::Result<()> {
768        let new_size = self.persistence.fs().metadata(dest_path)?.size as usize;
769        tracing::debug!(
770            "Buffer::save: updating saved_file_size from {:?} to {}",
771            self.persistence.saved_file_size(),
772            new_size
773        );
774        self.persistence.set_saved_file_size(Some(new_size));
775        self.persistence.set_file_path(dest_path.to_path_buf());
776
777        // Consolidate the piece tree to synchronize with disk (for large files)
778        // or to simplify structure (for small files).
779        self.consolidate_after_save(dest_path, new_size);
780
781        self.mark_saved_snapshot();
782        self.format.promote_current_to_original();
783        Ok(())
784    }
785
786    /// Finalize buffer state after an external save operation (e.g., via sudo).
787    ///
788    /// This updates the saved snapshot and file size to match the new state on disk.
789    pub fn finalize_external_save(&mut self, dest_path: PathBuf) -> anyhow::Result<()> {
790        let new_size = self.persistence.fs().metadata(&dest_path)?.size as usize;
791        self.persistence.set_saved_file_size(Some(new_size));
792        self.persistence.set_file_path(dest_path.clone());
793
794        // Consolidate the piece tree to synchronize with disk or simplify structure.
795        self.consolidate_after_save(&dest_path, new_size);
796
797        self.mark_saved_snapshot();
798        self.format.promote_current_to_original();
799        Ok(())
800    }
801
802    /// Consolidate the piece tree into a single piece.
803    /// For large files, this creates a reference to the disk file to save memory and sync offsets.
804    /// For small files, this flattens all edits into a single in-memory buffer.
805    fn consolidate_after_save(&mut self, path: &Path, file_size: usize) {
806        if self.file_kind.is_large_file() {
807            self.consolidate_large_file(path, file_size);
808        } else {
809            self.consolidate_small_file();
810        }
811    }
812
813    /// Consolidate large file piece tree into a single piece pointing to the new file.
814    /// This ensures that subsequent operations correctly reference the new content and offsets.
815    /// Preserves total line feed count from the old tree if a scan was previously done.
816    fn consolidate_large_file(&mut self, path: &Path, file_size: usize) {
817        // Preserve line feed count from the old tree if we had scanned it
818        let preserved_lf = if self.file_kind.has_line_feed_scan() {
819            self.piece_tree.line_count().map(|c| c.saturating_sub(1))
820        } else {
821            None
822        };
823
824        let buffer = StringBuffer {
825            id: 0,
826            data: BufferData::Unloaded {
827                file_path: path.to_path_buf(),
828                file_offset: 0,
829                bytes: file_size,
830            },
831            stored_file_offset: None,
832        };
833
834        self.piece_tree = if file_size > 0 {
835            PieceTree::new(BufferLocation::Stored(0), 0, file_size, preserved_lf)
836        } else {
837            PieceTree::empty()
838        };
839
840        self.buffers = vec![buffer];
841        self.next_buffer_id = 1;
842
843        tracing::debug!(
844            "Buffer::consolidate_large_file: consolidated into single piece of {} bytes",
845            file_size
846        );
847    }
848
849    /// Consolidate small file edits into a single in-memory buffer and re-index lines.
850    fn consolidate_small_file(&mut self) {
851        if let Some(bytes) = self.get_all_text() {
852            let line_feed_cnt = bytes.iter().filter(|&&b| b == b'\n').count();
853            let len = bytes.len();
854
855            // Create a single loaded buffer with line indexing
856            let buffer = StringBuffer::new_loaded(0, bytes, true);
857
858            self.piece_tree = if len > 0 {
859                PieceTree::new(BufferLocation::Stored(0), 0, len, Some(line_feed_cnt))
860            } else {
861                PieceTree::empty()
862            };
863
864            self.buffers = vec![buffer];
865            self.next_buffer_id = 1;
866
867            tracing::debug!(
868                "Buffer::consolidate_small_file: consolidated into single loaded buffer of {} bytes",
869                len
870            );
871        }
872    }
873
874    /// Get the total number of bytes in the document
875    pub fn total_bytes(&self) -> usize {
876        self.piece_tree.total_bytes()
877    }
878
879    /// Get the total number of lines in the document
880    /// Uses the piece tree's integrated line tracking
881    /// Returns None if line count is unknown (e.g., for large files without line indexing)
882    pub fn line_count(&self) -> Option<usize> {
883        self.piece_tree.line_count()
884    }
885
886    /// Snapshot the current tree as the saved baseline
887    pub fn mark_saved_snapshot(&mut self) {
888        self.persistence.mark_saved_snapshot(&self.piece_tree);
889    }
890
891    /// Refresh the saved root to match the current tree structure without
892    /// clearing the modified flag.  Call this after structural-only changes
893    /// (e.g. chunk_split_and_load during search scan) so that
894    /// `diff_since_saved()` can take the fast `Arc::ptr_eq` path.
895    pub fn refresh_saved_root_if_unmodified(&mut self) {
896        self.persistence
897            .refresh_saved_root_if_unmodified(&self.piece_tree);
898    }
899
900    /// Diff the current piece tree against the last saved snapshot.
901    ///
902    /// See `Persistence::diff_since_saved` for the algorithm.
903    pub fn diff_since_saved(&self) -> PieceTreeDiff {
904        let _span = tracing::info_span!(
905            "diff_since_saved",
906            large_file = self.file_kind.is_large_file(),
907            modified = self.persistence.is_modified(),
908            lf_scanned = self.file_kind.has_line_feed_scan()
909        )
910        .entered();
911
912        self.persistence
913            .diff_since_saved(&self.piece_tree, &self.buffers)
914    }
915
916    /// Convert a byte offset to a line/column position
917    pub fn offset_to_position(&self, offset: usize) -> Option<Position> {
918        self.piece_tree
919            .offset_to_position(offset, &self.buffers)
920            .map(|(line, column)| Position { line, column })
921    }
922
923    /// Convert a line/column position to a byte offset
924    pub fn position_to_offset(&self, position: Position) -> usize {
925        self.piece_tree
926            .position_to_offset(position.line, position.column, &self.buffers)
927    }
928
929    /// Insert text at the given byte offset
930    pub fn insert_bytes(&mut self, offset: usize, text: Vec<u8>) -> Cursor {
931        if text.is_empty() {
932            return self.piece_tree.cursor_at_offset(offset);
933        }
934
935        // Mark as modified (updates version)
936        self.mark_content_modified();
937
938        // Count line feeds in the text to insert
939        let line_feed_cnt = Some(text.iter().filter(|&&b| b == b'\n').count());
940
941        // Optimization: try to append to existing buffer if insertion is at piece boundary
942        let (buffer_location, buffer_offset, text_len) =
943            if let Some(append_info) = self.try_append_to_existing_buffer(offset, &text) {
944                append_info
945            } else {
946                // Create a new StringBuffer for this insertion
947                let buffer_id = self.next_buffer_id;
948                self.next_buffer_id += 1;
949                let buffer = StringBuffer::new(buffer_id, text.clone());
950                self.buffers.push(buffer);
951                (BufferLocation::Added(buffer_id), 0, text.len())
952            };
953
954        // When line feeds have been scanned, ensure the chunk at the insertion
955        // point is loaded so compute_line_feeds_static can recount during splits.
956        if self.file_kind.has_line_feed_scan() {
957            self.ensure_chunk_loaded_at(offset);
958        }
959
960        // Update piece tree (need to pass buffers reference)
961        self.piece_tree.insert(
962            offset,
963            buffer_location,
964            buffer_offset,
965            text_len,
966            line_feed_cnt,
967            &self.buffers,
968        )
969    }
970
971    /// Try to append to an existing buffer if insertion point aligns with buffer end
972    /// Returns (BufferLocation, buffer_offset, text_len) if append succeeds, None otherwise
973    fn try_append_to_existing_buffer(
974        &mut self,
975        offset: usize,
976        text: &[u8],
977    ) -> Option<(BufferLocation, usize, usize)> {
978        // Only optimize for non-empty insertions after existing content
979        if text.is_empty() || offset == 0 {
980            return None;
981        }
982
983        // Find the piece containing the byte just before the insertion point
984        // This avoids the saturating_sub issue
985        let piece_info = self.piece_tree.find_by_offset(offset - 1)?;
986
987        // Check if insertion is exactly at the end of this piece
988        // offset_in_piece tells us where (offset-1) is within the piece
989        // For insertion to be at piece end, (offset-1) must be the last byte
990        let offset_in_piece = piece_info.offset_in_piece?;
991        if offset_in_piece + 1 != piece_info.bytes {
992            return None; // Not at the end of the piece
993        }
994
995        // Only append to "Added" buffers (not original Stored buffers)
996        if !matches!(piece_info.location, BufferLocation::Added(_)) {
997            return None;
998        }
999
1000        let buffer_id = piece_info.location.buffer_id();
1001        let buffer = self.buffers.get_mut(buffer_id)?;
1002
1003        // Check if buffer is loaded
1004        let buffer_len = buffer.get_data()?.len();
1005
1006        // Check if this piece ends exactly at the end of its buffer
1007        if piece_info.offset + piece_info.bytes != buffer_len {
1008            return None;
1009        }
1010
1011        // Perfect! Append to this buffer
1012        let append_offset = buffer.append(text);
1013
1014        Some((piece_info.location, append_offset, text.len()))
1015    }
1016
1017    /// Insert text (from &str) at the given byte offset
1018    pub fn insert(&mut self, offset: usize, text: &str) {
1019        self.insert_bytes(offset, text.as_bytes().to_vec());
1020    }
1021
1022    /// Insert text at a line/column position
1023    /// This now uses the optimized piece_tree.insert_at_position() for a single traversal
1024    pub fn insert_at_position(&mut self, position: Position, text: Vec<u8>) -> Cursor {
1025        if text.is_empty() {
1026            let offset = self.position_to_offset(position);
1027            return self.piece_tree.cursor_at_offset(offset);
1028        }
1029
1030        self.mark_content_modified();
1031
1032        // Count line feeds in the text to insert
1033        let line_feed_cnt = text.iter().filter(|&&b| b == b'\n').count();
1034
1035        // Create a new StringBuffer for this insertion
1036        let buffer_id = self.next_buffer_id;
1037        self.next_buffer_id += 1;
1038        let buffer = StringBuffer::new(buffer_id, text.clone());
1039        self.buffers.push(buffer);
1040
1041        // Use the optimized position-based insertion (single traversal)
1042        self.piece_tree.insert_at_position(
1043            position.line,
1044            position.column,
1045            BufferLocation::Added(buffer_id),
1046            0,
1047            text.len(),
1048            line_feed_cnt,
1049            &self.buffers,
1050        )
1051    }
1052
1053    /// Delete text starting at the given byte offset
1054    pub fn delete_bytes(&mut self, offset: usize, bytes: usize) {
1055        if bytes == 0 || offset >= self.total_bytes() {
1056            return;
1057        }
1058
1059        // When line feeds have been scanned, ensure chunks at delete boundaries
1060        // are loaded so compute_line_feeds_static can recount during splits.
1061        if self.file_kind.has_line_feed_scan() {
1062            self.ensure_chunk_loaded_at(offset);
1063            let end = (offset + bytes).min(self.total_bytes());
1064            if end > offset {
1065                self.ensure_chunk_loaded_at(end.saturating_sub(1));
1066            }
1067        }
1068
1069        // Update piece tree
1070        self.piece_tree.delete(offset, bytes, &self.buffers);
1071
1072        self.mark_content_modified();
1073    }
1074
1075    /// Delete text in a range
1076    pub fn delete(&mut self, range: Range<usize>) {
1077        if range.end > range.start {
1078            self.delete_bytes(range.start, range.end - range.start);
1079        }
1080    }
1081
1082    /// Delete text in a line/column range
1083    /// This now uses the optimized piece_tree.delete_position_range() for a single traversal
1084    pub fn delete_range(&mut self, start: Position, end: Position) {
1085        // Use the optimized position-based deletion
1086        self.piece_tree.delete_position_range(
1087            start.line,
1088            start.column,
1089            end.line,
1090            end.column,
1091            &self.buffers,
1092        );
1093        self.mark_content_modified();
1094    }
1095
1096    /// Replace the entire buffer content with new content
1097    /// This is an O(n) operation that rebuilds the piece tree in a single pass,
1098    /// avoiding the O(n²) complexity of applying individual edits.
1099    ///
1100    /// This is used for bulk operations like "replace all" where applying
1101    /// individual edits would be prohibitively slow.
1102    pub fn replace_content(&mut self, new_content: &str) {
1103        let bytes = new_content.len();
1104        let content_bytes = new_content.as_bytes().to_vec();
1105
1106        // Count line feeds in the new content
1107        let line_feed_cnt = content_bytes.iter().filter(|&&b| b == b'\n').count();
1108
1109        // Create a new StringBuffer for the new content
1110        let buffer_id = self.next_buffer_id;
1111        self.next_buffer_id += 1;
1112        let buffer = StringBuffer::new(buffer_id, content_bytes);
1113        self.buffers.push(buffer);
1114
1115        // Rebuild the piece tree with a single piece containing all the new content
1116        if bytes > 0 {
1117            self.piece_tree = PieceTree::new(
1118                BufferLocation::Added(buffer_id),
1119                0,
1120                bytes,
1121                Some(line_feed_cnt),
1122            );
1123        } else {
1124            self.piece_tree = PieceTree::empty();
1125        }
1126
1127        self.mark_content_modified();
1128    }
1129
1130    /// Restore a previously saved buffer state (for undo/redo of BulkEdit).
1131    ///
1132    /// This restores the piece tree AND the buffers list, which is critical
1133    /// because consolidate_after_save() replaces self.buffers. Without restoring
1134    /// buffers, the piece tree would reference buffer IDs that no longer exist.
1135    pub fn restore_buffer_state(&mut self, snapshot: &BufferSnapshot) {
1136        self.piece_tree = snapshot.piece_tree.clone();
1137        self.buffers = snapshot.buffers.clone();
1138        self.next_buffer_id = snapshot.next_buffer_id;
1139        self.mark_content_modified();
1140    }
1141
1142    /// Snapshot the current buffer state (piece tree + buffers) for BulkEdit undo/redo.
1143    ///
1144    /// The snapshot includes buffers because consolidate_after_save() can replace
1145    /// self.buffers between the snapshot and restore, which would otherwise cause
1146    /// the restored piece tree to reference nonexistent buffer IDs.
1147    pub fn snapshot_buffer_state(&self) -> Arc<BufferSnapshot> {
1148        Arc::new(BufferSnapshot {
1149            piece_tree: self.piece_tree.clone(),
1150            buffers: self.buffers.clone(),
1151            next_buffer_id: self.next_buffer_id,
1152        })
1153    }
1154
1155    /// Apply bulk edits efficiently in a single pass
1156    /// Returns the net change in bytes
1157    pub fn apply_bulk_edits(&mut self, edits: &[(usize, usize, &str)]) -> isize {
1158        // Pre-allocate buffers for all insert texts (only non-empty texts)
1159        // This avoids the borrow conflict in the closure
1160        // IMPORTANT: Only add entries for non-empty texts because the closure
1161        // is only called for edits with non-empty insert text
1162        let mut buffer_info: Vec<(BufferLocation, usize, usize, Option<usize>)> = Vec::new();
1163
1164        for (_, _, text) in edits {
1165            if !text.is_empty() {
1166                let buffer_id = self.next_buffer_id;
1167                self.next_buffer_id += 1;
1168                let content = text.as_bytes().to_vec();
1169                let lf_cnt = content.iter().filter(|&&b| b == b'\n').count();
1170                let bytes = content.len();
1171                let buffer = StringBuffer::new(buffer_id, content);
1172                self.buffers.push(buffer);
1173                buffer_info.push((BufferLocation::Added(buffer_id), 0, bytes, Some(lf_cnt)));
1174            }
1175            // No placeholder for empty texts - the closure is only called for non-empty texts
1176        }
1177
1178        // Now call apply_bulk_edits with a simple index-based closure
1179        let mut idx = 0;
1180        let delta = self
1181            .piece_tree
1182            .apply_bulk_edits(edits, &self.buffers, |_text| {
1183                let info = buffer_info[idx];
1184                idx += 1;
1185                info
1186            });
1187
1188        self.mark_content_modified();
1189        delta
1190    }
1191
1192    /// Get text from a byte offset range
1193    /// This now uses the optimized piece_tree.iter_pieces_in_range() for a single traversal
1194    /// Get text from a byte offset range (read-only)
1195    /// Returns None if any buffer in the range is unloaded
1196    /// PRIVATE: External code should use get_text_range_mut() which handles lazy loading
1197    fn get_text_range(&self, offset: usize, bytes: usize) -> Option<Vec<u8>> {
1198        if bytes == 0 {
1199            return Some(Vec::new());
1200        }
1201
1202        let mut result = Vec::with_capacity(bytes);
1203        let end_offset = offset + bytes;
1204        let mut collected = 0;
1205
1206        // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1207        for piece_view in self.piece_tree.iter_pieces_in_range(offset, end_offset) {
1208            let buffer_id = piece_view.location.buffer_id();
1209            if let Some(buffer) = self.buffers.get(buffer_id) {
1210                // Calculate the range to read from this piece
1211                let piece_start_in_doc = piece_view.doc_offset;
1212                let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1213
1214                // Clip to the requested range
1215                let read_start = offset.max(piece_start_in_doc);
1216                let read_end = end_offset.min(piece_end_in_doc);
1217
1218                if read_end > read_start {
1219                    let offset_in_piece = read_start - piece_start_in_doc;
1220                    let bytes_to_read = read_end - read_start;
1221
1222                    let buffer_start = piece_view.buffer_offset + offset_in_piece;
1223                    let buffer_end = buffer_start + bytes_to_read;
1224
1225                    // Return None if buffer is unloaded (type-safe)
1226                    let data = buffer.get_data()?;
1227
1228                    if buffer_end <= data.len() {
1229                        result.extend_from_slice(&data[buffer_start..buffer_end]);
1230                        collected += bytes_to_read;
1231
1232                        if collected >= bytes {
1233                            break;
1234                        }
1235                    }
1236                }
1237            }
1238        }
1239
1240        Some(result)
1241    }
1242
1243    /// Get text from a byte offset range with lazy loading
1244    /// This will load unloaded chunks on-demand and always returns complete data
1245    ///
1246    /// Returns an error if loading fails or if data cannot be read for any reason.
1247    ///
1248    /// NOTE: Currently loads entire buffers on-demand. Future optimization would split
1249    /// large pieces and load only LOAD_CHUNK_SIZE chunks at a time.
1250    pub fn get_text_range_mut(&mut self, offset: usize, bytes: usize) -> Result<Vec<u8>> {
1251        let _span = tracing::info_span!("get_text_range_mut", offset, bytes).entered();
1252        if bytes == 0 {
1253            return Ok(Vec::new());
1254        }
1255
1256        let mut result = Vec::with_capacity(bytes);
1257        // Clamp end_offset to buffer length to handle reads beyond EOF
1258        let end_offset = (offset + bytes).min(self.len());
1259        let mut current_offset = offset;
1260        let mut iteration_count = 0u32;
1261
1262        // Keep iterating until we've collected all requested bytes
1263        while current_offset < end_offset {
1264            iteration_count += 1;
1265            let mut made_progress = false;
1266            let mut restarted_iteration = false;
1267
1268            // Use the efficient piece iterator (single O(log n) traversal + O(N) iteration)
1269            for piece_view in self
1270                .piece_tree
1271                .iter_pieces_in_range(current_offset, end_offset)
1272            {
1273                let buffer_id = piece_view.location.buffer_id();
1274
1275                // Check if buffer needs loading
1276                let needs_loading = self
1277                    .buffers
1278                    .get(buffer_id)
1279                    .map(|b| !b.is_loaded())
1280                    .unwrap_or(false);
1281
1282                if needs_loading && self.chunk_split_and_load(&piece_view, current_offset)? {
1283                    restarted_iteration = true;
1284                    break;
1285                }
1286
1287                // Calculate the range to read from this piece
1288                let piece_start_in_doc = piece_view.doc_offset;
1289                let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
1290
1291                // Clip to the requested range
1292                let read_start = current_offset.max(piece_start_in_doc);
1293                let read_end = end_offset.min(piece_end_in_doc);
1294
1295                if read_end > read_start {
1296                    let offset_in_piece = read_start - piece_start_in_doc;
1297                    let bytes_to_read = read_end - read_start;
1298
1299                    let buffer_start = piece_view.buffer_offset + offset_in_piece;
1300                    let buffer_end = buffer_start + bytes_to_read;
1301
1302                    // Buffer should be loaded now
1303                    let buffer = self.buffers.get(buffer_id).context("Buffer not found")?;
1304                    let data = buffer
1305                        .get_data()
1306                        .context("Buffer data unavailable after load")?;
1307
1308                    anyhow::ensure!(
1309                        buffer_end <= data.len(),
1310                        "Buffer range out of bounds: requested {}..{}, buffer size {}",
1311                        buffer_start,
1312                        buffer_end,
1313                        data.len()
1314                    );
1315
1316                    result.extend_from_slice(&data[buffer_start..buffer_end]);
1317                    current_offset = read_end;
1318                    made_progress = true;
1319                }
1320            }
1321
1322            // If we didn't make progress and didn't restart iteration, this is an error
1323            if !made_progress && !restarted_iteration {
1324                tracing::error!(
1325                    "get_text_range_mut: No progress at offset {} (requested range: {}..{}, buffer len: {})",
1326                    current_offset,
1327                    offset,
1328                    end_offset,
1329                    self.len()
1330                );
1331                tracing::error!(
1332                    "Piece tree stats: {} total bytes",
1333                    self.piece_tree.stats().total_bytes
1334                );
1335                anyhow::bail!(
1336                    "Failed to read data at offset {}: no progress made (requested {}..{}, buffer len: {})",
1337                    current_offset,
1338                    offset,
1339                    end_offset,
1340                    self.len()
1341                );
1342            }
1343        }
1344
1345        if iteration_count > 1 {
1346            tracing::info!(
1347                iteration_count,
1348                result_len = result.len(),
1349                "get_text_range_mut: completed with multiple iterations"
1350            );
1351        }
1352
1353        Ok(result)
1354    }
1355
1356    /// Prepare a viewport for rendering
1357    ///
1358    /// This is called before rendering with &mut access to pre-load all data
1359    /// that will be needed for the viewport. It estimates the number of bytes
1360    /// needed based on the line count and pre-loads them.
1361    ///
1362    /// # Arguments
1363    /// * `start_offset` - The byte offset where the viewport starts
1364    /// * `line_count` - The number of lines to prepare (estimate)
1365    ///
1366    /// # Returns
1367    /// Ok(()) if preparation succeeded, Err if loading failed
1368    pub fn prepare_viewport(&mut self, start_offset: usize, line_count: usize) -> Result<()> {
1369        let _span = tracing::info_span!("prepare_viewport", start_offset, line_count).entered();
1370        // Estimate how many bytes we need (pessimistic assumption)
1371        // Average line length is typically 80-100 bytes, but we use 200 to be safe
1372        let estimated_bytes = line_count.saturating_mul(200);
1373
1374        // Cap the estimate at the remaining bytes in the document
1375        let remaining_bytes = self.total_bytes().saturating_sub(start_offset);
1376        let bytes_to_load = estimated_bytes.min(remaining_bytes);
1377        tracing::trace!(
1378            bytes_to_load,
1379            total_bytes = self.total_bytes(),
1380            "prepare_viewport loading"
1381        );
1382
1383        // Pre-load with full chunk-splitting support
1384        // This may load more than we need, but ensures all data is available
1385        self.get_text_range_mut(start_offset, bytes_to_load)?;
1386
1387        Ok(())
1388    }
1389
1390    /// Split a piece that references a large unloaded buffer, create a chunk
1391    /// buffer for the region around `current_offset`, and load it.
1392    ///
1393    /// Returns `true` if the piece tree was modified (caller must restart its
1394    /// iteration), `false` if the piece was small enough to load in-place.
1395    fn chunk_split_and_load(
1396        &mut self,
1397        piece_view: &PieceView,
1398        current_offset: usize,
1399    ) -> Result<bool> {
1400        let buffer_id = piece_view.location.buffer_id();
1401
1402        // The underlying buffer may be much larger than this piece (e.g. the
1403        // whole-file Stored buffer after rebuild_with_pristine_saved_root).
1404        // We must chunk-split if either the piece or its buffer exceeds
1405        // LOAD_CHUNK_SIZE, because `load()` loads the entire buffer.
1406        let buffer_bytes = self
1407            .buffers
1408            .get(buffer_id)
1409            .and_then(|b| b.unloaded_bytes())
1410            .unwrap_or(0);
1411        let needs_chunk_split =
1412            piece_view.bytes > LOAD_CHUNK_SIZE || buffer_bytes > piece_view.bytes;
1413
1414        tracing::info!(
1415            buffer_id,
1416            piece_bytes = piece_view.bytes,
1417            buffer_bytes,
1418            needs_chunk_split,
1419            piece_doc_offset = piece_view.doc_offset,
1420            current_offset,
1421            "chunk_split_and_load: loading unloaded piece"
1422        );
1423
1424        if !needs_chunk_split {
1425            // Piece is small enough and its buffer matches — load in-place.
1426            let _span = tracing::info_span!(
1427                "load_small_buffer",
1428                piece_bytes = piece_view.bytes,
1429                buffer_id,
1430            )
1431            .entered();
1432            self.buffers
1433                .get_mut(buffer_id)
1434                .context("Buffer not found")?
1435                .load(&**self.persistence.fs())
1436                .context("Failed to load buffer")?;
1437            return Ok(false);
1438        }
1439
1440        let _span = tracing::info_span!(
1441            "chunk_split_and_load",
1442            piece_bytes = piece_view.bytes,
1443            buffer_id,
1444        )
1445        .entered();
1446
1447        let piece_start_in_doc = piece_view.doc_offset;
1448        let offset_in_piece = current_offset.saturating_sub(piece_start_in_doc);
1449
1450        // When the piece already fits within LOAD_CHUNK_SIZE, create a chunk
1451        // buffer for the exact piece range (no alignment/splitting needed).
1452        // Alignment rounding is only useful when carving a sub-range out of a
1453        // piece larger than LOAD_CHUNK_SIZE.
1454        let (chunk_start_in_buffer, chunk_bytes) = if piece_view.bytes <= LOAD_CHUNK_SIZE {
1455            (piece_view.buffer_offset, piece_view.bytes)
1456        } else {
1457            let start =
1458                (piece_view.buffer_offset + offset_in_piece) / CHUNK_ALIGNMENT * CHUNK_ALIGNMENT;
1459            let bytes = LOAD_CHUNK_SIZE
1460                .min((piece_view.buffer_offset + piece_view.bytes).saturating_sub(start));
1461            (start, bytes)
1462        };
1463
1464        // Calculate document offsets for splitting
1465        let chunk_start_offset_in_piece =
1466            chunk_start_in_buffer.saturating_sub(piece_view.buffer_offset);
1467        let split_start_in_doc = piece_start_in_doc + chunk_start_offset_in_piece;
1468        let split_end_in_doc = split_start_in_doc + chunk_bytes;
1469
1470        // Split the piece to isolate the chunk
1471        if chunk_start_offset_in_piece > 0 {
1472            self.piece_tree
1473                .split_at_offset(split_start_in_doc, &self.buffers);
1474        }
1475        if split_end_in_doc < piece_start_in_doc + piece_view.bytes {
1476            self.piece_tree
1477                .split_at_offset(split_end_in_doc, &self.buffers);
1478        }
1479
1480        // Create a new buffer for this chunk
1481        let chunk_buffer = self
1482            .buffers
1483            .get(buffer_id)
1484            .context("Buffer not found")?
1485            .create_chunk_buffer(self.next_buffer_id, chunk_start_in_buffer, chunk_bytes)
1486            .context("Failed to create chunk buffer")?;
1487
1488        self.next_buffer_id += 1;
1489        let new_buffer_id = chunk_buffer.id;
1490        self.buffers.push(chunk_buffer);
1491
1492        // Update the piece to reference the new chunk buffer
1493        self.piece_tree.replace_buffer_reference(
1494            buffer_id,
1495            piece_view.buffer_offset + chunk_start_offset_in_piece,
1496            chunk_bytes,
1497            BufferLocation::Added(new_buffer_id),
1498        );
1499
1500        // Load the chunk buffer
1501        self.buffers
1502            .get_mut(new_buffer_id)
1503            .context("Chunk buffer not found")?
1504            .load(&**self.persistence.fs())
1505            .context("Failed to load chunk")?;
1506
1507        // split_at_offset uses compute_line_feeds_static which returns None
1508        // for unloaded buffers, destroying the scanned line feed counts.
1509        // Fix up: the loaded chunk is counted from memory, remaining unloaded
1510        // pieces use the filesystem's count_line_feeds_in_range.
1511        if self.file_kind.has_line_feed_scan() {
1512            let leaves = self.piece_tree.get_leaves();
1513            let mut fixups: Vec<(usize, usize)> = Vec::new();
1514            for (idx, leaf) in leaves.iter().enumerate() {
1515                if leaf.line_feed_cnt.is_none() {
1516                    if let Ok(count) = self.scan_leaf(leaf) {
1517                        fixups.push((idx, count));
1518                    }
1519                }
1520            }
1521            if !fixups.is_empty() {
1522                self.piece_tree.update_leaf_line_feeds_path_copy(&fixups);
1523            }
1524        }
1525
1526        // Keep saved_root in sync with viewport-loading tree restructures so
1527        // that diff_since_saved() can match by (location, offset) identity.
1528        //
1529        // When !modified the current tree IS the saved state, so just snapshot.
1530        // When modified, we must apply the same Stored→Added leaf replacement
1531        // to saved_root so the diff doesn't see loaded-but-unedited regions as
1532        // changed.
1533        if !self.persistence.is_modified() {
1534            self.persistence.set_saved_root(self.piece_tree.root());
1535        } else {
1536            self.persistence.apply_chunk_load_to_saved_root(
1537                buffer_id,
1538                chunk_start_in_buffer,
1539                chunk_bytes,
1540                new_buffer_id,
1541            );
1542        }
1543
1544        Ok(true)
1545    }
1546
1547    /// Get all text as a single Vec<u8>
1548    /// Returns None if any buffers are unloaded (lazy loading)
1549    /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1550    pub(crate) fn get_all_text(&self) -> Option<Vec<u8>> {
1551        self.get_text_range(0, self.total_bytes())
1552    }
1553
1554    /// Get all text as a String
1555    /// Returns None if any buffers are unloaded (lazy loading)
1556    /// CRATE-PRIVATE: External code should use get_text_range_mut() or DocumentModel methods
1557    pub(crate) fn get_all_text_string(&self) -> Option<String> {
1558        self.get_all_text()
1559            .map(|bytes| String::from_utf8_lossy(&bytes).into_owned())
1560    }
1561
1562    /// Get text from a byte range as bytes
1563    /// CRATE-PRIVATE: Returns empty vector if any buffers are unloaded (silently fails!)
1564    /// Only use this when you KNOW the data is loaded (e.g., for syntax highlighting small regions)
1565    /// External code should use get_text_range_mut() or DocumentModel methods
1566    pub(crate) fn slice_bytes(&self, range: Range<usize>) -> Vec<u8> {
1567        self.get_text_range(range.start, range.end.saturating_sub(range.start))
1568            .unwrap_or_default()
1569    }
1570
1571    /// Get all text as a String
1572    /// Returns None if any buffers are unloaded (lazy loading)
1573    pub fn to_string(&self) -> Option<String> {
1574        self.get_all_text_string()
1575    }
1576
1577    /// Get the total number of bytes
1578    pub fn len(&self) -> usize {
1579        self.total_bytes()
1580    }
1581
1582    /// Check if the buffer is empty
1583    pub fn is_empty(&self) -> bool {
1584        self.total_bytes() == 0
1585    }
1586
1587    /// Get the file path associated with this buffer
1588    pub fn file_path(&self) -> Option<&Path> {
1589        self.persistence.file_path()
1590    }
1591
1592    /// Update the file path after a rename operation on disk.
1593    pub fn rename_file_path(&mut self, path: PathBuf) {
1594        self.persistence.set_file_path(path);
1595    }
1596
1597    /// Clear the file path (make buffer unnamed)
1598    /// Note: This does NOT affect Unloaded chunk file_paths used for lazy loading.
1599    /// Those still point to the original source file for chunk loading.
1600    pub fn clear_file_path(&mut self) {
1601        self.persistence.clear_file_path();
1602    }
1603
1604    /// Extend buffer to include more bytes from a streaming source file.
1605    /// Used for stdin streaming where the temp file grows over time, and
1606    /// for plugin streaming via `RefreshBufferFromDisk`.
1607    ///
1608    /// Counts line feeds in the appended region so the new piece carries
1609    /// a real `line_feed_cnt` instead of `None`. Without this, any
1610    /// previously-known line count on the existing pieces propagates to
1611    /// `line_count() = None` (the piece-tree's `total_line_feeds`
1612    /// returns `None` if any piece is unknown), which in turn breaks the
1613    /// visual-row index used by the scrollbar.
1614    ///
1615    /// Falls back to `None` only when the filesystem can't count
1616    /// (errored stat / read). The buffer is still usable then — just
1617    /// without precise line indexing, same as a large file opened
1618    /// without a scan.
1619    pub fn extend_streaming(&mut self, source_path: &Path, new_size: usize) {
1620        let old_size = self.total_bytes();
1621        if new_size <= old_size {
1622            return;
1623        }
1624
1625        let additional_bytes = new_size - old_size;
1626
1627        // Create new Unloaded buffer for the appended region
1628        let buffer_id = self.next_buffer_id;
1629        self.next_buffer_id += 1;
1630
1631        let new_buffer = StringBuffer::new_unloaded(
1632            buffer_id,
1633            source_path.to_path_buf(),
1634            old_size,         // file_offset - where this chunk starts in the file
1635            additional_bytes, // bytes - size of this chunk
1636        );
1637        self.buffers.push(new_buffer);
1638
1639        // Count line feeds in the appended region from disk so the
1640        // piece carries a known line count. Counting is cheap — it's a
1641        // streaming scan of `additional_bytes`, no buffer materialisation.
1642        let line_feed_cnt = self
1643            .persistence
1644            .fs()
1645            .count_line_feeds_in_range(source_path, old_size as u64, additional_bytes)
1646            .ok();
1647
1648        // Append piece at end of document (insert at offset == total_bytes)
1649        self.piece_tree.insert(
1650            old_size,
1651            BufferLocation::Stored(buffer_id),
1652            0,
1653            additional_bytes,
1654            line_feed_cnt,
1655            &self.buffers,
1656        );
1657    }
1658
1659    /// Check if the buffer has been modified since last save
1660    pub fn is_modified(&self) -> bool {
1661        self.persistence.is_modified()
1662    }
1663
1664    /// Clear the modified flag (after save)
1665    pub fn clear_modified(&mut self) {
1666        self.persistence.clear_modified();
1667    }
1668
1669    /// Set the modified flag explicitly
1670    /// Used by undo/redo to restore the correct modified state
1671    pub fn set_modified(&mut self, modified: bool) {
1672        self.persistence.set_modified(modified);
1673    }
1674
1675    /// Check if buffer has pending changes for recovery auto-save
1676    pub fn is_recovery_pending(&self) -> bool {
1677        self.persistence.is_recovery_pending()
1678    }
1679
1680    /// Mark buffer as needing recovery auto-save (call after edits)
1681    pub fn set_recovery_pending(&mut self, pending: bool) {
1682        self.persistence.set_recovery_pending(pending);
1683    }
1684
1685    /// Ensure the buffer chunk at the given byte offset is loaded.
1686    ///
1687    /// When `line_feeds_scanned` is true, piece splits during insert/delete need
1688    /// the buffer data to be loaded so `compute_line_feeds_static` can accurately
1689    /// recount line feeds for each half. This method loads the chunk if needed.
1690    fn ensure_chunk_loaded_at(&mut self, offset: usize) {
1691        if let Some(piece_info) = self.piece_tree.find_by_offset(offset) {
1692            let buffer_id = piece_info.location.buffer_id();
1693            if let Some(buffer) = self.buffers.get_mut(buffer_id) {
1694                if !buffer.is_loaded() {
1695                    let buf_bytes = buffer.unloaded_bytes().unwrap_or(0);
1696                    tracing::info!(
1697                        "ensure_chunk_loaded_at: loading buffer {} ({} bytes) for offset {}",
1698                        buffer_id,
1699                        buf_bytes,
1700                        offset
1701                    );
1702                    if let Err(e) = buffer.load(&**self.persistence.fs()) {
1703                        tracing::warn!("Failed to load chunk at offset {offset}: {e}");
1704                    }
1705                }
1706            }
1707        }
1708    }
1709
1710    /// Check if this is a large file with lazy loading enabled
1711    pub fn is_large_file(&self) -> bool {
1712        self.file_kind.is_large_file()
1713    }
1714
1715    /// Check if line feeds have been scanned for this large file.
1716    /// When true, `line_count()` returns exact values.
1717    pub fn has_line_feed_scan(&self) -> bool {
1718        self.file_kind.has_line_feed_scan()
1719    }
1720
1721    /// Get the raw piece tree leaves (for storing alongside scan chunks).
1722    pub fn piece_tree_leaves(&self) -> Vec<crate::model::piece_tree::LeafData> {
1723        self.piece_tree.get_leaves()
1724    }
1725
1726    /// Prepare work items for an incremental line scan.
1727    ///
1728    /// First splits any oversized leaves in the piece tree so every leaf is
1729    /// at most `LOAD_CHUNK_SIZE` bytes.  Then returns one work item per leaf.
1730    /// After scanning, `get_text_range_mut` will never need to split a scanned
1731    /// leaf (it's already chunk-sized), so line-feed counts are preserved.
1732    ///
1733    /// Returns `(chunks, total_bytes)`.
1734    pub fn prepare_line_scan(&mut self) -> (Vec<LineScanChunk>, usize) {
1735        // Pre-split the tree so every leaf ≤ LOAD_CHUNK_SIZE.
1736        self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1737
1738        let leaves = self.piece_tree.get_leaves();
1739        let total_bytes: usize = leaves.iter().map(|l| l.bytes).sum();
1740        let mut chunks = Vec::new();
1741
1742        for (idx, leaf) in leaves.iter().enumerate() {
1743            chunks.push(LineScanChunk {
1744                leaf_index: idx,
1745                byte_len: leaf.bytes,
1746                already_known: leaf.line_feed_cnt.is_some(),
1747            });
1748        }
1749
1750        (chunks, total_bytes)
1751    }
1752
1753    /// Initialize a chunked search scan over this buffer's piece tree.
1754    ///
1755    /// Used for in-editor Ctrl+F (incremental, yields to the event loop
1756    /// between chunks) and for searching dirty buffers during project grep.
1757    /// For searching files on disk, use `FileSystem::search_file` instead.
1758    pub fn search_scan_init(
1759        &mut self,
1760        regex: regex::bytes::Regex,
1761        max_matches: usize,
1762        query_len: usize,
1763    ) -> ChunkedSearchState {
1764        let (chunks, total_bytes) = self.prepare_line_scan();
1765        ChunkedSearchState {
1766            chunks,
1767            next_chunk: 0,
1768            next_doc_offset: 0,
1769            total_bytes,
1770            scanned_bytes: 0,
1771            regex,
1772            matches: Vec::new(),
1773            overlap_tail: Vec::new(),
1774            overlap_doc_offset: 0,
1775            max_matches,
1776            capped: false,
1777            query_len,
1778            running_line: 1,
1779        }
1780    }
1781
1782    /// Process one chunk of a chunked search scan.
1783    ///
1784    /// Loads the next chunk via `get_text_range_mut`, prepends overlap from
1785    /// the previous chunk, runs the regex, and appends matches to `state`
1786    /// with line/column/context computed on the fly from the loaded bytes.
1787    ///
1788    /// Line numbers are tracked incrementally via `running_line` — each
1789    /// chunk counts newlines in its non-overlap portion to advance the
1790    /// counter for the next chunk, and matches use an incremental cursor
1791    /// so total line-counting work is O(chunk_size), not O(chunk × matches).
1792    ///
1793    /// Returns `Ok(true)` if there are more chunks to process, `Ok(false)`
1794    /// when the scan is complete.
1795    ///
1796    /// TODO: For concurrent/parallel search (searching multiple files at once),
1797    /// chunks would need to return chunk-relative line numbers and have them
1798    /// fixed up with each file's starting line offset after all chunks complete.
1799    pub fn search_scan_next_chunk(
1800        &mut self,
1801        state: &mut ChunkedSearchState,
1802    ) -> std::io::Result<bool> {
1803        if state.is_done() {
1804            return Ok(false);
1805        }
1806
1807        let chunk_info = state.chunks[state.next_chunk].clone();
1808        let doc_offset = state.next_doc_offset;
1809
1810        state.next_chunk += 1;
1811        state.scanned_bytes += chunk_info.byte_len;
1812        state.next_doc_offset += chunk_info.byte_len;
1813
1814        // Load the chunk bytes
1815        let chunk_bytes = self
1816            .get_text_range_mut(doc_offset, chunk_info.byte_len)
1817            .map_err(std::io::Error::other)?;
1818
1819        // Build search buffer: overlap tail + new chunk
1820        let overlap_len = state.overlap_tail.len();
1821        let mut search_buf = Vec::with_capacity(overlap_len + chunk_bytes.len());
1822        search_buf.extend_from_slice(&state.overlap_tail);
1823        search_buf.extend_from_slice(&chunk_bytes);
1824
1825        let buf_doc_offset = if overlap_len > 0 {
1826            state.overlap_doc_offset
1827        } else {
1828            doc_offset
1829        };
1830
1831        // Line number at buf_doc_offset: running_line tracks the line at
1832        // doc_offset (start of new chunk data). Count newlines in the overlap
1833        // prefix to get the line at the start of the full search_buf.
1834        let newlines_in_overlap = search_buf[..overlap_len]
1835            .iter()
1836            .filter(|&&b| b == b'\n')
1837            .count();
1838        let mut line_at = state.running_line - newlines_in_overlap;
1839        let mut counted_to = 0usize;
1840
1841        // Run regex on the combined buffer
1842        for m in state.regex.find_iter(&search_buf) {
1843            // Skip matches entirely within the overlap (already found)
1844            if overlap_len > 0 && m.end() <= overlap_len {
1845                continue;
1846            }
1847
1848            if state.matches.len() >= state.max_matches {
1849                state.capped = true;
1850                break;
1851            }
1852
1853            // Advance line counter incrementally to this match
1854            line_at += search_buf[counted_to..m.start()]
1855                .iter()
1856                .filter(|&&b| b == b'\n')
1857                .count();
1858            counted_to = m.start();
1859
1860            // Find line boundaries in search_buf for context
1861            let line_start = search_buf[..m.start()]
1862                .iter()
1863                .rposition(|&b| b == b'\n')
1864                .map(|p| p + 1)
1865                .unwrap_or(0);
1866            let line_end = search_buf[m.start()..]
1867                .iter()
1868                .position(|&b| b == b'\n')
1869                .map(|p| m.start() + p)
1870                .unwrap_or(search_buf.len());
1871
1872            let match_doc_offset = buf_doc_offset + m.start();
1873            let match_len = m.end() - m.start();
1874            let column = m.start() - line_start + 1;
1875            let context = String::from_utf8_lossy(&search_buf[line_start..line_end]).into_owned();
1876
1877            state.matches.push(SearchMatch {
1878                byte_offset: match_doc_offset,
1879                length: match_len,
1880                line: line_at,
1881                column,
1882                context,
1883            });
1884        }
1885
1886        // Advance running_line by newlines in the new (non-overlap) chunk data
1887        let newlines_in_chunk = chunk_bytes.iter().filter(|&&b| b == b'\n').count();
1888        state.running_line += newlines_in_chunk;
1889
1890        // Save overlap tail for next chunk
1891        let max_overlap = state.query_len.max(256).min(chunk_bytes.len());
1892        let tail_start = chunk_bytes.len().saturating_sub(max_overlap);
1893        state.overlap_tail = chunk_bytes[tail_start..].to_vec();
1894        state.overlap_doc_offset = doc_offset + tail_start;
1895
1896        Ok(!state.is_done())
1897    }
1898
1899    /// Run a complete chunked search over the piece tree (all chunks).
1900    ///
1901    /// Synchronous variant — used for dirty buffer snapshots in project
1902    /// grep and in tests.  For on-disk files, use `FileSystem::search_file`.
1903    pub fn search_scan_all(
1904        &mut self,
1905        regex: regex::bytes::Regex,
1906        max_matches: usize,
1907        query_len: usize,
1908    ) -> std::io::Result<ChunkedSearchState> {
1909        let mut state = self.search_scan_init(regex, max_matches, query_len);
1910        while self.search_scan_next_chunk(&mut state)? {}
1911        Ok(state)
1912    }
1913
1914    /// Build a hybrid search plan from the piece tree.
1915    ///
1916    /// Extracts regions (unloaded file ranges + loaded in-memory data) that
1917    /// can be searched independently.  The plan is `Send` so it can be
1918    /// executed on a background thread via `HybridSearchPlan::execute`.
1919    ///
1920    /// Returns `None` if the buffer has no file path (caller should fall
1921    /// back to `search_scan_all`).
1922    pub fn search_hybrid_plan(&mut self) -> Option<HybridSearchPlan> {
1923        let file_path = self.persistence.file_path_owned()?;
1924
1925        self.piece_tree.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
1926        let leaves = self.piece_tree.get_leaves();
1927
1928        let mut regions: Vec<SearchRegion> = Vec::new();
1929        let mut doc_offset = 0usize;
1930
1931        for leaf in &leaves {
1932            let buf = self.buffers.get(leaf.location.buffer_id());
1933            let is_unloaded_stored = matches!(
1934                (&leaf.location, buf),
1935                (
1936                    BufferLocation::Stored(_),
1937                    Some(StringBuffer {
1938                        data: BufferData::Unloaded { .. },
1939                        ..
1940                    }),
1941                )
1942            );
1943
1944            if is_unloaded_stored {
1945                let file_offset = match buf.unwrap().data {
1946                    BufferData::Unloaded {
1947                        file_offset: fo, ..
1948                    } => fo + leaf.offset,
1949                    _ => unreachable!(),
1950                };
1951
1952                // Merge with previous unloaded region if contiguous
1953                if let Some(SearchRegion::Unloaded {
1954                    file_offset: prev_fo,
1955                    bytes: prev_bytes,
1956                    ..
1957                }) = regions.last_mut()
1958                {
1959                    if *prev_fo + *prev_bytes == file_offset {
1960                        *prev_bytes += leaf.bytes;
1961                        doc_offset += leaf.bytes;
1962                        continue;
1963                    }
1964                }
1965                regions.push(SearchRegion::Unloaded {
1966                    file_offset,
1967                    bytes: leaf.bytes,
1968                    doc_offset,
1969                });
1970            } else {
1971                let data = match buf.and_then(|b| b.get_data()) {
1972                    Some(full) => {
1973                        let end = (leaf.offset + leaf.bytes).min(full.len());
1974                        full[leaf.offset..end].to_vec()
1975                    }
1976                    None => match self.get_text_range_mut(doc_offset, leaf.bytes) {
1977                        Ok(d) => d,
1978                        Err(_) => {
1979                            doc_offset += leaf.bytes;
1980                            continue;
1981                        }
1982                    },
1983                };
1984
1985                // Merge with previous loaded region
1986                if let Some(SearchRegion::Loaded {
1987                    data: prev_data, ..
1988                }) = regions.last_mut()
1989                {
1990                    prev_data.extend_from_slice(&data);
1991                    doc_offset += leaf.bytes;
1992                    continue;
1993                }
1994                regions.push(SearchRegion::Loaded { data, doc_offset });
1995            }
1996
1997            doc_offset += leaf.bytes;
1998        }
1999
2000        Some(HybridSearchPlan { file_path, regions })
2001    }
2002
2003    /// Hybrid search: uses `fs.search_file` for unloaded piece-tree regions
2004    /// (searches where the data lives, no network transfer) and in-memory regex
2005    /// for loaded/edited regions.  Handles overlap at region boundaries.
2006    ///
2007    /// For a huge remote file with a small local edit, this avoids transferring
2008    /// the entire file — only match metadata crosses the network.
2009    ///
2010    /// Falls back to `search_scan_all` when the buffer has no file path or is
2011    /// fully loaded.
2012    pub fn search_hybrid(
2013        &mut self,
2014        pattern: &str,
2015        opts: &FileSearchOptions,
2016        regex: Regex,
2017        max_matches: usize,
2018        query_len: usize,
2019    ) -> io::Result<Vec<SearchMatch>> {
2020        let plan = match self.search_hybrid_plan() {
2021            Some(p) => p,
2022            None => {
2023                let state = self.search_scan_all(regex, max_matches, query_len)?;
2024                return Ok(state.matches);
2025            }
2026        };
2027        plan.execute(
2028            &**self.persistence.fs(),
2029            pattern,
2030            opts,
2031            &regex,
2032            max_matches,
2033            query_len,
2034        )
2035    }
2036
2037    /// Count `\n` bytes in a single leaf.
2038    ///
2039    /// Uses `count_line_feeds_in_range` for unloaded buffers, which remote
2040    /// filesystem implementations can override to count server-side.
2041    pub fn scan_leaf(&self, leaf: &crate::model::piece_tree::LeafData) -> std::io::Result<usize> {
2042        let buffer_id = leaf.location.buffer_id();
2043        let buffer = self
2044            .buffers
2045            .get(buffer_id)
2046            .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::NotFound, "buffer not found"))?;
2047
2048        let count = match &buffer.data {
2049            crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2050                let end = (leaf.offset + leaf.bytes).min(data.len());
2051                data[leaf.offset..end]
2052                    .iter()
2053                    .filter(|&&b| b == b'\n')
2054                    .count()
2055            }
2056            crate::model::piece_tree::BufferData::Unloaded {
2057                file_path,
2058                file_offset,
2059                ..
2060            } => {
2061                let read_offset = *file_offset as u64 + leaf.offset as u64;
2062                self.persistence.fs().count_line_feeds_in_range(
2063                    file_path,
2064                    read_offset,
2065                    leaf.bytes,
2066                )?
2067            }
2068        };
2069        Ok(count)
2070    }
2071
2072    /// Return the I/O parameters for an unloaded leaf, or `None` if loaded.
2073    ///
2074    /// Used by the incremental scan to distinguish leaves that can be counted
2075    /// in-memory (via `scan_leaf`) from those that need filesystem I/O.
2076    pub fn leaf_io_params(
2077        &self,
2078        leaf: &crate::model::piece_tree::LeafData,
2079    ) -> Option<(std::path::PathBuf, u64, usize)> {
2080        let buffer_id = leaf.location.buffer_id();
2081        let buffer = self.buffers.get(buffer_id)?;
2082        match &buffer.data {
2083            crate::model::piece_tree::BufferData::Loaded { .. } => None,
2084            crate::model::piece_tree::BufferData::Unloaded {
2085                file_path,
2086                file_offset,
2087                ..
2088            } => {
2089                let read_offset = *file_offset as u64 + leaf.offset as u64;
2090                Some((file_path.clone(), read_offset, leaf.bytes))
2091            }
2092        }
2093    }
2094
2095    /// Get a reference to the string buffers (for parallel scanning).
2096    pub fn buffer_slice(&self) -> &[StringBuffer] {
2097        &self.buffers
2098    }
2099
2100    /// Apply the results of an incremental line scan.
2101    pub fn apply_scan_updates(&mut self, updates: &[(usize, usize)]) {
2102        self.piece_tree.update_leaf_line_feeds(updates);
2103        self.file_kind.mark_line_feed_scan_complete();
2104    }
2105
2106    /// After an incremental line-feed scan completes, rebuild the tree so that
2107    /// `saved_root` and the current tree share `Arc` pointers for unedited
2108    /// subtrees. This makes `diff_since_saved()` O(edited regions) instead of
2109    /// O(file size).
2110    pub fn rebuild_with_pristine_saved_root(&mut self, scan_updates: &[(usize, usize)]) {
2111        let file_size = match self.persistence.saved_file_size() {
2112            Some(s) => s,
2113            None => {
2114                // Fallback: no saved file size means we can't build a pristine
2115                // tree. Just apply updates the old way.
2116                self.apply_scan_updates(scan_updates);
2117                return;
2118            }
2119        };
2120
2121        // --- Walk the current tree to extract deletions and insertions ---
2122        let total = self.total_bytes();
2123        // Deletions: gaps in Stored coverage (orig_offset, len).
2124        let mut deletions: Vec<(usize, usize)> = Vec::new();
2125        // Insertions: (post_delete_offset, location, buf_offset, bytes, lf_cnt).
2126        // post_delete_offset = cumulative surviving Stored bytes before this point.
2127        let mut insertions: Vec<(usize, BufferLocation, usize, usize, Option<usize>)> = Vec::new();
2128        let mut orig_cursor: usize = 0;
2129        let mut stored_bytes_in_doc: usize = 0;
2130
2131        for piece in self.piece_tree.iter_pieces_in_range(0, total) {
2132            match piece.location {
2133                BufferLocation::Stored(_) => {
2134                    if piece.buffer_offset > orig_cursor {
2135                        deletions.push((orig_cursor, piece.buffer_offset - orig_cursor));
2136                    }
2137                    orig_cursor = piece.buffer_offset + piece.bytes;
2138                    stored_bytes_in_doc += piece.bytes;
2139                }
2140                BufferLocation::Added(id) => {
2141                    // Check if this Added buffer was created by loading a chunk
2142                    // from the stored file (via get_text_range_mut chunk loading).
2143                    // If so, treat it as stored content, not a user edit.
2144                    if let Some(file_off) = self.buffers.get(id).and_then(|b| b.stored_file_offset)
2145                    {
2146                        if file_off > orig_cursor {
2147                            deletions.push((orig_cursor, file_off - orig_cursor));
2148                        }
2149                        orig_cursor = file_off + piece.bytes;
2150                        stored_bytes_in_doc += piece.bytes;
2151                    } else {
2152                        insertions.push((
2153                            stored_bytes_in_doc,
2154                            piece.location,
2155                            piece.buffer_offset,
2156                            piece.bytes,
2157                            piece.line_feed_cnt,
2158                        ));
2159                    }
2160                }
2161            }
2162        }
2163        // Trailing deletion.
2164        if orig_cursor < file_size {
2165            deletions.push((orig_cursor, file_size - orig_cursor));
2166        }
2167
2168        // --- Build pristine tree (full original file, pre-split, with lf counts) ---
2169        let mut pristine = if file_size > 0 {
2170            PieceTree::new(BufferLocation::Stored(0), 0, file_size, None)
2171        } else {
2172            PieceTree::empty()
2173        };
2174        pristine.split_leaves_to_chunk_size(LOAD_CHUNK_SIZE);
2175        pristine.update_leaf_line_feeds(scan_updates);
2176
2177        // Snapshot the pristine tree as saved_root.
2178        self.persistence.set_saved_root(pristine.root());
2179
2180        // If no edits, the pristine tree IS the current tree.
2181        if deletions.is_empty() && insertions.is_empty() {
2182            self.piece_tree = pristine;
2183            self.file_kind.mark_line_feed_scan_complete();
2184            return;
2185        }
2186
2187        // --- Replay edits onto a clone of the pristine tree ---
2188        let mut tree = pristine;
2189
2190        // Apply deletions from HIGH to LOW offset so earlier offsets stay valid.
2191        deletions.sort_by(|a, b| b.0.cmp(&a.0));
2192        for &(offset, len) in &deletions {
2193            tree.delete(offset, len, &self.buffers);
2194        }
2195
2196        // Apply insertions from LOW to HIGH. Each insertion shifts subsequent
2197        // offsets by its byte count, tracked via insert_delta.
2198        let mut insert_delta: usize = 0;
2199        for &(offset, location, buf_offset, bytes, lf_cnt) in &insertions {
2200            tree.insert(
2201                offset + insert_delta,
2202                location,
2203                buf_offset,
2204                bytes,
2205                lf_cnt,
2206                &self.buffers,
2207            );
2208            insert_delta += bytes;
2209        }
2210
2211        // Path-copy insert/delete may split Stored leaves whose data is
2212        // Unloaded, producing fragments with line_feed_cnt = None
2213        // (compute_line_feeds_static can't read unloaded data). Fix them up
2214        // by scanning any remaining None leaves.
2215        let leaves = tree.get_leaves();
2216        let mut fixups: Vec<(usize, usize)> = Vec::new();
2217        for (idx, leaf) in leaves.iter().enumerate() {
2218            if leaf.line_feed_cnt.is_none() {
2219                if let Ok(count) = self.scan_leaf(leaf) {
2220                    fixups.push((idx, count));
2221                }
2222            }
2223        }
2224        if !fixups.is_empty() {
2225            tree.update_leaf_line_feeds_path_copy(&fixups);
2226        }
2227
2228        self.piece_tree = tree;
2229        self.file_kind.mark_line_feed_scan_complete();
2230    }
2231
2232    /// Resolve the exact byte offset for a given line number (0-indexed).
2233    ///
2234    /// Uses the tree's line feed counts to find the piece containing the target line,
2235    /// then loads/reads that piece's data to find the exact newline position.
2236    /// This works even when buffers are unloaded (large file with scanned line index).
2237    pub fn resolve_line_byte_offset(&mut self, target_line: usize) -> Option<usize> {
2238        if target_line == 0 {
2239            return Some(0);
2240        }
2241
2242        // Use tree metadata to find the piece containing the target line
2243        let (doc_offset, buffer_id, piece_offset, piece_bytes, lines_before) =
2244            self.piece_tree.piece_info_for_line(target_line)?;
2245
2246        // We need to find the (target_line - lines_before)-th newline within this piece
2247        let lines_to_skip = target_line - lines_before;
2248
2249        // Get the piece data — either from loaded buffer or read from disk
2250        let buffer = self.buffers.get(buffer_id)?;
2251        let piece_data: Vec<u8> = match &buffer.data {
2252            crate::model::piece_tree::BufferData::Loaded { data, .. } => {
2253                let end = (piece_offset + piece_bytes).min(data.len());
2254                data[piece_offset..end].to_vec()
2255            }
2256            crate::model::piece_tree::BufferData::Unloaded {
2257                file_path,
2258                file_offset,
2259                ..
2260            } => {
2261                let read_offset = *file_offset as u64 + piece_offset as u64;
2262                self.persistence
2263                    .fs()
2264                    .read_range(file_path, read_offset, piece_bytes)
2265                    .ok()?
2266            }
2267        };
2268
2269        // Count newlines to find the target line start
2270        let mut newlines_found = 0;
2271        for (i, &byte) in piece_data.iter().enumerate() {
2272            if byte == b'\n' {
2273                newlines_found += 1;
2274                if newlines_found == lines_to_skip {
2275                    // The target line starts right after this newline
2276                    return Some(doc_offset + i + 1);
2277                }
2278            }
2279        }
2280
2281        // If we didn't find enough newlines, the line starts in the next piece
2282        // Return the end of this piece as an approximation
2283        Some(doc_offset + piece_bytes)
2284    }
2285
2286    /// Get the saved file size (size of the file on disk after last load/save)
2287    /// For large files, this is used during recovery to know the expected original file size.
2288    /// Returns None for new unsaved buffers.
2289    pub fn original_file_size(&self) -> Option<usize> {
2290        // Return the tracked saved file size - this is updated when the file is
2291        // loaded or saved, so it always reflects the current file on disk.
2292        self.persistence.saved_file_size()
2293    }
2294
2295    /// Get recovery chunks for this buffer (only modified portions)
2296    ///
2297    /// For large files, this returns only the pieces that come from Added buffers
2298    /// (i.e., the modifications), not the original file content. This allows
2299    /// efficient incremental recovery without reading/writing the entire file.
2300    ///
2301    /// Returns: Vec of (original_file_offset, data) for each modified chunk
2302    /// The offset is the position in the ORIGINAL file where this chunk should be inserted.
2303    pub fn get_recovery_chunks(&self) -> Vec<(usize, Vec<u8>)> {
2304        use crate::model::piece_tree::BufferLocation;
2305
2306        let mut chunks = Vec::new();
2307        let total = self.total_bytes();
2308
2309        // Track cumulative bytes from Stored pieces as we iterate.
2310        // This gives us the original file offset for Added pieces.
2311        // The key insight: Added pieces should be inserted at the position
2312        // corresponding to where they appear relative to Stored content,
2313        // not their position in the current document.
2314        let mut stored_bytes_before = 0;
2315
2316        for piece in self.piece_tree.iter_pieces_in_range(0, total) {
2317            match piece.location {
2318                BufferLocation::Stored(_) => {
2319                    // Accumulate stored bytes to track position in original file
2320                    stored_bytes_before += piece.bytes;
2321                }
2322                BufferLocation::Added(buffer_id) => {
2323                    if let Some(buffer) = self.buffers.iter().find(|b| b.id == buffer_id) {
2324                        // Skip buffers that originate from the original file
2325                        // (loaded by chunk_split_and_load for viewport display).
2326                        // These have stored_file_offset set and are not user edits.
2327                        //
2328                        // Why Added and not Stored? The piece tree only has two
2329                        // variants: Stored and Added. chunk_split_and_load marks
2330                        // loaded chunks as Added(new_id) because
2331                        // rebuild_with_pristine_saved_root interprets Stored
2332                        // pieces' buffer_offset as a position in the original
2333                        // file — but a chunk buffer starts at offset 0, so using
2334                        // Stored would corrupt the rebuild logic. We rely on
2335                        // stored_file_offset instead to distinguish "loaded from
2336                        // disk" from "user edit". A third BufferLocation variant
2337                        // (e.g. LoadedChunk) would make this distinction explicit
2338                        // in the type system rather than requiring this runtime
2339                        // check.
2340                        if buffer.stored_file_offset.is_some() {
2341                            stored_bytes_before += piece.bytes;
2342                            continue;
2343                        }
2344                        // Get the data from the buffer if loaded
2345                        if let Some(data) = buffer.get_data() {
2346                            // Extract just the portion this piece references
2347                            let start = piece.buffer_offset;
2348                            let end = start + piece.bytes;
2349                            if end <= data.len() {
2350                                // Use stored_bytes_before as the original file offset.
2351                                // This is where this insertion should go relative to
2352                                // the original file content.
2353                                chunks.push((stored_bytes_before, data[start..end].to_vec()));
2354                            }
2355                        }
2356                    }
2357                }
2358            }
2359        }
2360
2361        chunks
2362    }
2363
2364    /// Check if this buffer contains binary content
2365    pub fn is_binary(&self) -> bool {
2366        self.file_kind.is_binary()
2367    }
2368
2369    /// Get the line ending format for this buffer
2370    pub fn line_ending(&self) -> LineEnding {
2371        self.format.line_ending()
2372    }
2373
2374    /// Set the line ending format for this buffer
2375    ///
2376    /// This marks the buffer as modified since the line ending format has changed.
2377    /// On save, the buffer content will be converted to the new format.
2378    pub fn set_line_ending(&mut self, line_ending: LineEnding) {
2379        self.format.set_line_ending(line_ending);
2380        self.mark_content_modified();
2381    }
2382
2383    /// Set the default line ending format for a new/empty buffer
2384    ///
2385    /// Unlike `set_line_ending`, this does NOT mark the buffer as modified.
2386    /// This should be used when initializing a new buffer with a configured default.
2387    pub fn set_default_line_ending(&mut self, line_ending: LineEnding) {
2388        self.format.set_default_line_ending(line_ending);
2389    }
2390
2391    /// Get the encoding format for this buffer
2392    pub fn encoding(&self) -> Encoding {
2393        self.format.encoding()
2394    }
2395
2396    /// Set the encoding format for this buffer
2397    ///
2398    /// This marks the buffer as modified since the encoding format has changed.
2399    /// On save, the buffer content will be converted to the new encoding.
2400    pub fn set_encoding(&mut self, encoding: Encoding) {
2401        self.format.set_encoding(encoding);
2402        self.mark_content_modified();
2403    }
2404
2405    /// Set the default encoding format for a new/empty buffer
2406    ///
2407    /// Unlike `set_encoding`, this does NOT mark the buffer as modified.
2408    /// This should be used when initializing a new buffer with a configured default.
2409    pub fn set_default_encoding(&mut self, encoding: Encoding) {
2410        self.format.set_default_encoding(encoding);
2411    }
2412
2413    /// Get the first line of the buffer as a lossy UTF-8 string, suitable
2414    /// for shebang / first-line grammar detection. Returns `None` for an
2415    /// empty buffer. Non-UTF-8 bytes are replaced with U+FFFD.
2416    pub fn first_line_lossy(&self) -> Option<String> {
2417        let bytes = self.get_line(0)?;
2418        if bytes.is_empty() {
2419            return None;
2420        }
2421        Some(String::from_utf8_lossy(&bytes).into_owned())
2422    }
2423
2424    /// Get text for a specific line
2425    pub fn get_line(&self, line: usize) -> Option<Vec<u8>> {
2426        let (start, end) = self.piece_tree.line_range(line, &self.buffers)?;
2427
2428        let bytes = if let Some(end_offset) = end {
2429            end_offset.saturating_sub(start)
2430        } else {
2431            self.total_bytes().saturating_sub(start)
2432        };
2433
2434        self.get_text_range(start, bytes)
2435    }
2436
2437    /// Get the byte offset where a line starts
2438    pub fn line_start_offset(&self, line: usize) -> Option<usize> {
2439        let (start, _) = self.piece_tree.line_range(line, &self.buffers)?;
2440        Some(start)
2441    }
2442
2443    /// Get piece information at a byte offset
2444    pub fn piece_info_at_offset(&self, offset: usize) -> Option<PieceInfo> {
2445        self.piece_tree.find_by_offset(offset)
2446    }
2447
2448    /// Get tree statistics for debugging
2449    pub fn stats(&self) -> TreeStats {
2450        self.piece_tree.stats()
2451    }
2452
2453    // Search and Replace Operations
2454
2455    /// Find the next occurrence of a pattern, with wrap-around
2456    pub fn find_next(&self, pattern: &str, start_pos: usize) -> Option<usize> {
2457        if pattern.is_empty() {
2458            return None;
2459        }
2460
2461        let pattern_bytes = pattern.as_bytes();
2462        let buffer_len = self.len();
2463
2464        // Search from start_pos to end
2465        if start_pos < buffer_len {
2466            if let Some(offset) = self.find_pattern(start_pos, buffer_len, pattern_bytes) {
2467                return Some(offset);
2468            }
2469        }
2470
2471        // Wrap around: search from beginning to start_pos
2472        if start_pos > 0 {
2473            if let Some(offset) = self.find_pattern(0, start_pos, pattern_bytes) {
2474                return Some(offset);
2475            }
2476        }
2477
2478        None
2479    }
2480
2481    /// Find the next occurrence of a pattern within an optional range
2482    /// If range is None, searches the entire buffer with wrap-around (same as find_next)
2483    /// If range is Some, searches only within that range without wrap-around
2484    pub fn find_next_in_range(
2485        &self,
2486        pattern: &str,
2487        start_pos: usize,
2488        range: Option<Range<usize>>,
2489    ) -> Option<usize> {
2490        if pattern.is_empty() {
2491            return None;
2492        }
2493
2494        if let Some(search_range) = range {
2495            // Search within range only, no wrap-around
2496            let pattern_bytes = pattern.as_bytes();
2497            let search_start = start_pos.max(search_range.start);
2498            let search_end = search_range.end.min(self.len());
2499
2500            if search_start < search_end {
2501                self.find_pattern(search_start, search_end, pattern_bytes)
2502            } else {
2503                None
2504            }
2505        } else {
2506            // No range specified, use normal find_next with wrap-around
2507            self.find_next(pattern, start_pos)
2508        }
2509    }
2510
2511    /// Find pattern in a byte range using overlapping chunks
2512    fn find_pattern(&self, start: usize, end: usize, pattern: &[u8]) -> Option<usize> {
2513        if pattern.is_empty() || start >= end {
2514            return None;
2515        }
2516
2517        const CHUNK_SIZE: usize = 65536; // 64KB chunks
2518        let overlap = pattern.len().saturating_sub(1).max(1);
2519
2520        // Use the overlapping chunks iterator for efficient streaming search
2521        let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, overlap);
2522
2523        for chunk in chunks {
2524            // Search the entire chunk buffer
2525            if let Some(pos) = Self::find_in_bytes(&chunk.buffer, pattern) {
2526                let match_end = pos + pattern.len();
2527                // Only report if match ENDS in or after the valid zone
2528                // This ensures patterns spanning boundaries are found exactly once
2529                if match_end > chunk.valid_start {
2530                    let absolute_pos = chunk.absolute_pos + pos;
2531                    // Verify the match doesn't extend beyond our search range
2532                    if absolute_pos + pattern.len() <= end {
2533                        return Some(absolute_pos);
2534                    }
2535                }
2536            }
2537        }
2538
2539        None
2540    }
2541
/// Locate the first occurrence of `needle` in `haystack` with a plain
/// sliding-window comparison.
///
/// Returns `None` for an empty needle or a needle longer than the
/// haystack.
fn find_in_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() || haystack.len() < needle.len() {
        return None;
    }

    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
2550
2551    /// Find the next occurrence of a regex pattern, with wrap-around
2552    pub fn find_next_regex(&self, regex: &Regex, start_pos: usize) -> Option<usize> {
2553        let buffer_len = self.len();
2554
2555        // Search from start_pos to end
2556        if start_pos < buffer_len {
2557            if let Some(offset) = self.find_regex(start_pos, buffer_len, regex) {
2558                return Some(offset);
2559            }
2560        }
2561
2562        // Wrap around: search from beginning to start_pos
2563        if start_pos > 0 {
2564            if let Some(offset) = self.find_regex(0, start_pos, regex) {
2565                return Some(offset);
2566            }
2567        }
2568
2569        None
2570    }
2571
2572    /// Find the next occurrence of a regex pattern within an optional range
2573    pub fn find_next_regex_in_range(
2574        &self,
2575        regex: &Regex,
2576        start_pos: usize,
2577        range: Option<Range<usize>>,
2578    ) -> Option<usize> {
2579        if let Some(search_range) = range {
2580            let search_start = start_pos.max(search_range.start);
2581            let search_end = search_range.end.min(self.len());
2582
2583            if search_start < search_end {
2584                self.find_regex(search_start, search_end, regex)
2585            } else {
2586                None
2587            }
2588        } else {
2589            self.find_next_regex(regex, start_pos)
2590        }
2591    }
2592
2593    /// Find regex pattern in a byte range using overlapping chunks
2594    fn find_regex(&self, start: usize, end: usize, regex: &Regex) -> Option<usize> {
2595        if start >= end {
2596            return None;
2597        }
2598
2599        const CHUNK_SIZE: usize = 1048576; // 1MB chunks
2600        const OVERLAP: usize = 4096; // 4KB overlap for regex
2601
2602        // Use the overlapping chunks iterator for efficient streaming search
2603        // This fixes the critical bug where regex patterns spanning chunk boundaries were missed
2604        let chunks = OverlappingChunks::new(self, start, end, CHUNK_SIZE, OVERLAP);
2605
2606        for chunk in chunks {
2607            // Search the entire chunk buffer
2608            if let Some(mat) = regex.find(&chunk.buffer) {
2609                let match_end = mat.end();
2610                // Only report if match ENDS in or after the valid zone
2611                // This ensures patterns spanning boundaries are found exactly once
2612                if match_end > chunk.valid_start {
2613                    let absolute_pos = chunk.absolute_pos + mat.start();
2614                    // Verify the match doesn't extend beyond our search range
2615                    let match_len = mat.end() - mat.start();
2616                    if absolute_pos + match_len <= end {
2617                        return Some(absolute_pos);
2618                    }
2619                }
2620            }
2621        }
2622
2623        None
2624    }
2625
2626    /// Replace a range with replacement text
2627    pub fn replace_range(&mut self, range: Range<usize>, replacement: &str) -> bool {
2628        if range.start >= self.len() {
2629            return false;
2630        }
2631
2632        let end = range.end.min(self.len());
2633        if end > range.start {
2634            self.delete_bytes(range.start, end - range.start);
2635        }
2636
2637        if !replacement.is_empty() {
2638            self.insert(range.start, replacement);
2639        }
2640
2641        true
2642    }
2643
2644    /// Find and replace the next occurrence of a pattern
2645    pub fn replace_next(
2646        &mut self,
2647        pattern: &str,
2648        replacement: &str,
2649        start_pos: usize,
2650        range: Option<Range<usize>>,
2651    ) -> Option<usize> {
2652        if let Some(pos) = self.find_next_in_range(pattern, start_pos, range.clone()) {
2653            self.replace_range(pos..pos + pattern.len(), replacement);
2654            Some(pos)
2655        } else {
2656            None
2657        }
2658    }
2659
2660    /// Replace all occurrences of a pattern with replacement text
2661    pub fn replace_all(&mut self, pattern: &str, replacement: &str) -> usize {
2662        if pattern.is_empty() {
2663            return 0;
2664        }
2665
2666        let mut count = 0;
2667        let mut pos = 0;
2668
2669        // Keep searching and replacing
2670        // Note: we search forward from last replacement to handle growth/shrinkage
2671        // Find next occurrence (no wrap-around for replace_all)
2672        while let Some(found_pos) = self.find_next_in_range(pattern, pos, Some(0..self.len())) {
2673            self.replace_range(found_pos..found_pos + pattern.len(), replacement);
2674            count += 1;
2675
2676            // Move past the replacement
2677            pos = found_pos + replacement.len();
2678
2679            // If we're at or past the end, stop
2680            if pos >= self.len() {
2681                break;
2682            }
2683        }
2684
2685        count
2686    }
2687
2688    /// Replace all occurrences of a regex pattern with replacement text
2689    pub fn replace_all_regex(&mut self, regex: &Regex, replacement: &str) -> Result<usize> {
2690        let mut count = 0;
2691        let mut pos = 0;
2692
2693        while let Some(found_pos) = self.find_next_regex_in_range(regex, pos, Some(0..self.len())) {
2694            // Get the match to find its length
2695            let text = self
2696                .get_text_range_mut(found_pos, self.len() - found_pos)
2697                .context("Failed to read text for regex match")?;
2698
2699            if let Some(mat) = regex.find(&text) {
2700                self.replace_range(found_pos..found_pos + mat.len(), replacement);
2701                count += 1;
2702                pos = found_pos + replacement.len();
2703
2704                if pos >= self.len() {
2705                    break;
2706                }
2707            } else {
2708                break;
2709            }
2710        }
2711
2712        Ok(count)
2713    }
2714
2715    // LSP Support (UTF-16 conversions)
2716
2717    /// Convert byte position to (line, column) in bytes
2718    pub fn position_to_line_col(&self, byte_pos: usize) -> (usize, usize) {
2719        self.offset_to_position(byte_pos)
2720            .map(|pos| (pos.line, pos.column))
2721            .unwrap_or_else(|| (byte_pos / 80, 0)) // Estimate if metadata unavailable
2722    }
2723
2724    /// Convert (line, character) to byte position - 0-indexed
2725    /// character is in BYTES, not UTF-16 code units
2726    /// Optimized to use single line_range() call instead of two
2727    pub fn line_col_to_position(&self, line: usize, character: usize) -> usize {
2728        if let Some((start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2729            // Calculate line length from the range
2730            let line_len = if let Some(end_offset) = end {
2731                end_offset.saturating_sub(start)
2732            } else {
2733                self.total_bytes().saturating_sub(start)
2734            };
2735            let byte_offset = character.min(line_len);
2736            start + byte_offset
2737        } else {
2738            // Line doesn't exist, return end of buffer
2739            self.len()
2740        }
2741    }
2742
2743    /// Convert byte position to LSP position (line, UTF-16 code units)
2744    /// LSP protocol uses UTF-16 code units for character offsets
2745    pub fn position_to_lsp_position(&self, byte_pos: usize) -> (usize, usize) {
2746        let (line, column_bytes) = self
2747            .offset_to_position(byte_pos)
2748            .map(|pos| (pos.line, pos.column))
2749            .unwrap_or_else(|| (byte_pos / 80, 0)); // Estimate if metadata unavailable
2750
2751        // Get the line content
2752        if let Some(line_bytes) = self.get_line(line) {
2753            // Convert byte offset to UTF-16 code units
2754            let text_before = &line_bytes[..column_bytes.min(line_bytes.len())];
2755            let text_str = String::from_utf8_lossy(text_before);
2756            let utf16_offset = text_str.encode_utf16().count();
2757            (line, utf16_offset)
2758        } else {
2759            (line, 0)
2760        }
2761    }
2762
2763    /// Convert LSP position (line, UTF-16 code units) to byte position
2764    /// LSP uses UTF-16 code units for character offsets, not bytes
2765    /// Optimized to use single line_range() call instead of two
2766    pub fn lsp_position_to_byte(&self, line: usize, utf16_offset: usize) -> usize {
2767        if let Some((line_start, end)) = self.piece_tree.line_range(line, &self.buffers) {
2768            // Calculate line length and get line content
2769            let line_len = if let Some(end_offset) = end {
2770                end_offset.saturating_sub(line_start)
2771            } else {
2772                self.total_bytes().saturating_sub(line_start)
2773            };
2774
2775            if line_len > 0 {
2776                // If data is unloaded, return line_start as fallback
2777                let Some(line_bytes) = self.get_text_range(line_start, line_len) else {
2778                    return line_start;
2779                };
2780                let line_str = String::from_utf8_lossy(&line_bytes);
2781
2782                // Convert UTF-16 offset to byte offset
2783                let mut utf16_count = 0;
2784                let mut byte_offset = 0;
2785
2786                for ch in line_str.chars() {
2787                    if utf16_count >= utf16_offset {
2788                        break;
2789                    }
2790                    utf16_count += ch.len_utf16();
2791                    byte_offset += ch.len_utf8();
2792                }
2793
2794                line_start + byte_offset
2795            } else {
2796                line_start
2797            }
2798        } else {
2799            // Line doesn't exist, return end of buffer
2800            self.len()
2801        }
2802    }
2803
2804    // Navigation helpers
2805
2806    /// Find the previous character boundary (UTF-8 aware)
2807    pub fn prev_char_boundary(&self, pos: usize) -> usize {
2808        if pos == 0 {
2809            return 0;
2810        }
2811
2812        // Get a few bytes before pos to find the character boundary
2813        let start = pos.saturating_sub(4);
2814        let Some(bytes) = self.get_text_range(start, pos - start) else {
2815            // Data unloaded, return pos as fallback
2816            return pos;
2817        };
2818
2819        // Walk backwards to find a UTF-8 leading byte
2820        for i in (0..bytes.len()).rev() {
2821            let byte = bytes[i];
2822            // Check if this is a UTF-8 leading byte (not a continuation byte)
2823            if (byte & 0b1100_0000) != 0b1000_0000 {
2824                return start + i;
2825            }
2826        }
2827
2828        // Fallback
2829        pos.saturating_sub(1)
2830    }
2831
2832    /// Find the next character boundary (UTF-8 aware)
2833    pub fn next_char_boundary(&self, pos: usize) -> usize {
2834        let len = self.len();
2835        if pos >= len {
2836            return len;
2837        }
2838
2839        // Get a few bytes after pos to find the character boundary
2840        let end = (pos + 5).min(len);
2841        let Some(bytes) = self.get_text_range(pos, end - pos) else {
2842            // Data unloaded, return pos as fallback
2843            return pos;
2844        };
2845
2846        // Start from index 1 (we want the NEXT boundary)
2847        for (i, &byte) in bytes.iter().enumerate().skip(1) {
2848            // Check if this is a UTF-8 leading byte (not a continuation byte)
2849            if (byte & 0b1100_0000) != 0b1000_0000 {
2850                return pos + i;
2851            }
2852        }
2853
2854        // If we got here, we're at the end or found no boundary in the range
2855        end
2856    }
2857
/// Whether `byte` is a UTF-8 continuation byte, i.e. has the bit pattern
/// `10xxxxxx` (values `0x80..=0xBF`).
///
/// A position whose byte is a continuation byte is not a character
/// boundary.
#[inline]
fn is_utf8_continuation_byte(byte: u8) -> bool {
    matches!(byte, 0x80..=0xBF)
}
2865
2866    /// Snap position to a valid UTF-8 character boundary
2867    /// If already at a boundary, returns the same position.
2868    /// Otherwise, moves to the previous valid boundary.
2869    pub fn snap_to_char_boundary(&self, pos: usize) -> usize {
2870        let len = self.len();
2871        if pos == 0 || pos >= len {
2872            return pos.min(len);
2873        }
2874
2875        // Get the byte at pos to check if we're at a character boundary
2876        let Some(bytes) = self.get_text_range(pos, 1) else {
2877            // Data unloaded, return pos as fallback
2878            return pos;
2879        };
2880
2881        // A position is at a char boundary if the byte there is NOT a continuation byte
2882        if !Self::is_utf8_continuation_byte(bytes[0]) {
2883            // Already at a character boundary
2884            return pos;
2885        }
2886
2887        // Not at a boundary, find the previous one
2888        self.prev_char_boundary(pos)
2889    }
2890
2891    /// Find the previous grapheme cluster boundary (for proper cursor movement with combining characters)
2892    ///
2893    /// This handles complex scripts like Thai where multiple Unicode code points
2894    /// form a single visual character (grapheme cluster). For example, Thai "ที่"
2895    /// is 3 code points but 1 grapheme cluster.
2896    ///
2897    /// The lookahead window starts at 32 bytes but grows whenever the
2898    /// returned boundary sits at the start of the chunk — that is, whenever
2899    /// the chunk might not contain the full grapheme. This matters for ZWJ
2900    /// emoji sequences and Zalgo strings with many combining marks, which
2901    /// can easily exceed 32 bytes.
2902    pub fn prev_grapheme_boundary(&self, pos: usize) -> usize {
2903        if pos == 0 {
2904            return 0;
2905        }
2906
2907        let mut lookback: usize = 32;
2908        loop {
2909            // IMPORTANT: Align start to a valid character boundary to avoid invalid UTF-8
2910            // when get_text_range starts mid-character
2911            let raw_start = pos.saturating_sub(lookback);
2912            let start = if raw_start == 0 {
2913                0
2914            } else {
2915                // Find the character boundary at or before raw_start
2916                self.prev_char_boundary(raw_start + 1)
2917            };
2918
2919            let Some(bytes) = self.get_text_range(start, pos - start) else {
2920                // Data unloaded, fall back to char boundary
2921                return self.prev_char_boundary(pos);
2922            };
2923
2924            let text = match std::str::from_utf8(&bytes) {
2925                Ok(s) => s,
2926                Err(e) => {
2927                    // Still got invalid UTF-8 (shouldn't happen after alignment)
2928                    // Try using just the valid portion
2929                    let valid_bytes = &bytes[..e.valid_up_to()];
2930                    match std::str::from_utf8(valid_bytes) {
2931                        Ok(s) if !s.is_empty() => s,
2932                        _ => return self.prev_char_boundary(pos),
2933                    }
2934                }
2935            };
2936
2937            // Use shared grapheme utility with relative position
2938            let rel_pos = pos - start;
2939            let new_rel_pos = grapheme::prev_grapheme_boundary(text, rel_pos);
2940
2941            // If the returned boundary is at the start of our chunk, the
2942            // grapheme may extend further back. Only trust the answer when
2943            // either we already reached the beginning of the buffer or the
2944            // boundary sits strictly inside the chunk.
2945            if new_rel_pos > 0 || start == 0 {
2946                return start + new_rel_pos;
2947            }
2948
2949            // Expand the lookback window and retry. Cap at the full buffer.
2950            if lookback >= pos {
2951                return 0;
2952            }
2953            lookback = lookback.saturating_mul(2);
2954        }
2955    }
2956
2957    /// Find the next grapheme cluster boundary (for proper cursor movement with combining characters)
2958    ///
2959    /// This handles complex scripts like Thai where multiple Unicode code points
2960    /// form a single visual character (grapheme cluster). For example, Thai "ที่"
2961    /// is 3 code points but 1 grapheme cluster.
2962    ///
2963    /// The lookahead window grows whenever the first grapheme reaches the
2964    /// end of the chunk — otherwise ZWJ emoji and Zalgo strings whose byte
2965    /// length exceeds the initial 32-byte window would be split mid-cluster.
2966    pub fn next_grapheme_boundary(&self, pos: usize) -> usize {
2967        let len = self.len();
2968        if pos >= len {
2969            return len;
2970        }
2971
2972        let mut lookahead: usize = 32;
2973        loop {
2974            let end = (pos + lookahead).min(len);
2975            let Some(bytes) = self.get_text_range(pos, end - pos) else {
2976                // Data unloaded, fall back to char boundary
2977                return self.next_char_boundary(pos);
2978            };
2979
2980            // Convert to UTF-8 string, handling the case where we might have
2981            // grabbed bytes that end mid-character (truncate to valid UTF-8)
2982            let text = match std::str::from_utf8(&bytes) {
2983                Ok(s) => s,
2984                Err(e) => {
2985                    // The bytes end in an incomplete UTF-8 sequence
2986                    // Use only the valid portion (which includes at least the first grapheme)
2987                    let valid_bytes = &bytes[..e.valid_up_to()];
2988                    match std::str::from_utf8(valid_bytes) {
2989                        Ok(s) if !s.is_empty() => s,
2990                        _ => return self.next_char_boundary(pos),
2991                    }
2992                }
2993            };
2994
2995            let new_rel_pos = grapheme::next_grapheme_boundary(text, 0);
2996
2997            // If the first grapheme reaches the end of our chunk and there
2998            // is more buffer left beyond it, the grapheme may extend further.
2999            // Expand the window and retry.
3000            if new_rel_pos == text.len() && end < len {
3001                if lookahead >= len - pos {
3002                    return len;
3003                }
3004                lookahead = lookahead.saturating_mul(2);
3005                continue;
3006            }
3007
3008            return pos + new_rel_pos;
3009        }
3010    }
3011
3012    /// Find the previous word boundary
3013    pub fn prev_word_boundary(&self, pos: usize) -> usize {
3014        if pos == 0 {
3015            return 0;
3016        }
3017
3018        // Get some text before pos
3019        let start = pos.saturating_sub(256).max(0);
3020        let Some(bytes) = self.get_text_range(start, pos - start) else {
3021            // Data unloaded, return pos as fallback
3022            return pos;
3023        };
3024        let text = String::from_utf8_lossy(&bytes);
3025
3026        let mut found_word_char = false;
3027        let chars: Vec<char> = text.chars().collect();
3028
3029        for i in (0..chars.len()).rev() {
3030            let ch = chars[i];
3031            let is_word_char = ch.is_alphanumeric() || ch == '_';
3032
3033            if found_word_char && !is_word_char {
3034                // We've transitioned from word to non-word
3035                // Calculate the byte position
3036                let byte_offset: usize = chars[0..=i].iter().map(|c| c.len_utf8()).sum();
3037                return start + byte_offset;
3038            }
3039
3040            if is_word_char {
3041                found_word_char = true;
3042            }
3043        }
3044
3045        0
3046    }
3047
3048    /// Find the next word boundary
3049    pub fn next_word_boundary(&self, pos: usize) -> usize {
3050        let len = self.len();
3051        if pos >= len {
3052            return len;
3053        }
3054
3055        // Get some text after pos
3056        let end = (pos + 256).min(len);
3057        let Some(bytes) = self.get_text_range(pos, end - pos) else {
3058            // Data unloaded, return pos as fallback
3059            return pos;
3060        };
3061        let text = String::from_utf8_lossy(&bytes);
3062
3063        let mut found_word_char = false;
3064        let mut byte_offset = 0;
3065
3066        for ch in text.chars() {
3067            let is_word_char = ch.is_alphanumeric() || ch == '_';
3068
3069            if found_word_char && !is_word_char {
3070                // We've transitioned from word to non-word
3071                return pos + byte_offset;
3072            }
3073
3074            if is_word_char {
3075                found_word_char = true;
3076            }
3077
3078            byte_offset += ch.len_utf8();
3079        }
3080
3081        len
3082    }
3083
3084    /// Create a line iterator starting at the given byte position
3085    ///
3086    /// This iterator lazily loads chunks as needed, never scanning the entire file.
3087    /// For large files with unloaded buffers, chunks are loaded on-demand (1MB at a time).
3088    pub fn line_iterator(
3089        &mut self,
3090        byte_pos: usize,
3091        estimated_line_length: usize,
3092    ) -> LineIterator<'_> {
3093        LineIterator::new(self, byte_pos, estimated_line_length)
3094    }
3095
3096    /// Iterate over lines starting from a given byte offset, with line numbers
3097    ///
3098    /// This is a more efficient alternative to using line_iterator() + offset_to_position()
3099    /// because it calculates line numbers incrementally during iteration by accumulating
3100    /// line_feed_cnt from pieces (which is already tracked in the piece tree).
3101    ///
3102    /// Returns: Iterator yielding (byte_offset, content, line_number: Option<usize>)
3103    /// - line_number is Some(n) for small files with line metadata
3104    /// - line_number is None for large files without line metadata
3105    ///
3106    /// # Performance
3107    /// - O(1) per line for line number calculation (vs O(log n) per line with offset_to_position)
3108    /// - Uses single source of truth: piece tree's existing line_feed_cnt metadata
3109    pub fn iter_lines_from(
3110        &mut self,
3111        byte_pos: usize,
3112        max_lines: usize,
3113    ) -> Result<TextBufferLineIterator> {
3114        TextBufferLineIterator::new(self, byte_pos, max_lines)
3115    }
3116
3117    // Legacy API methods for backwards compatibility
3118
3119    /// Get the line number for a given byte offset
3120    ///
3121    /// Returns exact line number if metadata available, otherwise estimates based on bytes.
3122    ///
3123    /// # Behavior by File Size:
3124    /// - **Small files (< 1MB)**: Returns exact line number from piece tree's `line_starts` metadata
3125    /// - **Large files (≥ 1MB)**: Returns estimated line number using `byte_offset / estimated_line_length`
3126    ///
3127    /// Large files don't maintain line metadata for performance reasons. The estimation
3128    /// uses the configured `estimated_line_length` (default 80 bytes).
3129    pub fn get_line_number(&self, byte_offset: usize) -> usize {
3130        self.offset_to_position(byte_offset)
3131            .map(|pos| pos.line)
3132            .unwrap_or_else(|| {
3133                // Estimate line number based on configured average line length
3134                byte_offset / self.config.estimated_line_length
3135            })
3136    }
3137
3138    /// Get the configured estimated line length for approximate line number calculations.
3139    pub fn estimated_line_length(&self) -> usize {
3140        self.config.estimated_line_length
3141    }
3142
3143    /// Get the starting line number at a byte offset (used for viewport rendering)
3144    ///
3145    /// # Line Cache Architecture (Post-Refactoring):
3146    ///
3147    /// The concept of a separate "line cache" is **now obsolete**. After the refactoring,
3148    /// line tracking is integrated directly into the piece tree via:
3149    /// ```rust
3150    /// BufferData::Loaded {
3151    ///     data: Vec<u8>,
3152    ///     line_starts: Option<Vec<usize>>  // None = large file mode (no line metadata)
3153    /// }
3154    /// ```
3155    ///
3156    /// ## Why This Method Still Exists:
3157    /// The rendering code needs to know what line number to display in the margin at the
3158    /// top of the viewport. This method returns that line number, handling both small
3159    /// and large file modes transparently.
3160    ///
3161    /// ## Small vs Large File Modes:
3162    /// - **Small files**: `line_starts = Some(vec)` → returns exact line number from metadata
3163    /// - **Large files**: `line_starts = None` → returns estimated line number (byte_offset / estimated_line_length)
3164    ///
3165    /// ## Legacy Line Cache Methods:
3166    /// These methods are now no-ops and can be removed in a future cleanup:
3167    /// - `invalidate_line_cache_from()` - No-op (piece tree updates automatically)
3168    /// - `handle_line_cache_insertion()` - No-op (piece tree updates automatically)
3169    /// - `handle_line_cache_deletion()` - No-op (piece tree updates automatically)
3170    /// - `clear_line_cache()` - No-op (can't clear piece tree metadata)
3171    ///
3172    /// ## Bug Fix (2025-11):
3173    /// Previously this method always returned `0`, causing line numbers in the margin
3174    /// to always show 1, 2, 3... regardless of scroll position. Now it correctly returns
3175    /// the actual line number at `start_byte`.
3176    pub fn populate_line_cache(&mut self, start_byte: usize, _line_count: usize) -> usize {
3177        // No-op for cache population: LineIndex maintains all line starts automatically
3178        // But we need to return the actual line number at start_byte for rendering
3179        self.get_line_number(start_byte)
3180    }
3181
3182    /// Get cached byte offset for line (compatibility method)
3183    pub fn get_cached_byte_offset_for_line(&self, line_number: usize) -> Option<usize> {
3184        self.line_start_offset(line_number)
3185    }
3186
3187    /// Invalidate line cache from offset (no-op in new implementation)
3188    pub fn invalidate_line_cache_from(&mut self, _byte_offset: usize) {
3189        // No-op: LineIndex updates automatically
3190    }
3191
3192    /// Handle line cache insertion (no-op in new implementation)
3193    pub fn handle_line_cache_insertion(&mut self, _byte_offset: usize, _bytes_inserted: usize) {
3194        // No-op: LineIndex updates automatically during insert
3195    }
3196
3197    /// Handle line cache deletion (no-op in new implementation)
3198    pub fn handle_line_cache_deletion(&mut self, _byte_offset: usize, _bytes_deleted: usize) {
3199        // No-op: LineIndex updates automatically during delete
3200    }
3201
3202    /// Clear line cache (no-op in new implementation)
3203    pub fn clear_line_cache(&mut self) {
3204        // No-op: LineIndex can't be cleared
3205    }
3206
3207    // Test helper methods
3208
3209    /// Create a buffer from a string for testing
3210    #[cfg(test)]
3211    pub fn from_str_test(s: &str) -> Self {
3212        Self::from_bytes(
3213            s.as_bytes().to_vec(),
3214            std::sync::Arc::new(crate::model::filesystem::StdFileSystem),
3215        )
3216    }
3217
3218    /// Create a new empty buffer for testing
3219    #[cfg(test)]
3220    pub fn new_test() -> Self {
3221        Self::empty(std::sync::Arc::new(crate::model::filesystem::StdFileSystem))
3222    }
3223}
3224
3225/// Type alias for backwards compatibility
3226pub type Buffer = TextBuffer;
3227
3228// Re-export LineIterator from the line_iterator module
3229pub use crate::primitives::line_iterator::LineIterator;
3230
3231// ============================================================================
3232// Overlapping Chunks Iterator for Efficient Search
3233// ============================================================================
3234
/// One chunk of document bytes handed out by [`OverlappingChunks`].
#[derive(Debug)]
pub struct ChunkInfo {
    /// Bytes of this chunk. For every chunk after the first, the leading
    /// bytes are carried over from the end of the previous chunk.
    pub buffer: Vec<u8>,

    /// Absolute document offset of `buffer[0]`.
    pub absolute_pos: usize,

    /// Index in `buffer` at which bytes not present in the previous chunk
    /// begin. Anything lying entirely before this index was already visible
    /// in the previous chunk; `0` for the first chunk.
    pub valid_start: usize,
}
3248
3249/// Iterator that yields overlapping chunks for pattern matching
3250///
3251/// This iterator implements the VSCode/Sublime approach: pull overlapping chunks
3252/// from the underlying piece tree and use standard search algorithms on them.
3253///
3254/// # Algorithm
3255///
3256/// ```text
3257/// Chunk 1: [------------ valid -----------]
3258/// Chunk 2:      [overlap][---- valid ----]
3259/// Chunk 3:                   [overlap][-- valid --]
3260///
3261/// Only matches starting in the "valid" zone are reported to avoid duplicates.
3262/// ```
3263///
3264/// # Example
3265///
3266/// ```ignore
3267/// let chunks = OverlappingChunks::new(&text_buffer, start, end, 4096, pattern.len()-1);
3268/// for chunk in chunks {
3269///     // Search only starting from chunk.valid_start
3270///     if let Some(pos) = search(&chunk.buffer[chunk.valid_start..]) {
3271///         let absolute_pos = chunk.absolute_pos + chunk.valid_start + pos;
3272///         return Some(absolute_pos);
3273///     }
3274/// }
3275/// ```
3276pub struct OverlappingChunks<'a> {
3277    piece_iter: PieceRangeIter,
3278    buffers: &'a [StringBuffer],
3279
3280    // Reusable chunk buffer that we fill from pieces
3281    buffer: Vec<u8>,
3282    buffer_absolute_pos: usize,
3283
3284    // Current state
3285    current_pos: usize,
3286    end_pos: usize,
3287
3288    // Configuration
3289    chunk_size: usize,
3290    overlap: usize,
3291
3292    // Track first chunk special case
3293    first_chunk: bool,
3294
3295    // Cached piece data for incremental reading
3296    current_piece_data: Option<Vec<u8>>,
3297    current_piece_offset: usize,
3298}
3299
3300impl<'a> OverlappingChunks<'a> {
3301    /// Create a new overlapping chunks iterator
3302    ///
3303    /// # Arguments
3304    ///
3305    /// * `text_buffer` - The text buffer to iterate over
3306    /// * `start` - Start position in the document
3307    /// * `end` - End position in the document (exclusive)
3308    /// * `chunk_size` - Target size for each chunk (excluding overlap)
3309    /// * `overlap` - Number of bytes to overlap between chunks
3310    ///
3311    /// # Recommendations
3312    ///
3313    /// * For literal string search: `chunk_size=65536, overlap=pattern.len()-1`
3314    /// * For regex search: `chunk_size=1048576, overlap=4096`
3315    pub fn new(
3316        text_buffer: &'a TextBuffer,
3317        start: usize,
3318        end: usize,
3319        chunk_size: usize,
3320        overlap: usize,
3321    ) -> Self {
3322        let piece_iter = text_buffer.piece_tree.iter_pieces_in_range(start, end);
3323
3324        Self {
3325            piece_iter,
3326            buffers: &text_buffer.buffers,
3327            buffer: Vec::with_capacity(chunk_size + overlap),
3328            buffer_absolute_pos: start,
3329            current_pos: start,
3330            end_pos: end,
3331            chunk_size,
3332            overlap,
3333            first_chunk: true,
3334            current_piece_data: None,
3335            current_piece_offset: 0,
3336        }
3337    }
3338
3339    /// Read one byte from the piece iterator
3340    fn read_byte(&mut self) -> Option<u8> {
3341        loop {
3342            // If we have cached piece data, read from it
3343            if let Some(ref data) = self.current_piece_data {
3344                if self.current_piece_offset < data.len() {
3345                    let byte = data[self.current_piece_offset];
3346                    self.current_piece_offset += 1;
3347                    self.current_pos += 1;
3348                    return Some(byte);
3349                } else {
3350                    // Exhausted current piece, move to next
3351                    self.current_piece_data = None;
3352                    self.current_piece_offset = 0;
3353                }
3354            }
3355
3356            // Get next piece
3357            if let Some(piece_view) = self.piece_iter.next() {
3358                let buffer_id = piece_view.location.buffer_id();
3359                if let Some(buffer) = self.buffers.get(buffer_id) {
3360                    // Extract the relevant slice from this piece
3361                    let piece_start_in_doc = piece_view.doc_offset;
3362                    let piece_end_in_doc = piece_view.doc_offset + piece_view.bytes;
3363
3364                    // Clip to our search range
3365                    let read_start = self.current_pos.max(piece_start_in_doc);
3366                    let read_end = self.end_pos.min(piece_end_in_doc);
3367
3368                    if read_end > read_start {
3369                        let offset_in_piece = read_start - piece_start_in_doc;
3370                        let bytes_to_read = read_end - read_start;
3371
3372                        let buffer_start = piece_view.buffer_offset + offset_in_piece;
3373                        let buffer_end = buffer_start + bytes_to_read;
3374
3375                        if let Some(data) = buffer.get_data() {
3376                            if buffer_end <= data.len() {
3377                                // Cache this piece's data
3378                                self.current_piece_data =
3379                                    Some(data[buffer_start..buffer_end].to_vec());
3380                                self.current_piece_offset = 0;
3381                                continue;
3382                            }
3383                        }
3384                    }
3385                }
3386            }
3387
3388            // No more data
3389            return None;
3390        }
3391    }
3392
3393    /// Fill the buffer with the next chunk of data
3394    fn fill_next_chunk(&mut self) -> bool {
3395        if self.first_chunk {
3396            // First chunk: fill up to chunk_size
3397            self.first_chunk = false;
3398            while self.buffer.len() < self.chunk_size && self.current_pos < self.end_pos {
3399                if let Some(byte) = self.read_byte() {
3400                    self.buffer.push(byte);
3401                } else {
3402                    break;
3403                }
3404            }
3405            !self.buffer.is_empty()
3406        } else {
3407            // Subsequent chunks: keep overlap, fill chunk_size NEW bytes
3408            if self.current_pos >= self.end_pos {
3409                return false;
3410            }
3411
3412            // Keep overlap bytes at the end
3413            if self.buffer.len() > self.overlap {
3414                let drain_amount = self.buffer.len() - self.overlap;
3415                self.buffer.drain(0..drain_amount);
3416                self.buffer_absolute_pos += drain_amount;
3417            }
3418
3419            // Fill chunk_size NEW bytes (in addition to overlap)
3420            let before_len = self.buffer.len();
3421            let target_len = self.overlap + self.chunk_size;
3422            while self.buffer.len() < target_len && self.current_pos < self.end_pos {
3423                if let Some(byte) = self.read_byte() {
3424                    self.buffer.push(byte);
3425                } else {
3426                    break;
3427                }
3428            }
3429
3430            // Return true if we added new data
3431            self.buffer.len() > before_len
3432        }
3433    }
3434}
3435
3436impl<'a> Iterator for OverlappingChunks<'a> {
3437    type Item = ChunkInfo;
3438
3439    fn next(&mut self) -> Option<Self::Item> {
3440        // Track if this is the first chunk before filling
3441        let is_first = self.buffer_absolute_pos == self.current_pos;
3442
3443        if !self.fill_next_chunk() {
3444            return None;
3445        }
3446
3447        // First chunk: all data is valid (no overlap from previous)
3448        // Subsequent chunks: overlap bytes are not valid (already checked)
3449        let valid_start = if is_first {
3450            0
3451        } else {
3452            self.overlap.min(self.buffer.len())
3453        };
3454
3455        Some(ChunkInfo {
3456            buffer: self.buffer.clone(),
3457            absolute_pos: self.buffer_absolute_pos,
3458            valid_start,
3459        })
3460    }
3461}
3462
3463#[cfg(test)]
3464mod tests;
3465
3466#[cfg(test)]
3467mod property_tests;
3468
/// A single line produced by [`TextBufferLineIterator`].
#[derive(Debug, Clone)]
pub struct LineData {
    /// Document byte offset of the first byte of this entry
    pub byte_offset: usize,
    /// Text of the line with a trailing `\n` (if any) removed
    pub content: String,
    /// `true` when the line was terminated by `\n`
    pub has_newline: bool,
    /// Line number, or `None` when the buffer has no line metadata
    pub line_number: Option<usize>,
}
3481
3482/// Iterator over lines in a TextBuffer that efficiently tracks line numbers
3483/// using piece tree metadata (single source of truth)
3484pub struct TextBufferLineIterator {
3485    /// Collected lines (we collect all at once since we need mutable access to load chunks)
3486    lines: Vec<LineData>,
3487    /// Current index in the lines vector
3488    current_index: usize,
3489    /// Whether there are more lines after these
3490    pub has_more: bool,
3491}
3492
3493impl TextBufferLineIterator {
3494    pub(crate) fn new(buffer: &mut TextBuffer, byte_pos: usize, max_lines: usize) -> Result<Self> {
3495        let buffer_len = buffer.len();
3496        if byte_pos >= buffer_len {
3497            return Ok(Self {
3498                lines: Vec::new(),
3499                current_index: 0,
3500                has_more: false,
3501            });
3502        }
3503
3504        // Check if buffer has line metadata (None for large files > 1MB)
3505        let has_line_metadata = buffer.line_count().is_some();
3506
3507        // Determine starting line number by querying piece tree once
3508        // (only if we have line metadata)
3509        let mut current_line = if has_line_metadata {
3510            buffer.offset_to_position(byte_pos).map(|pos| pos.line)
3511        } else {
3512            None
3513        };
3514
3515        let mut lines = Vec::with_capacity(max_lines);
3516        let mut current_offset = byte_pos;
3517        let estimated_line_length = 80; // Use default estimate
3518
3519        // Collect lines by scanning forward
3520        for _ in 0..max_lines {
3521            if current_offset >= buffer_len {
3522                break;
3523            }
3524
3525            let line_start = current_offset;
3526            let line_number = current_line;
3527
3528            // Estimate how many bytes to load for this line
3529            let estimated_max_line_length = estimated_line_length * 3;
3530            let bytes_to_scan = estimated_max_line_length.min(buffer_len - current_offset);
3531
3532            // Load chunk (this handles lazy loading)
3533            let chunk = buffer.get_text_range_mut(current_offset, bytes_to_scan)?;
3534
3535            // Scan for newline
3536            let mut line_len = 0;
3537            let mut found_newline = false;
3538            for &byte in chunk.iter() {
3539                line_len += 1;
3540                if byte == b'\n' {
3541                    found_newline = true;
3542                    break;
3543                }
3544            }
3545
3546            // Handle long lines (rare case)
3547            if !found_newline && current_offset + line_len < buffer_len {
3548                // Line is longer than expected, load more data
3549                let remaining = buffer_len - current_offset - line_len;
3550                let additional_bytes = estimated_max_line_length.min(remaining);
3551                let more_chunk =
3552                    buffer.get_text_range_mut(current_offset + line_len, additional_bytes)?;
3553
3554                let mut extended_chunk = chunk;
3555                extended_chunk.extend_from_slice(&more_chunk);
3556
3557                for &byte in more_chunk.iter() {
3558                    line_len += 1;
3559                    if byte == b'\n' {
3560                        found_newline = true;
3561                        break;
3562                    }
3563                }
3564
3565                let line_string = String::from_utf8_lossy(&extended_chunk[..line_len]).into_owned();
3566                let has_newline = line_string.ends_with('\n');
3567                let content = if has_newline {
3568                    line_string[..line_string.len() - 1].to_string()
3569                } else {
3570                    line_string
3571                };
3572
3573                lines.push(LineData {
3574                    byte_offset: line_start,
3575                    content,
3576                    has_newline,
3577                    line_number,
3578                });
3579
3580                current_offset += line_len;
3581                if has_line_metadata && found_newline {
3582                    current_line = current_line.map(|n| n + 1);
3583                }
3584                continue;
3585            }
3586
3587            // Normal case
3588            let line_string = String::from_utf8_lossy(&chunk[..line_len]).into_owned();
3589            let has_newline = line_string.ends_with('\n');
3590            let content = if has_newline {
3591                line_string[..line_string.len() - 1].to_string()
3592            } else {
3593                line_string
3594            };
3595
3596            lines.push(LineData {
3597                byte_offset: line_start,
3598                content,
3599                has_newline,
3600                line_number,
3601            });
3602
3603            current_offset += line_len;
3604            // Increment line number if we have metadata and found a newline
3605            if has_line_metadata && found_newline {
3606                current_line = current_line.map(|n| n + 1);
3607            }
3608        }
3609
3610        // Check if there are more lines
3611        let has_more = current_offset < buffer_len;
3612
3613        Ok(Self {
3614            lines,
3615            current_index: 0,
3616            has_more,
3617        })
3618    }
3619}
3620
3621impl Iterator for TextBufferLineIterator {
3622    type Item = LineData;
3623
3624    fn next(&mut self) -> Option<Self::Item> {
3625        if self.current_index < self.lines.len() {
3626            let line = self.lines[self.current_index].clone();
3627            self.current_index += 1;
3628            Some(line)
3629        } else {
3630            None
3631        }
3632    }
3633}