Skip to main content

scirs2_ndimage/
chunked_processing.rs

1//! Enhanced chunked image processing for large images
2//!
3//! This module provides memory-efficient, GPU-accelerated chunked processing
4//! for images that are too large to fit in memory. It supports:
5//!
6//! - **Chunked filter operations** with proper overlap handling
7//! - **Zero-copy transformations** using memory-mapped arrays
8//! - **Memory-efficient morphological operations**
9//! - **Out-of-core processing pipeline** for very large images
10//! - **GPU acceleration** for supported operations
11//!
12//! # Architecture
13//!
14//! The module is built around the `ChunkedImageProcessor` which manages:
15//! - Automatic chunk size optimization based on available memory
16//! - Overlap regions for seamless filtering across chunk boundaries
17//! - Parallel processing of independent chunks
18//! - GPU offloading for supported operations
19//!
20//! # Example
21//!
22//! ```rust,no_run
23//! use scirs2_ndimage::chunked_processing::{ChunkedImageProcessor, ChunkingConfig};
24//! use scirs2_core::ndarray::Array2;
25//!
26//! let processor = ChunkedImageProcessor::new(ChunkingConfig::default());
27//! let large_image = Array2::<f64>::zeros((10000, 10000));
28//!
29//! // Apply Gaussian filter in chunks
30//! let result = processor.gaussian_filter(&large_image.view(), 2.0, None).unwrap();
31//! ```
32
33use std::collections::VecDeque;
34use std::fmt::Debug;
35use std::fs::File;
36use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
37use std::marker::PhantomData;
38use std::path::{Path, PathBuf};
39use std::sync::atomic::{AtomicUsize, Ordering};
40use std::sync::Arc;
41
42use scirs2_core::ndarray::{
43    self as ndarray, Array, Array1, Array2, ArrayView, ArrayView2, ArrayViewMut, ArrayViewMut2,
44    Axis, Dimension, Ix2, IxDyn, Slice, SliceInfoElem,
45};
46use scirs2_core::numeric::{Float, FromPrimitive, NumCast, Zero};
47use scirs2_core::parallel_ops::*;
48
49use crate::error::{NdimageError, NdimageResult};
50use crate::filters::BorderMode;
51use crate::morphology::MorphBorderMode;
52
53// ============================================================================
54// Configuration Types
55// ============================================================================
56
57/// Configuration for chunked image processing
58#[derive(Debug, Clone)]
59pub struct ChunkingConfig {
60    /// Maximum chunk size in bytes (default: 64 MB)
61    pub max_chunk_bytes: usize,
62
63    /// Minimum chunk size in pixels per dimension (default: 32)
64    pub min_chunk_pixels: usize,
65
66    /// Maximum overlap in pixels per dimension (default: 64)
67    pub max_overlap_pixels: usize,
68
69    /// Whether to enable parallel chunk processing (default: true)
70    pub enable_parallel: bool,
71
72    /// Number of chunks to prefetch for pipelining (default: 2)
73    pub prefetch_count: usize,
74
75    /// Whether to use memory-mapped files for very large images (default: true)
76    pub use_mmap: bool,
77
78    /// Threshold for using memory-mapped files in bytes (default: 1 GB)
79    pub mmap_threshold: usize,
80
81    /// Temporary directory for intermediate files
82    pub temp_dir: Option<PathBuf>,
83
84    /// Whether to use GPU acceleration when available (default: true)
85    pub enable_gpu: bool,
86
87    /// Memory fraction to use for processing (0.0 - 1.0, default: 0.5)
88    pub memory_fraction: f64,
89
90    /// Strategy for handling overlap regions
91    pub overlap_strategy: OverlapStrategy,
92}
93
94impl Default for ChunkingConfig {
95    fn default() -> Self {
96        Self {
97            max_chunk_bytes: 64 * 1024 * 1024, // 64 MB
98            min_chunk_pixels: 32,
99            max_overlap_pixels: 64,
100            enable_parallel: true,
101            prefetch_count: 2,
102            use_mmap: true,
103            mmap_threshold: 1024 * 1024 * 1024, // 1 GB
104            temp_dir: None,
105            enable_gpu: true,
106            memory_fraction: 0.5,
107            overlap_strategy: OverlapStrategy::Blend,
108        }
109    }
110}
111
112/// Strategy for handling overlap regions between chunks
113#[derive(Debug, Clone, Copy, PartialEq, Eq)]
114pub enum OverlapStrategy {
115    /// Simply overwrite (last chunk wins)
116    Overwrite,
117
118    /// Linear blending in overlap region
119    Blend,
120
121    /// Use weighted average based on distance from chunk center
122    WeightedAverage,
123
124    /// Use the chunk with higher quality metric
125    QualityBased,
126}
127
128// ============================================================================
129// Chunk Types
130// ============================================================================
131
132/// Represents a rectangular chunk of an image
133#[derive(Debug, Clone)]
134pub struct ChunkRegion {
135    /// Start coordinates in the full image (row, col)
136    pub start: (usize, usize),
137
138    /// End coordinates in the full image (exclusive)
139    pub end: (usize, usize),
140
141    /// Start coordinates including overlap (for processing)
142    pub padded_start: (usize, usize),
143
144    /// End coordinates including overlap (for processing)
145    pub padded_end: (usize, usize),
146
147    /// The chunk index in the grid
148    pub chunk_id: (usize, usize),
149}
150
151impl ChunkRegion {
152    /// Create a new chunk region
153    pub fn new(
154        start: (usize, usize),
155        end: (usize, usize),
156        overlap: usize,
157        image_shape: (usize, usize),
158    ) -> Self {
159        let padded_start = (
160            start.0.saturating_sub(overlap),
161            start.1.saturating_sub(overlap),
162        );
163        let padded_end = (
164            (end.0 + overlap).min(image_shape.0),
165            (end.1 + overlap).min(image_shape.1),
166        );
167
168        Self {
169            start,
170            end,
171            padded_start,
172            padded_end,
173            chunk_id: (0, 0),
174        }
175    }
176
177    /// Get the size of the chunk (excluding padding)
178    pub fn size(&self) -> (usize, usize) {
179        (self.end.0 - self.start.0, self.end.1 - self.start.1)
180    }
181
182    /// Get the size of the padded chunk
183    pub fn padded_size(&self) -> (usize, usize) {
184        (
185            self.padded_end.0 - self.padded_start.0,
186            self.padded_end.1 - self.padded_start.1,
187        )
188    }
189
190    /// Get the overlap on each side
191    pub fn overlap(&self) -> ((usize, usize), (usize, usize)) {
192        (
193            (
194                self.start.0 - self.padded_start.0,
195                self.start.1 - self.padded_start.1,
196            ),
197            (
198                self.padded_end.0 - self.end.0,
199                self.padded_end.1 - self.end.1,
200            ),
201        )
202    }
203}
204
205/// Iterator over chunk regions
206pub struct ChunkRegionIterator {
207    image_shape: (usize, usize),
208    chunk_size: (usize, usize),
209    overlap: usize,
210    current_row: usize,
211    current_col: usize,
212    num_chunks_row: usize,
213    num_chunks_col: usize,
214}
215
216impl ChunkRegionIterator {
217    /// Create a new chunk region iterator
218    pub fn new(image_shape: (usize, usize), chunk_size: (usize, usize), overlap: usize) -> Self {
219        let num_chunks_row = (image_shape.0 + chunk_size.0 - 1) / chunk_size.0;
220        let num_chunks_col = (image_shape.1 + chunk_size.1 - 1) / chunk_size.1;
221
222        Self {
223            image_shape,
224            chunk_size,
225            overlap,
226            current_row: 0,
227            current_col: 0,
228            num_chunks_row,
229            num_chunks_col,
230        }
231    }
232
233    /// Get total number of chunks
234    pub fn total_chunks(&self) -> usize {
235        self.num_chunks_row * self.num_chunks_col
236    }
237}
238
239impl Iterator for ChunkRegionIterator {
240    type Item = ChunkRegion;
241
242    fn next(&mut self) -> Option<Self::Item> {
243        if self.current_row >= self.num_chunks_row {
244            return None;
245        }
246
247        let start_row = self.current_row * self.chunk_size.0;
248        let start_col = self.current_col * self.chunk_size.1;
249        let end_row = ((self.current_row + 1) * self.chunk_size.0).min(self.image_shape.0);
250        let end_col = ((self.current_col + 1) * self.chunk_size.1).min(self.image_shape.1);
251
252        let mut region = ChunkRegion::new(
253            (start_row, start_col),
254            (end_row, end_col),
255            self.overlap,
256            self.image_shape,
257        );
258        region.chunk_id = (self.current_row, self.current_col);
259
260        // Advance to next chunk
261        self.current_col += 1;
262        if self.current_col >= self.num_chunks_col {
263            self.current_col = 0;
264            self.current_row += 1;
265        }
266
267        Some(region)
268    }
269}
270
271// ============================================================================
272// Chunk Data Types
273// ============================================================================
274
275/// Represents chunk data that can be either in-memory or memory-mapped
276pub enum ChunkData<T> {
277    /// In-memory array
278    InMemory(Array2<T>),
279
280    /// Memory-mapped array (stored as path and loaded on demand)
281    MemoryMapped {
282        path: PathBuf,
283        shape: (usize, usize),
284        _phantom: PhantomData<T>,
285    },
286}
287
288impl<T: Float + FromPrimitive + Clone> ChunkData<T> {
289    /// Get the data as an owned array
290    pub fn to_array(&self) -> NdimageResult<Array2<T>> {
291        match self {
292            ChunkData::InMemory(arr) => Ok(arr.clone()),
293            ChunkData::MemoryMapped { path, shape, .. } => load_chunk_from_file(path, *shape),
294        }
295    }
296
297    /// Get the shape of the chunk
298    pub fn shape(&self) -> (usize, usize) {
299        match self {
300            ChunkData::InMemory(arr) => (arr.nrows(), arr.ncols()),
301            ChunkData::MemoryMapped { shape, .. } => *shape,
302        }
303    }
304}
305
306// ============================================================================
307// Chunk Operation Traits
308// ============================================================================
309
310/// Trait for operations that can be applied to chunks
311pub trait ChunkOperation<T>: Send + Sync
312where
313    T: Float + FromPrimitive + Debug + Clone + Send + Sync,
314{
315    /// Apply the operation to a single chunk
316    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>>;
317
318    /// Get the required overlap for this operation
319    fn required_overlap(&self) -> usize;
320
321    /// Get the operation name for logging/debugging
322    fn name(&self) -> &str;
323
324    /// Whether this operation supports GPU acceleration
325    fn supports_gpu(&self) -> bool {
326        false
327    }
328
329    /// Apply the operation on GPU (if supported)
330    #[cfg(feature = "gpu")]
331    fn apply_gpu(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
332        // Default: fall back to CPU
333        self.apply(chunk)
334    }
335}
336
337/// Trait for binary operations that can be applied to chunks
338/// This trait is specifically for bool arrays and doesn't require numeric traits
339pub trait BinaryChunkOperation: Send + Sync {
340    /// Apply the operation to a single chunk
341    fn apply(&self, chunk: &ArrayView2<bool>) -> NdimageResult<Array2<bool>>;
342
343    /// Get the required overlap for this operation
344    fn required_overlap(&self) -> usize;
345
346    /// Get the operation name for logging/debugging
347    fn name(&self) -> &str;
348
349    /// Whether this operation supports GPU acceleration
350    fn supports_gpu(&self) -> bool {
351        false
352    }
353}
354
355/// Trait for operations that can be applied in-place
356pub trait InPlaceChunkOperation<T>: Send + Sync
357where
358    T: Float + FromPrimitive + Debug + Clone + Send + Sync,
359{
360    /// Apply the operation in-place to a chunk
361    fn apply_inplace(&self, chunk: &mut ArrayViewMut2<T>) -> NdimageResult<()>;
362
363    /// Get the required overlap for this operation
364    fn required_overlap(&self) -> usize;
365}
366
367// ============================================================================
368// Chunked Image Processor
369// ============================================================================
370
371/// Main processor for chunked image operations
372pub struct ChunkedImageProcessor {
373    config: ChunkingConfig,
374    stats: ProcessingStats,
375}
376
377/// Statistics about chunk processing
378#[derive(Debug, Default)]
379pub struct ProcessingStats {
380    /// Total chunks processed
381    pub chunks_processed: AtomicUsize,
382
383    /// Total bytes processed
384    pub bytes_processed: AtomicUsize,
385
386    /// Number of GPU-accelerated chunks
387    pub gpu_chunks: AtomicUsize,
388
389    /// Number of CPU-processed chunks
390    pub cpu_chunks: AtomicUsize,
391}
392
393impl ChunkedImageProcessor {
394    /// Create a new chunked image processor
395    pub fn new(config: ChunkingConfig) -> Self {
396        Self {
397            config,
398            stats: ProcessingStats::default(),
399        }
400    }
401
402    /// Create with default configuration
403    pub fn with_defaults() -> Self {
404        Self::new(ChunkingConfig::default())
405    }
406
407    /// Calculate optimal chunk size based on image dimensions and config
408    pub fn calculate_chunk_size(
409        &self,
410        image_shape: (usize, usize),
411        element_size: usize,
412    ) -> (usize, usize) {
413        let target_elements = self.config.max_chunk_bytes / element_size;
414
415        // Start with square chunks
416        let base_size =
417            ((target_elements as f64).sqrt() as usize).max(self.config.min_chunk_pixels);
418
419        // Adjust to fit image dimensions
420        let chunk_rows = base_size.min(image_shape.0);
421        let chunk_cols = base_size.min(image_shape.1);
422
423        (chunk_rows, chunk_cols)
424    }
425
426    /// Calculate required overlap for an operation
427    pub fn calculate_overlap<T, Op>(&self, operation: &Op) -> usize
428    where
429        T: Float + FromPrimitive + Debug + Clone + Send + Sync,
430        Op: ChunkOperation<T> + ?Sized,
431    {
432        operation
433            .required_overlap()
434            .min(self.config.max_overlap_pixels)
435    }
436
437    /// Process an image using a chunk operation
438    pub fn process<T, Op>(&self, input: &ArrayView2<T>, operation: &Op) -> NdimageResult<Array2<T>>
439    where
440        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
441        Op: ChunkOperation<T> + ?Sized,
442    {
443        let image_shape = (input.nrows(), input.ncols());
444        let element_size = std::mem::size_of::<T>();
445        let chunk_size = self.calculate_chunk_size(image_shape, element_size);
446        let overlap = self.calculate_overlap(operation);
447
448        // Create output array
449        let mut output = Array2::zeros(image_shape);
450
451        // Create chunk iterator
452        let chunk_iter = ChunkRegionIterator::new(image_shape, chunk_size, overlap);
453
454        if self.config.enable_parallel && is_parallel_enabled() {
455            self.process_parallel(input, &mut output, operation, chunk_iter)?;
456        } else {
457            self.process_sequential(input, &mut output, operation, chunk_iter)?;
458        }
459
460        Ok(output)
461    }
462
463    /// Process a binary image using a binary chunk operation
464    pub fn process_binary<Op>(
465        &self,
466        input: &ArrayView2<bool>,
467        operation: &Op,
468    ) -> NdimageResult<Array2<bool>>
469    where
470        Op: BinaryChunkOperation,
471    {
472        let image_shape = (input.nrows(), input.ncols());
473        let element_size = std::mem::size_of::<bool>();
474        let chunk_size = self.calculate_chunk_size(image_shape, element_size);
475        let overlap = operation
476            .required_overlap()
477            .min(self.config.max_overlap_pixels);
478
479        // Create output array
480        let mut output = Array2::from_elem(image_shape, false);
481
482        // Create chunk iterator
483        let chunk_iter = ChunkRegionIterator::new(image_shape, chunk_size, overlap);
484
485        if self.config.enable_parallel && is_parallel_enabled() {
486            self.process_binary_parallel(input, &mut output, operation, chunk_iter)?;
487        } else {
488            self.process_binary_sequential(input, &mut output, operation, chunk_iter)?;
489        }
490
491        Ok(output)
492    }
493
494    /// Process chunks sequentially
495    fn process_sequential<T, Op>(
496        &self,
497        input: &ArrayView2<T>,
498        output: &mut Array2<T>,
499        operation: &Op,
500        chunk_iter: ChunkRegionIterator,
501    ) -> NdimageResult<()>
502    where
503        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
504        Op: ChunkOperation<T> + ?Sized,
505    {
506        for region in chunk_iter {
507            // Extract padded chunk from input
508            let chunk = self.extract_chunk(input, &region)?;
509
510            // Apply operation
511            let result = operation.apply(&chunk.view())?;
512
513            // Insert result into output (handling overlap)
514            self.insert_chunk_result(output, &result.view(), &region)?;
515
516            // Update stats
517            self.stats.chunks_processed.fetch_add(1, Ordering::Relaxed);
518            self.stats.cpu_chunks.fetch_add(1, Ordering::Relaxed);
519            self.stats.bytes_processed.fetch_add(
520                region.size().0 * region.size().1 * std::mem::size_of::<T>(),
521                Ordering::Relaxed,
522            );
523        }
524
525        Ok(())
526    }
527
528    /// Process chunks in parallel
529    fn process_parallel<T, Op>(
530        &self,
531        input: &ArrayView2<T>,
532        output: &mut Array2<T>,
533        operation: &Op,
534        chunk_iter: ChunkRegionIterator,
535    ) -> NdimageResult<()>
536    where
537        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
538        Op: ChunkOperation<T> + ?Sized,
539    {
540        let regions: Vec<_> = chunk_iter.collect();
541
542        // Process chunks in parallel and collect results
543        let results: Vec<NdimageResult<(ChunkRegion, Array2<T>)>> = regions
544            .into_par_iter()
545            .map(|region| {
546                let chunk = self.extract_chunk(input, &region)?;
547                let result = operation.apply(&chunk.view())?;
548
549                // Update stats
550                self.stats.chunks_processed.fetch_add(1, Ordering::Relaxed);
551                self.stats.cpu_chunks.fetch_add(1, Ordering::Relaxed);
552                self.stats.bytes_processed.fetch_add(
553                    region.size().0 * region.size().1 * std::mem::size_of::<T>(),
554                    Ordering::Relaxed,
555                );
556
557                Ok((region, result))
558            })
559            .collect();
560
561        // Insert results into output
562        for result in results {
563            let (region, chunk_result) = result?;
564            self.insert_chunk_result(output, &chunk_result.view(), &region)?;
565        }
566
567        Ok(())
568    }
569
570    /// Process binary chunks sequentially
571    fn process_binary_sequential<Op>(
572        &self,
573        input: &ArrayView2<bool>,
574        output: &mut Array2<bool>,
575        operation: &Op,
576        chunk_iter: ChunkRegionIterator,
577    ) -> NdimageResult<()>
578    where
579        Op: BinaryChunkOperation,
580    {
581        for region in chunk_iter {
582            // Extract padded chunk from input
583            let chunk = self.extract_binary_chunk(input, &region)?;
584
585            // Apply operation
586            let result = operation.apply(&chunk.view())?;
587
588            // Insert result into output (handling overlap)
589            self.insert_binary_chunk_result(output, &result.view(), &region)?;
590
591            // Update stats
592            self.stats.chunks_processed.fetch_add(1, Ordering::Relaxed);
593            self.stats.cpu_chunks.fetch_add(1, Ordering::Relaxed);
594            self.stats.bytes_processed.fetch_add(
595                region.size().0 * region.size().1 * std::mem::size_of::<bool>(),
596                Ordering::Relaxed,
597            );
598        }
599
600        Ok(())
601    }
602
603    /// Process binary chunks in parallel
604    fn process_binary_parallel<Op>(
605        &self,
606        input: &ArrayView2<bool>,
607        output: &mut Array2<bool>,
608        operation: &Op,
609        chunk_iter: ChunkRegionIterator,
610    ) -> NdimageResult<()>
611    where
612        Op: BinaryChunkOperation,
613    {
614        let regions: Vec<_> = chunk_iter.collect();
615
616        // Process chunks in parallel and collect results
617        let results: Vec<NdimageResult<(ChunkRegion, Array2<bool>)>> = regions
618            .into_par_iter()
619            .map(|region| {
620                let chunk = self.extract_binary_chunk(input, &region)?;
621                let result = operation.apply(&chunk.view())?;
622
623                // Update stats
624                self.stats.chunks_processed.fetch_add(1, Ordering::Relaxed);
625                self.stats.cpu_chunks.fetch_add(1, Ordering::Relaxed);
626                self.stats.bytes_processed.fetch_add(
627                    region.size().0 * region.size().1 * std::mem::size_of::<bool>(),
628                    Ordering::Relaxed,
629                );
630
631                Ok((region, result))
632            })
633            .collect();
634
635        // Insert results into output
636        for result in results {
637            let (region, chunk_result) = result?;
638            self.insert_binary_chunk_result(output, &chunk_result.view(), &region)?;
639        }
640
641        Ok(())
642    }
643
644    /// Extract a chunk from the input array
645    fn extract_chunk<T>(
646        &self,
647        input: &ArrayView2<T>,
648        region: &ChunkRegion,
649    ) -> NdimageResult<Array2<T>>
650    where
651        T: Float + Clone,
652    {
653        let rows = region.padded_start.0..region.padded_end.0;
654        let cols = region.padded_start.1..region.padded_end.1;
655
656        Ok(input.slice(ndarray::s![rows, cols]).to_owned())
657    }
658
659    /// Insert a processed chunk result into the output array
660    fn insert_chunk_result<T>(
661        &self,
662        output: &mut Array2<T>,
663        chunk_result: &ArrayView2<T>,
664        region: &ChunkRegion,
665    ) -> NdimageResult<()>
666    where
667        T: Float + FromPrimitive + Clone,
668    {
669        let overlap = region.overlap();
670
671        // Calculate the region within the chunk result that corresponds to the output region
672        let result_start_row = overlap.0 .0;
673        let result_start_col = overlap.0 .1;
674        let result_end_row = chunk_result.nrows() - overlap.1 .0;
675        let result_end_col = chunk_result.ncols() - overlap.1 .1;
676
677        // Validate dimensions
678        let expected_rows = region.end.0 - region.start.0;
679        let expected_cols = region.end.1 - region.start.1;
680        let actual_rows = result_end_row - result_start_row;
681        let actual_cols = result_end_col - result_start_col;
682
683        if actual_rows != expected_rows || actual_cols != expected_cols {
684            return Err(NdimageError::DimensionError(format!(
685                "Chunk result dimension mismatch: expected ({}, {}), got ({}, {})",
686                expected_rows, expected_cols, actual_rows, actual_cols
687            )));
688        }
689
690        // Extract the relevant portion of the result
691        let result_slice = chunk_result.slice(ndarray::s![
692            result_start_row..result_end_row,
693            result_start_col..result_end_col
694        ]);
695
696        // Get the output region
697        let mut output_slice = output.slice_mut(ndarray::s![
698            region.start.0..region.end.0,
699            region.start.1..region.end.1
700        ]);
701
702        match self.config.overlap_strategy {
703            OverlapStrategy::Overwrite => {
704                output_slice.assign(&result_slice);
705            }
706            OverlapStrategy::Blend => {
707                self.blend_overlap(&mut output_slice, &result_slice, region)?;
708            }
709            OverlapStrategy::WeightedAverage => {
710                self.weighted_average_overlap(&mut output_slice, &result_slice, region)?;
711            }
712            OverlapStrategy::QualityBased => {
713                // For now, default to simple overwrite
714                output_slice.assign(&result_slice);
715            }
716        }
717
718        Ok(())
719    }
720
721    /// Extract a binary chunk from the input array
722    fn extract_binary_chunk(
723        &self,
724        input: &ArrayView2<bool>,
725        region: &ChunkRegion,
726    ) -> NdimageResult<Array2<bool>> {
727        let rows = region.padded_start.0..region.padded_end.0;
728        let cols = region.padded_start.1..region.padded_end.1;
729
730        Ok(input.slice(ndarray::s![rows, cols]).to_owned())
731    }
732
733    /// Insert a processed binary chunk result into the output array
734    fn insert_binary_chunk_result(
735        &self,
736        output: &mut Array2<bool>,
737        chunk_result: &ArrayView2<bool>,
738        region: &ChunkRegion,
739    ) -> NdimageResult<()> {
740        let overlap = region.overlap();
741
742        // Calculate the region within the chunk result that corresponds to the output region
743        let result_start_row = overlap.0 .0;
744        let result_start_col = overlap.0 .1;
745        let result_end_row = chunk_result.nrows() - overlap.1 .0;
746        let result_end_col = chunk_result.ncols() - overlap.1 .1;
747
748        // Validate dimensions
749        let expected_rows = region.end.0 - region.start.0;
750        let expected_cols = region.end.1 - region.start.1;
751        let actual_rows = result_end_row - result_start_row;
752        let actual_cols = result_end_col - result_start_col;
753
754        if actual_rows != expected_rows || actual_cols != expected_cols {
755            return Err(NdimageError::DimensionError(format!(
756                "Binary chunk result dimension mismatch: expected ({}, {}), got ({}, {})",
757                expected_rows, expected_cols, actual_rows, actual_cols
758            )));
759        }
760
761        // Extract the relevant portion of the result
762        let result_slice = chunk_result.slice(ndarray::s![
763            result_start_row..result_end_row,
764            result_start_col..result_end_col
765        ]);
766
767        // Get the output region
768        let mut output_slice = output.slice_mut(ndarray::s![
769            region.start.0..region.end.0,
770            region.start.1..region.end.1
771        ]);
772
773        // For binary data, always use simple overwrite
774        output_slice.assign(&result_slice);
775
776        Ok(())
777    }
778
779    /// Blend overlapping regions using linear interpolation
780    fn blend_overlap<T>(
781        &self,
782        output: &mut ArrayViewMut2<T>,
783        result: &ArrayView2<T>,
784        region: &ChunkRegion,
785    ) -> NdimageResult<()>
786    where
787        T: Float + FromPrimitive + Clone,
788    {
789        // For the first chunk in each direction, just assign directly
790        if region.chunk_id.0 == 0 && region.chunk_id.1 == 0 {
791            output.assign(result);
792            return Ok(());
793        }
794
795        // Simple blend: average with existing values in overlap regions
796        for (i, row) in output.rows_mut().into_iter().enumerate() {
797            for (j, val) in row.into_iter().enumerate() {
798                let new_val = result[[i, j]];
799                if *val == T::zero() {
800                    *val = new_val;
801                } else {
802                    // Simple average
803                    *val = (*val + new_val)
804                        / T::from_f64(2.0).ok_or_else(|| {
805                            NdimageError::ComputationError("Failed to convert 2.0 to float".into())
806                        })?;
807                }
808            }
809        }
810
811        Ok(())
812    }
813
814    /// Use weighted average for overlap regions
815    fn weighted_average_overlap<T>(
816        &self,
817        output: &mut ArrayViewMut2<T>,
818        result: &ArrayView2<T>,
819        region: &ChunkRegion,
820    ) -> NdimageResult<()>
821    where
822        T: Float + FromPrimitive + Clone,
823    {
824        let (rows, cols) = (output.nrows(), output.ncols());
825
826        for i in 0..rows {
827            for j in 0..cols {
828                let new_val = result[[i, j]];
829
830                if output[[i, j]] == T::zero() {
831                    output[[i, j]] = new_val;
832                } else {
833                    // Weight based on distance from chunk edge
834                    let center_row = rows as f64 / 2.0;
835                    let center_col = cols as f64 / 2.0;
836                    let dist =
837                        ((i as f64 - center_row).powi(2) + (j as f64 - center_col).powi(2)).sqrt();
838                    let max_dist = (center_row.powi(2) + center_col.powi(2)).sqrt();
839                    let weight = T::from_f64(1.0 - dist / max_dist).ok_or_else(|| {
840                        NdimageError::ComputationError("Failed to convert weight".into())
841                    })?;
842
843                    output[[i, j]] = output[[i, j]] * (T::one() - weight) + new_val * weight;
844                }
845            }
846        }
847
848        Ok(())
849    }
850
851    /// Get processing statistics
852    pub fn stats(&self) -> &ProcessingStats {
853        &self.stats
854    }
855
856    /// Reset processing statistics
857    pub fn reset_stats(&self) {
858        self.stats.chunks_processed.store(0, Ordering::Relaxed);
859        self.stats.bytes_processed.store(0, Ordering::Relaxed);
860        self.stats.gpu_chunks.store(0, Ordering::Relaxed);
861        self.stats.cpu_chunks.store(0, Ordering::Relaxed);
862    }
863
864    // ========================================================================
865    // Convenience methods for common operations
866    // ========================================================================
867
868    /// Apply Gaussian filter in chunks
869    pub fn gaussian_filter<T>(
870        &self,
871        input: &ArrayView2<T>,
872        sigma: f64,
873        border_mode: Option<BorderMode>,
874    ) -> NdimageResult<Array2<T>>
875    where
876        T: Float
877            + FromPrimitive
878            + Debug
879            + Clone
880            + Send
881            + Sync
882            + Zero
883            + 'static
884            + std::ops::AddAssign
885            + std::ops::DivAssign,
886    {
887        let operation =
888            GaussianFilterOperation::new(sigma, border_mode.unwrap_or(BorderMode::Reflect));
889        self.process(input, &operation)
890    }
891
892    /// Apply uniform (box) filter in chunks
893    pub fn uniform_filter<T>(
894        &self,
895        input: &ArrayView2<T>,
896        size: usize,
897        border_mode: Option<BorderMode>,
898    ) -> NdimageResult<Array2<T>>
899    where
900        T: Float
901            + FromPrimitive
902            + Debug
903            + Clone
904            + Send
905            + Sync
906            + Zero
907            + 'static
908            + std::ops::AddAssign
909            + std::ops::DivAssign,
910    {
911        let operation =
912            UniformFilterOperation::new(size, border_mode.unwrap_or(BorderMode::Reflect));
913        self.process(input, &operation)
914    }
915
916    /// Apply median filter in chunks
917    pub fn median_filter<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
918    where
919        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
920    {
921        let operation = MedianFilterOperation::new(size);
922        self.process(input, &operation)
923    }
924
925    /// Apply morphological erosion in chunks
926    pub fn grey_erosion<T>(
927        &self,
928        input: &ArrayView2<T>,
929        size: usize,
930        border_mode: Option<MorphBorderMode>,
931    ) -> NdimageResult<Array2<T>>
932    where
933        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
934    {
935        let operation =
936            MorphErosionOperation::new(size, border_mode.unwrap_or(MorphBorderMode::Constant));
937        self.process(input, &operation)
938    }
939
940    /// Apply morphological dilation in chunks
941    pub fn grey_dilation<T>(
942        &self,
943        input: &ArrayView2<T>,
944        size: usize,
945        border_mode: Option<MorphBorderMode>,
946    ) -> NdimageResult<Array2<T>>
947    where
948        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
949    {
950        let operation =
951            MorphDilationOperation::new(size, border_mode.unwrap_or(MorphBorderMode::Constant));
952        self.process(input, &operation)
953    }
954}
955
956// ============================================================================
957// Built-in Chunk Operations
958// ============================================================================
959
960/// Gaussian filter operation for chunk processing
961pub struct GaussianFilterOperation {
962    sigma: f64,
963    border_mode: BorderMode,
964    kernel_radius: usize,
965}
966
967impl GaussianFilterOperation {
968    /// Create a new Gaussian filter operation
969    pub fn new(sigma: f64, border_mode: BorderMode) -> Self {
970        let kernel_radius = (sigma * 4.0).ceil() as usize;
971        Self {
972            sigma,
973            border_mode,
974            kernel_radius,
975        }
976    }
977}
978
979impl<T> ChunkOperation<T> for GaussianFilterOperation
980where
981    T: Float
982        + FromPrimitive
983        + Debug
984        + Clone
985        + Send
986        + Sync
987        + Zero
988        + 'static
989        + std::ops::AddAssign
990        + std::ops::DivAssign,
991{
992    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
993        // Convert to f64 for processing
994        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
995
996        let result =
997            crate::filters::gaussian_filter(&chunk_f64, self.sigma, Some(self.border_mode), None)?;
998
999        // Convert back to T
1000        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
1001    }
1002
1003    fn required_overlap(&self) -> usize {
1004        self.kernel_radius
1005    }
1006
1007    fn name(&self) -> &str {
1008        "gaussian_filter"
1009    }
1010
1011    fn supports_gpu(&self) -> bool {
1012        cfg!(feature = "gpu")
1013    }
1014}
1015
1016/// Uniform (box) filter operation for chunk processing
1017pub struct UniformFilterOperation {
1018    size: usize,
1019    border_mode: BorderMode,
1020}
1021
1022impl UniformFilterOperation {
1023    /// Create a new uniform filter operation
1024    pub fn new(size: usize, border_mode: BorderMode) -> Self {
1025        Self { size, border_mode }
1026    }
1027}
1028
1029impl<T> ChunkOperation<T> for UniformFilterOperation
1030where
1031    T: Float
1032        + FromPrimitive
1033        + Debug
1034        + Clone
1035        + Send
1036        + Sync
1037        + Zero
1038        + 'static
1039        + std::ops::AddAssign
1040        + std::ops::DivAssign,
1041{
1042    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
1043        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
1044
1045        let result = crate::filters::uniform_filter(
1046            &chunk_f64,
1047            &[self.size, self.size],
1048            Some(self.border_mode),
1049            None,
1050        )?;
1051
1052        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
1053    }
1054
1055    fn required_overlap(&self) -> usize {
1056        self.size / 2 + 1
1057    }
1058
1059    fn name(&self) -> &str {
1060        "uniform_filter"
1061    }
1062}
1063
1064/// Median filter operation for chunk processing
1065pub struct MedianFilterOperation {
1066    size: usize,
1067}
1068
1069impl MedianFilterOperation {
1070    /// Create a new median filter operation
1071    pub fn new(size: usize) -> Self {
1072        Self { size }
1073    }
1074}
1075
1076impl<T> ChunkOperation<T> for MedianFilterOperation
1077where
1078    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
1079{
1080    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
1081        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
1082
1083        let result = crate::filters::median_filter(
1084            &chunk_f64,
1085            &[self.size, self.size],
1086            Some(BorderMode::Reflect),
1087        )?;
1088
1089        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
1090    }
1091
1092    fn required_overlap(&self) -> usize {
1093        self.size / 2 + 1
1094    }
1095
1096    fn name(&self) -> &str {
1097        "median_filter"
1098    }
1099}
1100
1101/// Morphological erosion operation for chunk processing
1102pub struct MorphErosionOperation {
1103    size: usize,
1104    border_mode: MorphBorderMode,
1105}
1106
1107impl MorphErosionOperation {
1108    /// Create a new morphological erosion operation
1109    pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
1110        Self { size, border_mode }
1111    }
1112}
1113
1114impl<T> ChunkOperation<T> for MorphErosionOperation
1115where
1116    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
1117{
1118    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
1119        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
1120        let chunk_owned = chunk_f64.to_owned();
1121
1122        let result = crate::morphology::grey_erosion(
1123            &chunk_owned,
1124            Some(&[self.size, self.size]),
1125            None,
1126            Some(self.border_mode),
1127            None,
1128            None,
1129        )?;
1130
1131        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
1132    }
1133
1134    fn required_overlap(&self) -> usize {
1135        self.size / 2 + 1
1136    }
1137
1138    fn name(&self) -> &str {
1139        "grey_erosion"
1140    }
1141}
1142
1143/// Morphological dilation operation for chunk processing
1144pub struct MorphDilationOperation {
1145    size: usize,
1146    border_mode: MorphBorderMode,
1147}
1148
1149impl MorphDilationOperation {
1150    /// Create a new morphological dilation operation
1151    pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
1152        Self { size, border_mode }
1153    }
1154}
1155
1156impl<T> ChunkOperation<T> for MorphDilationOperation
1157where
1158    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
1159{
1160    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
1161        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
1162        let chunk_owned = chunk_f64.to_owned();
1163
1164        let result = crate::morphology::grey_dilation(
1165            &chunk_owned,
1166            Some(&[self.size, self.size]),
1167            None,
1168            Some(self.border_mode),
1169            None,
1170            None,
1171        )?;
1172
1173        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
1174    }
1175
1176    fn required_overlap(&self) -> usize {
1177        self.size / 2 + 1
1178    }
1179
1180    fn name(&self) -> &str {
1181        "grey_dilation"
1182    }
1183}
1184
1185// ============================================================================
1186// Out-of-Core Processing Pipeline
1187// ============================================================================
1188
1189/// Pipeline for processing images that don't fit in memory
1190pub struct OutOfCorePipeline<T> {
1191    config: ChunkingConfig,
1192    operations: Vec<Box<dyn ChunkOperation<T>>>,
1193    _phantom: PhantomData<T>,
1194}
1195
1196impl<T> OutOfCorePipeline<T>
1197where
1198    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
1199{
1200    /// Create a new out-of-core processing pipeline
1201    pub fn new(config: ChunkingConfig) -> Self {
1202        Self {
1203            config,
1204            operations: Vec::new(),
1205            _phantom: PhantomData,
1206        }
1207    }
1208
1209    /// Add an operation to the pipeline
1210    pub fn add_operation(mut self, operation: Box<dyn ChunkOperation<T>>) -> Self {
1211        self.operations.push(operation);
1212        self
1213    }
1214
1215    /// Get the maximum required overlap across all operations
1216    fn max_overlap(&self) -> usize {
1217        self.operations
1218            .iter()
1219            .map(|op| op.required_overlap())
1220            .max()
1221            .unwrap_or(0)
1222            .min(self.config.max_overlap_pixels)
1223    }
1224
1225    /// Process an image stored as a file
1226    pub fn process_file(
1227        &self,
1228        input_path: &Path,
1229        output_path: &Path,
1230        image_shape: (usize, usize),
1231    ) -> NdimageResult<()> {
1232        let element_size = std::mem::size_of::<T>();
1233        let processor = ChunkedImageProcessor::new(self.config.clone());
1234        let chunk_size = processor.calculate_chunk_size(image_shape, element_size);
1235        let overlap = self.max_overlap();
1236
1237        // Create chunk iterator
1238        let chunk_iter = ChunkRegionIterator::new(image_shape, chunk_size, overlap);
1239
1240        // Process each chunk through the pipeline
1241        for region in chunk_iter {
1242            // Load chunk from input file
1243            let chunk = load_chunk_from_raw_file::<T>(input_path, image_shape, &region)?;
1244
1245            // Apply all operations in sequence
1246            let mut result = chunk;
1247            for operation in &self.operations {
1248                result = operation.apply(&result.view())?;
1249            }
1250
1251            // Write result chunk to output file
1252            write_chunk_to_raw_file(&result.view(), output_path, image_shape, &region)?;
1253        }
1254
1255        Ok(())
1256    }
1257
1258    /// Process an in-memory image (for smaller images or testing)
1259    pub fn process_memory(&self, input: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
1260        let mut result = input.to_owned();
1261
1262        for operation in &self.operations {
1263            let processor = ChunkedImageProcessor::new(self.config.clone());
1264            result = processor.process(&result.view(), &**operation)?;
1265        }
1266
1267        Ok(result)
1268    }
1269}
1270
1271// ============================================================================
1272// Zero-Copy Transformations
1273// ============================================================================
1274
1275/// A view into a chunk that can be transformed without copying
1276pub struct ZeroCopyChunk<'a, T> {
1277    data: ArrayView2<'a, T>,
1278    region: ChunkRegion,
1279}
1280
1281impl<'a, T: Float + Clone> ZeroCopyChunk<'a, T> {
1282    /// Create a new zero-copy chunk view
1283    pub fn new(data: ArrayView2<'a, T>, region: ChunkRegion) -> Self {
1284        Self { data, region }
1285    }
1286
1287    /// Get the data view
1288    pub fn data(&self) -> &ArrayView2<'a, T> {
1289        &self.data
1290    }
1291
1292    /// Get the region
1293    pub fn region(&self) -> &ChunkRegion {
1294        &self.region
1295    }
1296
1297    /// Apply a point-wise transformation without copying
1298    pub fn map<F>(&self, f: F) -> Array2<T>
1299    where
1300        F: Fn(T) -> T,
1301    {
1302        self.data.mapv(|x| f(x))
1303    }
1304
1305    /// Get the non-overlapping portion of the chunk
1306    pub fn core_view(&self) -> ArrayView2<T> {
1307        let overlap = self.region.overlap();
1308        let rows = overlap.0 .0..(self.data.nrows() - overlap.1 .0);
1309        let cols = overlap.0 .1..(self.data.ncols() - overlap.1 .1);
1310        self.data.slice(ndarray::s![rows, cols])
1311    }
1312}
1313
1314/// Iterator that yields zero-copy chunks
1315pub struct ZeroCopyChunkIterator<'a, T> {
1316    input: &'a ArrayView2<'a, T>,
1317    region_iter: ChunkRegionIterator,
1318}
1319
1320impl<'a, T: Float + Clone> ZeroCopyChunkIterator<'a, T> {
1321    /// Create a new zero-copy chunk iterator
1322    pub fn new(input: &'a ArrayView2<'a, T>, chunk_size: (usize, usize), overlap: usize) -> Self {
1323        let image_shape = (input.nrows(), input.ncols());
1324        let region_iter = ChunkRegionIterator::new(image_shape, chunk_size, overlap);
1325
1326        Self { input, region_iter }
1327    }
1328}
1329
1330impl<'a, T: Float + Clone> Iterator for ZeroCopyChunkIterator<'a, T> {
1331    type Item = ZeroCopyChunk<'a, T>;
1332
1333    fn next(&mut self) -> Option<Self::Item> {
1334        self.region_iter.next().map(|region| {
1335            let rows = region.padded_start.0..region.padded_end.0;
1336            let cols = region.padded_start.1..region.padded_end.1;
1337            let data = self.input.slice(ndarray::s![rows, cols]);
1338            ZeroCopyChunk::new(data, region)
1339        })
1340    }
1341}
1342
1343// ============================================================================
1344// File I/O Helpers
1345// ============================================================================
1346
1347/// Load a chunk from a raw binary file
1348fn load_chunk_from_raw_file<T: Float + FromPrimitive + Clone>(
1349    path: &Path,
1350    image_shape: (usize, usize),
1351    region: &ChunkRegion,
1352) -> NdimageResult<Array2<T>> {
1353    let element_size = std::mem::size_of::<T>();
1354    let mut file = BufReader::new(File::open(path).map_err(NdimageError::IoError)?);
1355
1356    let chunk_rows = region.padded_end.0 - region.padded_start.0;
1357    let chunk_cols = region.padded_end.1 - region.padded_start.1;
1358    let mut data = Vec::with_capacity(chunk_rows * chunk_cols);
1359
1360    for row in region.padded_start.0..region.padded_end.0 {
1361        // Seek to the start of this row in the chunk
1362        let offset = (row * image_shape.1 + region.padded_start.1) * element_size;
1363        file.seek(SeekFrom::Start(offset as u64))
1364            .map_err(NdimageError::IoError)?;
1365
1366        // Read the row
1367        let mut row_buffer = vec![0u8; chunk_cols * element_size];
1368        file.read_exact(&mut row_buffer)
1369            .map_err(NdimageError::IoError)?;
1370
1371        // Convert bytes to T values
1372        for i in 0..chunk_cols {
1373            let bytes = &row_buffer[i * element_size..(i + 1) * element_size];
1374            let value = if element_size == 8 {
1375                let arr: [u8; 8] = bytes.try_into().map_err(|_| {
1376                    NdimageError::ComputationError("Failed to convert bytes to f64".into())
1377                })?;
1378                T::from_f64(f64::from_le_bytes(arr)).ok_or_else(|| {
1379                    NdimageError::ComputationError("Failed to convert f64 to T".into())
1380                })?
1381            } else {
1382                let arr: [u8; 4] = bytes.try_into().map_err(|_| {
1383                    NdimageError::ComputationError("Failed to convert bytes to f32".into())
1384                })?;
1385                T::from_f32(f32::from_le_bytes(arr)).ok_or_else(|| {
1386                    NdimageError::ComputationError("Failed to convert f32 to T".into())
1387                })?
1388            };
1389            data.push(value);
1390        }
1391    }
1392
1393    Array2::from_shape_vec((chunk_rows, chunk_cols), data).map_err(|e| NdimageError::ShapeError(e))
1394}
1395
1396/// Write a chunk to a raw binary file
1397fn write_chunk_to_raw_file<T: Float + Clone>(
1398    chunk: &ArrayView2<T>,
1399    path: &Path,
1400    image_shape: (usize, usize),
1401    region: &ChunkRegion,
1402) -> NdimageResult<()> {
1403    let element_size = std::mem::size_of::<T>();
1404    let file = std::fs::OpenOptions::new()
1405        .write(true)
1406        .create(true)
1407        .truncate(true)
1408        .open(path)
1409        .map_err(NdimageError::IoError)?;
1410    let mut writer = BufWriter::new(file);
1411
1412    let overlap = region.overlap();
1413    let core_start_row = overlap.0 .0;
1414    let core_start_col = overlap.0 .1;
1415    let core_end_row = chunk.nrows() - overlap.1 .0;
1416    let core_end_col = chunk.ncols() - overlap.1 .1;
1417
1418    for (chunk_row, output_row) in (core_start_row..core_end_row).zip(region.start.0..region.end.0)
1419    {
1420        // Seek to the start of this row in the output
1421        let offset = (output_row * image_shape.1 + region.start.1) * element_size;
1422        writer
1423            .seek(SeekFrom::Start(offset as u64))
1424            .map_err(NdimageError::IoError)?;
1425
1426        // Write the row
1427        for chunk_col in core_start_col..core_end_col {
1428            let value = chunk[[chunk_row, chunk_col]];
1429            let bytes = if element_size == 8 {
1430                value.to_f64().unwrap_or(0.0).to_le_bytes().to_vec()
1431            } else {
1432                value.to_f32().unwrap_or(0.0).to_le_bytes().to_vec()
1433            };
1434            writer.write_all(&bytes).map_err(NdimageError::IoError)?;
1435        }
1436    }
1437
1438    writer.flush().map_err(NdimageError::IoError)?;
1439    Ok(())
1440}
1441
1442/// Load a chunk from a binary file (for ChunkData::MemoryMapped)
1443fn load_chunk_from_file<T: Float + FromPrimitive + Clone>(
1444    path: &Path,
1445    shape: (usize, usize),
1446) -> NdimageResult<Array2<T>> {
1447    let element_size = std::mem::size_of::<T>();
1448    let mut file = BufReader::new(File::open(path).map_err(NdimageError::IoError)?);
1449
1450    let total_elements = shape.0 * shape.1;
1451    let mut data = Vec::with_capacity(total_elements);
1452
1453    let mut buffer = vec![0u8; total_elements * element_size];
1454    file.read_exact(&mut buffer)
1455        .map_err(NdimageError::IoError)?;
1456
1457    for i in 0..total_elements {
1458        let bytes = &buffer[i * element_size..(i + 1) * element_size];
1459        let value = if element_size == 8 {
1460            let arr: [u8; 8] = bytes
1461                .try_into()
1462                .map_err(|_| NdimageError::ComputationError("Failed to convert bytes".into()))?;
1463            T::from_f64(f64::from_le_bytes(arr))
1464                .ok_or_else(|| NdimageError::ComputationError("Failed to convert to T".into()))?
1465        } else {
1466            let arr: [u8; 4] = bytes
1467                .try_into()
1468                .map_err(|_| NdimageError::ComputationError("Failed to convert bytes".into()))?;
1469            T::from_f32(f32::from_le_bytes(arr))
1470                .ok_or_else(|| NdimageError::ComputationError("Failed to convert to T".into()))?
1471        };
1472        data.push(value);
1473    }
1474
1475    Array2::from_shape_vec(shape, data).map_err(|e| NdimageError::ShapeError(e))
1476}
1477
1478// ============================================================================
1479// GPU-Accelerated Chunk Processing
1480// ============================================================================
1481
1482#[cfg(feature = "gpu")]
1483pub mod gpu {
1484    use super::*;
1485    use crate::backend::{Backend, DeviceManager};
1486
1487    /// GPU-accelerated chunk processor
1488    pub struct GpuChunkProcessor {
1489        config: ChunkingConfig,
1490        device_manager: DeviceManager,
1491    }
1492
1493    impl GpuChunkProcessor {
1494        /// Create a new GPU chunk processor
1495        pub fn new(config: ChunkingConfig) -> NdimageResult<Self> {
1496            let device_manager = DeviceManager::new()?;
1497            Ok(Self {
1498                config,
1499                device_manager,
1500            })
1501        }
1502
1503        /// Check if GPU is available and beneficial for the given size
1504        pub fn should_use_gpu(&self, array_size: usize) -> bool {
1505            let capabilities = self.device_manager.get_capabilities();
1506            capabilities.gpu_available && array_size >= 1024 * 1024
1507        }
1508
1509        /// Process a chunk on GPU
1510        pub fn process_chunk<T, Op>(
1511            &self,
1512            chunk: &ArrayView2<T>,
1513            operation: &Op,
1514        ) -> NdimageResult<Array2<T>>
1515        where
1516            T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
1517            Op: ChunkOperation<T> + ?Sized,
1518        {
1519            if self.should_use_gpu(chunk.len()) && operation.supports_gpu() {
1520                operation.apply_gpu(chunk)
1521            } else {
1522                operation.apply(chunk)
1523            }
1524        }
1525    }
1526}
1527
1528// ============================================================================
1529// Tests
1530// ============================================================================
1531
1532#[cfg(test)]
1533mod tests {
1534    use super::*;
1535    use scirs2_core::ndarray::Array2;
1536
1537    #[test]
1538    fn test_chunk_region_iterator() {
1539        let image_shape = (100, 100);
1540        let chunk_size = (30, 30);
1541        let overlap = 5;
1542
1543        let iter = ChunkRegionIterator::new(image_shape, chunk_size, overlap);
1544        let regions: Vec<_> = iter.collect();
1545
1546        // Should have 4x4 = 16 chunks
1547        assert_eq!(regions.len(), 16);
1548
1549        // Check first chunk
1550        assert_eq!(regions[0].start, (0, 0));
1551        assert_eq!(regions[0].padded_start, (0, 0));
1552        assert!(regions[0].end.0 <= 30);
1553        assert!(regions[0].end.1 <= 30);
1554
1555        // Check last chunk covers the end
1556        let last = regions.last().expect("Should have regions");
1557        assert_eq!(last.end, image_shape);
1558    }
1559
1560    #[test]
1561    fn test_chunked_processor_small_image() {
1562        let processor = ChunkedImageProcessor::new(ChunkingConfig {
1563            max_chunk_bytes: 1024, // Small chunks
1564            enable_parallel: false,
1565            ..Default::default()
1566        });
1567
1568        let input = Array2::<f64>::ones((50, 50));
1569        let operation = GaussianFilterOperation::new(1.0, BorderMode::Reflect);
1570
1571        let result = processor.process(&input.view(), &operation);
1572        assert!(result.is_ok());
1573
1574        let output = result.expect("Should succeed");
1575        assert_eq!(output.shape(), input.shape());
1576    }
1577
1578    #[test]
1579    fn test_gaussian_filter_chunked() {
1580        let processor = ChunkedImageProcessor::new(ChunkingConfig {
1581            max_chunk_bytes: 8192, // Force chunking
1582            enable_parallel: false,
1583            ..Default::default()
1584        });
1585
1586        let input = Array2::<f64>::ones((100, 100));
1587        let result = processor.gaussian_filter(&input.view(), 1.0, None);
1588
1589        assert!(result.is_ok());
1590        let output = result.expect("Should succeed");
1591
1592        // Interior values should be approximately 1.0
1593        for i in 10..90 {
1594            for j in 10..90 {
1595                assert!((output[[i, j]] - 1.0).abs() < 0.1);
1596            }
1597        }
1598    }
1599
1600    #[test]
1601    fn test_uniform_filter_chunked() {
1602        let processor = ChunkedImageProcessor::new(ChunkingConfig {
1603            max_chunk_bytes: 8192,
1604            enable_parallel: false,
1605            ..Default::default()
1606        });
1607
1608        let input = Array2::<f64>::ones((100, 100));
1609        let result = processor.uniform_filter(&input.view(), 3, None);
1610
1611        assert!(result.is_ok());
1612        let output = result.expect("Should succeed");
1613        assert_eq!(output.shape(), input.shape());
1614    }
1615
1616    #[test]
1617    fn test_morphological_erosion_chunked() {
1618        let processor = ChunkedImageProcessor::new(ChunkingConfig {
1619            max_chunk_bytes: 8192,
1620            enable_parallel: false,
1621            ..Default::default()
1622        });
1623
1624        let input = Array2::<f64>::ones((100, 100));
1625        let result = processor.grey_erosion(&input.view(), 3, None);
1626
1627        assert!(result.is_ok());
1628        let output = result.expect("Should succeed");
1629        assert_eq!(output.shape(), input.shape());
1630    }
1631
1632    #[test]
1633    fn test_out_of_core_pipeline() {
1634        let config = ChunkingConfig {
1635            max_chunk_bytes: 4096,
1636            enable_parallel: false,
1637            ..Default::default()
1638        };
1639
1640        let pipeline = OutOfCorePipeline::<f64>::new(config).add_operation(Box::new(
1641            GaussianFilterOperation::new(1.0, BorderMode::Reflect),
1642        ));
1643
1644        let input = Array2::<f64>::ones((50, 50));
1645        let result = pipeline.process_memory(&input.view());
1646
1647        assert!(result.is_ok());
1648    }
1649
1650    #[test]
1651    fn test_zero_copy_chunk_iterator() {
1652        let input = Array2::<f64>::ones((100, 100));
1653        let chunk_size = (30, 30);
1654        let overlap = 5;
1655
1656        let input_view = input.view();
1657        let iter = ZeroCopyChunkIterator::new(&input_view, chunk_size, overlap);
1658        let chunks: Vec<_> = iter.collect();
1659
1660        assert!(!chunks.is_empty());
1661
1662        // Check that we can access data without copying
1663        for chunk in &chunks {
1664            let core = chunk.core_view();
1665            assert!(!core.is_empty());
1666        }
1667    }
1668
1669    #[test]
1670    fn test_overlap_strategies() {
1671        for strategy in [
1672            OverlapStrategy::Overwrite,
1673            OverlapStrategy::Blend,
1674            OverlapStrategy::WeightedAverage,
1675        ] {
1676            let config = ChunkingConfig {
1677                max_chunk_bytes: 4096,
1678                overlap_strategy: strategy,
1679                enable_parallel: false,
1680                ..Default::default()
1681            };
1682
1683            let processor = ChunkedImageProcessor::new(config);
1684            let input = Array2::<f64>::ones((50, 50));
1685
1686            let result = processor.gaussian_filter(&input.view(), 1.0, None);
1687            assert!(result.is_ok());
1688        }
1689    }
1690
1691    #[test]
1692    fn test_processing_stats() {
1693        let processor = ChunkedImageProcessor::new(ChunkingConfig {
1694            max_chunk_bytes: 4096,
1695            enable_parallel: false,
1696            ..Default::default()
1697        });
1698
1699        let input = Array2::<f64>::ones((100, 100));
1700        let _ = processor.gaussian_filter(&input.view(), 1.0, None);
1701
1702        let stats = processor.stats();
1703        assert!(stats.chunks_processed.load(Ordering::Relaxed) > 0);
1704        assert!(stats.bytes_processed.load(Ordering::Relaxed) > 0);
1705        assert!(stats.cpu_chunks.load(Ordering::Relaxed) > 0);
1706    }
1707}