wsi_streamer/format/
detect.rs

1//! Format detection for Whole Slide Image files.
2//!
3//! This module provides automatic detection of WSI file formats by examining
4//! magic bytes and vendor-specific markers. Currently supports:
5//!
6//! - **Aperio SVS**: TIFF-based format identified by "Aperio" string in ImageDescription
7//! - **Generic Pyramidal TIFF**: Standard tiled TIFF with multiple resolution levels
8//!
9//! Unsupported formats return an error that should map to HTTP 415 Unsupported Media Type.
10
11use crate::error::FormatError;
12use crate::io::RangeReader;
13
14use super::tiff::{ByteOrder, Ifd, TiffHeader, TiffTag, BIGTIFF_HEADER_SIZE, TIFF_HEADER_SIZE};
15
16// =============================================================================
17// SlideFormat
18// =============================================================================
19
/// The kind of whole-slide image a file was recognised as.
///
/// Each variant is one WSI flavour this crate can serve. Which variant a file
/// gets is decided from its magic bytes plus vendor-specific markers.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SlideFormat {
    /// Aperio SVS: a TIFF-based format that uses JPEGTables for abbreviated streams.
    AperioSvs,

    /// Generic pyramidal TIFF: a standard tiled TIFF holding multiple resolutions.
    GenericTiff,
}
32
33impl SlideFormat {
34    /// Get a human-readable name for the format.
35    pub const fn name(&self) -> &'static str {
36        match self {
37            SlideFormat::AperioSvs => "Aperio SVS",
38            SlideFormat::GenericTiff => "Generic Pyramidal TIFF",
39        }
40    }
41}
42
43// =============================================================================
44// Format Detection
45// =============================================================================
46
47/// Minimum bytes needed for initial format detection (TIFF/BigTIFF header).
48const MIN_HEADER_BYTES: usize = BIGTIFF_HEADER_SIZE;
49
50/// Maximum bytes to read from ImageDescription for format detection.
51/// We don't need to read the entire description, just enough to find markers.
52const MAX_DESCRIPTION_BYTES: usize = 1024;
53
54/// Marker string for Aperio SVS format.
55const APERIO_MARKER: &[u8] = b"Aperio";
56
57/// Detect the format of a slide file.
58///
59/// This function reads the file header and examines vendor-specific markers
60/// to determine the slide format.
61///
62/// # Arguments
63/// * `reader` - Range reader for the file
64///
65/// # Returns
66/// * `Ok(SlideFormat)` - The detected format
67/// * `Err(FormatError::UnsupportedFormat)` - File is not a recognized format
68/// * `Err(FormatError::Tiff)` - Error parsing TIFF structure
69///
70/// # Format Detection Logic
71///
72/// 1. Read initial bytes and verify TIFF/BigTIFF magic
73/// 2. Parse the first IFD to access ImageDescription tag
74/// 3. If ImageDescription contains "Aperio", classify as SVS
75/// 4. Otherwise, classify as generic pyramidal TIFF
76pub async fn detect_format<R: RangeReader>(reader: &R) -> Result<SlideFormat, FormatError> {
77    // Check file size
78    if reader.size() < MIN_HEADER_BYTES as u64 {
79        return Err(FormatError::UnsupportedFormat {
80            reason: "File too small to be a valid TIFF".to_string(),
81        });
82    }
83
84    // Read and parse header
85    let header_bytes = reader.read_exact_at(0, MIN_HEADER_BYTES).await?;
86    let header = TiffHeader::parse(&header_bytes, reader.size())?;
87
88    // Read the first IFD to check for format-specific markers
89    let format = detect_format_from_first_ifd(reader, &header).await?;
90
91    Ok(format)
92}
93
94/// Detect format by examining the first IFD.
95///
96/// This reads the first IFD and checks the ImageDescription tag for
97/// vendor-specific markers.
98async fn detect_format_from_first_ifd<R: RangeReader>(
99    reader: &R,
100    header: &TiffHeader,
101) -> Result<SlideFormat, FormatError> {
102    // Read first IFD entry count
103    let count_size = header.ifd_count_size();
104    let count_bytes = reader
105        .read_exact_at(header.first_ifd_offset, count_size)
106        .await?;
107
108    let entry_count = if header.is_bigtiff {
109        header.byte_order.read_u64(&count_bytes)
110    } else {
111        header.byte_order.read_u16(&count_bytes) as u64
112    };
113
114    // Read the full IFD
115    let ifd_size = Ifd::calculate_size(entry_count, header);
116    let ifd_bytes = reader
117        .read_exact_at(header.first_ifd_offset, ifd_size)
118        .await?;
119    let ifd = Ifd::parse(&ifd_bytes, header)?;
120
121    // Check for ImageDescription tag
122    if let Some(description) = read_image_description(reader, &ifd, header).await? {
123        // Check for Aperio marker
124        if contains_aperio_marker(&description) {
125            return Ok(SlideFormat::AperioSvs);
126        }
127    }
128
129    // Default to generic TIFF
130    Ok(SlideFormat::GenericTiff)
131}
132
133/// Read the ImageDescription tag value from an IFD.
134///
135/// Returns None if the tag is not present.
136async fn read_image_description<R: RangeReader>(
137    reader: &R,
138    ifd: &Ifd,
139    header: &TiffHeader,
140) -> Result<Option<Vec<u8>>, FormatError> {
141    let entry = match ifd.get_entry_by_tag(TiffTag::ImageDescription) {
142        Some(e) => e,
143        None => return Ok(None),
144    };
145
146    // Limit how much we read
147    let read_len = (entry.count as usize).min(MAX_DESCRIPTION_BYTES);
148    if read_len == 0 {
149        return Ok(None);
150    }
151
152    // Read the bytes
153    let bytes = if entry.is_inline {
154        // Inline value - extract from entry
155        entry.value_offset_bytes[..read_len.min(entry.value_offset_bytes.len())].to_vec()
156    } else {
157        // Value at offset
158        let offset = entry.value_offset(header.byte_order);
159        reader.read_exact_at(offset, read_len).await?.to_vec()
160    };
161
162    Ok(Some(bytes))
163}
164
165/// Check if bytes contain the Aperio marker.
166fn contains_aperio_marker(data: &[u8]) -> bool {
167    // Simple substring search
168    data.windows(APERIO_MARKER.len())
169        .any(|window| window == APERIO_MARKER)
170}
171
172/// Check if bytes represent a valid TIFF header.
173///
174/// This is a quick check that can be used before attempting full parsing.
175pub fn is_tiff_header(bytes: &[u8]) -> bool {
176    if bytes.len() < TIFF_HEADER_SIZE {
177        return false;
178    }
179
180    // Check magic bytes
181    let magic = u16::from_le_bytes([bytes[0], bytes[1]]);
182    if magic != 0x4949 && magic != 0x4D4D {
183        return false;
184    }
185
186    // Check version
187    let byte_order = if magic == 0x4949 {
188        ByteOrder::LittleEndian
189    } else {
190        ByteOrder::BigEndian
191    };
192
193    let version = byte_order.read_u16(&bytes[2..4]);
194    version == 42 || version == 43
195}
196
197// =============================================================================
198// Tests
199// =============================================================================
200
#[cfg(test)]
mod tests {
    use super::*;

    // -------------------------------------------------------------------------
    // is_tiff_header: accepted headers
    // -------------------------------------------------------------------------

    #[test]
    fn test_is_tiff_header_little_endian_classic() {
        // "II" byte order, version 42 (TIFF), then the 4-byte IFD offset.
        let bytes = [0x49, 0x49, 0x2A, 0x00, 0x08, 0x00, 0x00, 0x00];
        assert!(is_tiff_header(&bytes), "little-endian classic TIFF");
    }

    #[test]
    fn test_is_tiff_header_big_endian_classic() {
        // "MM" byte order, version 42 (TIFF), then the 4-byte IFD offset.
        let bytes = [0x4D, 0x4D, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x08];
        assert!(is_tiff_header(&bytes), "big-endian classic TIFF");
    }

    #[test]
    fn test_is_tiff_header_little_endian_bigtiff() {
        // "II", version 43 (BigTIFF), offset size, reserved, 8-byte IFD offset.
        #[rustfmt::skip]
        let bytes = [
            0x49, 0x49, 0x2B, 0x00, 0x08, 0x00, 0x00, 0x00,
            0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        ];
        assert!(is_tiff_header(&bytes), "little-endian BigTIFF");
    }

    #[test]
    fn test_is_tiff_header_big_endian_bigtiff() {
        // "MM", version 43 (BigTIFF), offset size, reserved, 8-byte IFD offset.
        #[rustfmt::skip]
        let bytes = [
            0x4D, 0x4D, 0x00, 0x2B, 0x00, 0x08, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
        ];
        assert!(is_tiff_header(&bytes), "big-endian BigTIFF");
    }

    // -------------------------------------------------------------------------
    // is_tiff_header: rejected input
    // -------------------------------------------------------------------------

    #[test]
    fn test_is_tiff_header_invalid_magic() {
        // The first two bytes are neither "II" nor "MM".
        let bytes = [0x00, 0x00, 0x2A, 0x00, 0x08, 0x00, 0x00, 0x00];
        assert!(!is_tiff_header(&bytes), "unknown byte-order mark");
    }

    #[test]
    fn test_is_tiff_header_invalid_version() {
        // Valid "II" mark, but version 0 is neither 42 nor 43.
        let bytes = [0x49, 0x49, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00];
        assert!(!is_tiff_header(&bytes), "unknown version");
    }

    #[test]
    fn test_is_tiff_header_too_small() {
        // Only four bytes: magic and version without the rest of the header.
        let bytes = [0x49, 0x49, 0x2A, 0x00];
        assert!(!is_tiff_header(&bytes), "truncated header");
    }

    #[test]
    fn test_is_tiff_header_jpeg() {
        // Leading bytes of a JPEG file.
        let bytes = [0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46];
        assert!(!is_tiff_header(&bytes), "JPEG magic");
    }

    #[test]
    fn test_is_tiff_header_png() {
        // Leading bytes of a PNG file.
        let bytes = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        assert!(!is_tiff_header(&bytes), "PNG magic");
    }

    // -------------------------------------------------------------------------
    // contains_aperio_marker
    // -------------------------------------------------------------------------

    #[test]
    fn test_contains_aperio_marker_present() {
        assert!(contains_aperio_marker(b"Aperio Image Library v12.0.0"));
    }

    #[test]
    fn test_contains_aperio_marker_in_description() {
        assert!(contains_aperio_marker(
            b"Some prefix|Aperio Image Library|Some suffix"
        ));
    }

    #[test]
    fn test_contains_aperio_marker_not_present() {
        assert!(!contains_aperio_marker(b"Generic TIFF image description"));
    }

    #[test]
    fn test_contains_aperio_marker_empty() {
        assert!(!contains_aperio_marker(b""));
    }

    #[test]
    fn test_contains_aperio_marker_partial() {
        // One byte short of the full marker.
        assert!(!contains_aperio_marker(b"Aperi"));
    }

    #[test]
    fn test_contains_aperio_marker_case_sensitive() {
        // The lowercase spelling must not match.
        assert!(!contains_aperio_marker(b"aperio"));
    }

    // -------------------------------------------------------------------------
    // SlideFormat
    // -------------------------------------------------------------------------

    #[test]
    fn test_slide_format_name() {
        let expected = [
            (SlideFormat::AperioSvs, "Aperio SVS"),
            (SlideFormat::GenericTiff, "Generic Pyramidal TIFF"),
        ];
        for (format, name) in &expected {
            assert_eq!(format.name(), *name);
        }
    }
}