omniparse 0.4.1

A Rust toolkit for detecting and extracting metadata, text, and content from various file formats
Documentation
//! PNG image parser

use crate::core::{Error, ExtractionResult, Metadata, MetadataValue, Result};
use crate::parsers::Parser;
use crate::parsers::image::maybe_ocr_content;
use flate2::read::ZlibDecoder;
use image::io::Reader as ImageReader;
use std::io::{Cursor, Read};

/// Parser for PNG images.
///
/// A stateless unit type: it carries no configuration and all work happens in
/// its `Parser` implementation, which handles the `image/png` MIME type.
pub struct PngParser;

impl Parser for PngParser {
    fn supported_types(&self) -> &[&str] {
        &["image/png"]
    }
    
    fn parse(&self, data: &[u8], mime_type: &str) -> Result<ExtractionResult> {
        // Load image to extract basic information
        let img = ImageReader::new(Cursor::new(data))
            .with_guessed_format()
            .map_err(|e| Error::ParseError(format!("Failed to read PNG: {}", e)))?
            .decode()
            .map_err(|e| Error::ParseError(format!("Failed to decode PNG: {}", e)))?;
        
        let mut metadata = Metadata::new();
        
        // Extract dimensions
        let width = img.width();
        let height = img.height();
        metadata.insert("width".to_string(), MetadataValue::Number(width as i64));
        metadata.insert("height".to_string(), MetadataValue::Number(height as i64));
        
        // Extract color type information
        let color_type = img.color();
        metadata.insert("color_type".to_string(), MetadataValue::Text(format!("{:?}", color_type)));
        
        // Extract PNG metadata chunks (tEXt, iTXt, zTXt)
        if let Ok(chunks) = Self::extract_png_chunks(data) {
            for (key, value) in chunks {
                metadata.insert(key, value);
            }
        }

        let content = maybe_ocr_content(data, &mut metadata);

        Ok(ExtractionResult {
            mime_type: mime_type.to_string(),
            content,
            metadata,
            detection_confidence: 0.0,
        })
    }
    
    fn name(&self) -> &str {
        "PngParser"
    }
}

impl PngParser {
    /// Extract the textual metadata chunks (tEXt, iTXt, zTXt) of a PNG file.
    ///
    /// Walks the chunk sequence that follows the 8-byte signature and returns
    /// one `(key, value)` pair per text chunk that parses successfully, in file
    /// order. Keys are the chunk keyword prefixed with `text_`, `itext_` or
    /// `ztext_` according to the chunk type. Malformed text chunks are skipped,
    /// and the walk stops quietly at the first chunk whose declared length runs
    /// past the end of `data`. Chunk CRCs are not verified.
    ///
    /// # Errors
    ///
    /// Returns `Error::ParseError` if `data` does not begin with the PNG
    /// signature.
    fn extract_png_chunks(data: &[u8]) -> Result<Vec<(String, MetadataValue)>> {
        // PNG signature: 137 80 78 71 13 10 26 10.
        const SIGNATURE: &[u8] = b"\x89PNG\r\n\x1a\n";
        // Bytes surrounding a chunk body: length (4) + type (4) + CRC (4).
        const CHUNK_OVERHEAD: usize = 12;

        if !data.starts_with(SIGNATURE) {
            return Err(Error::ParseError("Invalid PNG signature".to_string()));
        }

        let mut chunks = Vec::new();
        let mut pos = SIGNATURE.len();

        // Invariant: `pos <= data.len()`. All bounds are checked by subtracting
        // from `data.len()`, so the untrusted chunk length is never added to
        // anything before it has been validated (no overflow on 32-bit targets).
        while data.len() - pos >= CHUNK_OVERHEAD {
            // Chunk length: 4 bytes, big-endian, counts the body only.
            let mut length_bytes = [0u8; 4];
            length_bytes.copy_from_slice(&data[pos..pos + 4]);
            // u32 -> usize is lossless on 32- and 64-bit targets.
            let length = u32::from_be_bytes(length_bytes) as usize;

            // Chunk type: 4 bytes.
            let chunk_type = &data[pos + 4..pos + 8];

            // Stop if the declared body (plus CRC) does not fit in what is left.
            if length > data.len() - pos - CHUNK_OVERHEAD {
                break;
            }
            let body_start = pos + 8;
            let body = &data[body_start..body_start + length];

            let parsed = match chunk_type {
                b"tEXt" => Self::parse_text_chunk(body).map(|kv| ("text", kv)),
                b"iTXt" => Self::parse_itext_chunk(body).map(|kv| ("itext", kv)),
                b"zTXt" => Self::parse_ztext_chunk(body).map(|kv| ("ztext", kv)),
                _ => None,
            };
            if let Some((prefix, (key, value))) = parsed {
                chunks.push((format!("{}_{}", prefix, key), MetadataValue::Text(value)));
            }

            // Advance past the body and the 4-byte CRC.
            pos = body_start + length + 4;
        }

        Ok(chunks)
    }

    /// Split `bytes` at its first NUL byte, returning the parts before and
    /// after it (the NUL itself is dropped). `None` if there is no NUL.
    fn split_at_nul(bytes: &[u8]) -> Option<(&[u8], &[u8])> {
        let nul = bytes.iter().position(|&b| b == 0)?;
        Some((&bytes[..nul], &bytes[nul + 1..]))
    }

    /// Decode ISO 8859-1 (Latin-1) bytes into a `String`.
    ///
    /// Every Latin-1 byte maps to the Unicode code point of the same value, so
    /// the conversion is total and lossless (unlike a lossy UTF-8 decode, which
    /// would replace every byte >= 0x80).
    fn latin1_to_string(bytes: &[u8]) -> String {
        bytes.iter().map(|&b| char::from(b)).collect()
    }

    /// Parse a tEXt chunk body: `keyword NUL text`, both Latin-1.
    ///
    /// Returns `(keyword, text)`, or `None` if the NUL separator is missing.
    fn parse_text_chunk(data: &[u8]) -> Option<(String, String)> {
        let (keyword, text) = Self::split_at_nul(data)?;
        Some((Self::latin1_to_string(keyword), Self::latin1_to_string(text)))
    }

    /// Parse an iTXt chunk body (international text).
    ///
    /// Layout: `keyword NUL flag method language NUL translated-keyword NUL text`.
    /// The keyword is Latin-1; the text is UTF-8 and is zlib-compressed when the
    /// compression flag is 1. The language tag and translated keyword are
    /// skipped.
    ///
    /// Returns `(keyword, text)`, or `None` if the body is truncated, a
    /// separator is missing, or compressed text fails to inflate.
    fn parse_itext_chunk(data: &[u8]) -> Option<(String, String)> {
        let (keyword, rest) = Self::split_at_nul(data)?;
        let keyword = Self::latin1_to_string(keyword);

        // One byte each: compression flag, then compression method.
        let (&compression_flag, rest) = rest.split_first()?;
        let (_compression_method, rest) = rest.split_first()?;

        // Skip the language tag and the translated keyword.
        let (_language_tag, rest) = Self::split_at_nul(rest)?;
        let (_translated_keyword, text_bytes) = Self::split_at_nul(rest)?;

        let text = if compression_flag == 1 {
            inflate_zlib(text_bytes)?
        } else {
            String::from_utf8_lossy(text_bytes).into_owned()
        };
        Some((keyword, text))
    }

    /// Parse a zTXt chunk body: `keyword NUL method compressed-text`.
    ///
    /// Keyword and inflated text are both Latin-1. The method byte must be
    /// present but is not inspected (only 0 = zlib deflate is defined).
    ///
    /// Returns `(keyword, text)`, or `None` if the body is truncated or the
    /// payload fails to inflate.
    fn parse_ztext_chunk(data: &[u8]) -> Option<(String, String)> {
        let (keyword, rest) = Self::split_at_nul(data)?;
        let keyword = Self::latin1_to_string(keyword);

        let (_compression_method, compressed) = rest.split_first()?;
        let inflated = inflate_zlib_bytes(compressed)?;
        Some((keyword, Self::latin1_to_string(&inflated)))
    }
}

/// Inflate a zlib stream into raw bytes; `None` if decompression fails.
fn inflate_zlib_bytes(bytes: &[u8]) -> Option<Vec<u8>> {
    let mut out = Vec::with_capacity(bytes.len() * 2);
    ZlibDecoder::new(bytes).read_to_end(&mut out).ok()?;
    Some(out)
}

/// Inflate a zlib stream and decode the result as UTF-8, replacing invalid
/// sequences with U+FFFD; `None` if decompression fails.
fn inflate_zlib(bytes: &[u8]) -> Option<String> {
    inflate_zlib_bytes(bytes).map(|out| String::from_utf8_lossy(&out).into_owned())
}