koicore 0.2.3

core KoiLang module
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
//! Tests for the DecodeBufReader functionality

use koicore::parser::decode_buf_reader::DecodeBufReader;
use koicore::parser::input::EncodingErrorStrategy;
use std::io::{BufRead, Cursor};

#[test]
fn test_decode_buf_reader_utf8() {
    let input = "Hello, 世界!\n测试数据\nLine 3".as_bytes();
    let mut decoder = DecodeBufReader::new(Cursor::new(input));

    // Each line should come back exactly as written, newline included
    // (the final line has no terminator and runs to EOF).
    let expected = ["Hello, 世界!\n", "测试数据\n", "Line 3"];
    let mut buf = String::new();
    for want in expected {
        buf.clear();
        let n = decoder.read_line(&mut buf).unwrap();
        assert!(n > 0);
        assert_eq!(buf, want);
    }

    // A further read at EOF yields zero bytes and leaves the buffer empty.
    buf.clear();
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
    assert_eq!(buf, "");
}

#[test]
fn test_decode_buf_reader_chunk_decoding() {
    let input = Cursor::new("Hello, 世界!\n测试数据\nLine 3".as_bytes());
    let mut decoder = DecodeBufReader::new(input);

    // One generously-sized chunk request should succeed...
    assert!(decoder.decode_chunk(100).unwrap());

    // ...and the decoded text should be retrievable and non-trivial.
    let decoded = decoder.take_string().unwrap();
    assert!(!decoded.is_empty());
    assert!(decoded.contains("Hello"));
}

#[test]
fn test_decode_buf_reader_empty_input() {
    // An empty stream is immediately at EOF: zero bytes read, buffer untouched.
    let empty: &[u8] = &[];
    let mut decoder = DecodeBufReader::new(Cursor::new(empty));

    let mut buf = String::new();
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
    assert!(buf.is_empty());
}

#[test]
fn test_decode_buf_reader_with_encoding() {
    // "你好\n世界" encoded as GBK (two bytes per Han character, 0x0A newline).
    let data = vec![0xC4, 0xE3, 0xBA, 0xC3, 0x0A, 0xCA, 0xC0, 0xBD, 0xE7];
    let cursor = Cursor::new(data);
    let mut decoder = DecodeBufReader::with_encoding(cursor, encoding_rs::GBK);

    let mut line = String::new();
    let bytes_read = decoder.read_line(&mut line).unwrap();
    assert!(bytes_read > 0);
    // BUG FIX: the original asserted `line.contains("")`, which is trivially
    // true for every string and verified nothing. Assert the actual decoded
    // text: the first line must contain the GBK-decoded "你好".
    assert!(line.contains("你好"));
}

#[test]
fn test_decode_buf_reader_error_handling() {
    // This byte sequence is not valid UTF-8, so strict decoding must fail
    // rather than substitute replacement characters.
    let bytes = vec![0xC4, 0xE3, 0xBA, 0xC3, 0x0A, 0xFF, 0xFF];
    let mut decoder = DecodeBufReader::with_encoding_and_strategy(
        Cursor::new(bytes),
        encoding_rs::UTF_8,
        EncodingErrorStrategy::Strict,
    );

    let mut buf = String::new();
    assert!(decoder.read_line(&mut buf).is_err());
}

#[test]
fn test_decode_buf_reader_ultra_long_stream() {
    // Test with ultra-long input stream (simulating 256GB+ data).
    // 256GB = 274,877,906,944 bytes; at ~100 bytes/line that is roughly
    // 2.7 billion lines. We generate lines on the fly so nothing near that
    // size is ever allocated, and run a smaller count to keep the test fast.
    let simulated_line_count = 2_700_000_000u64; // what 256GB+ would represent
    let test_line_count = 100_000u64; // actual number of lines streamed

    println!("Simulating ultra-long stream processing (would represent {} lines for 256GB+)", simulated_line_count);

    // Reader that synthesizes its content one line at a time.
    struct MassiveDataStreamSimulator {
        current_line: u64,
        total_lines: u64,
        line_template: String,
        // Bytes of the current line not yet handed to the caller.
        // BUG FIX: the original advanced `current_line` after copying only
        // `min(buf.len(), line.len())` bytes, silently dropping the tail of
        // any line longer than the caller's buffer. Buffering the remainder
        // guarantees every byte is delivered exactly once.
        pending: Vec<u8>,
        pending_pos: usize,
    }

    impl MassiveDataStreamSimulator {
        fn new(total_lines: u64) -> Self {
            Self {
                current_line: 0,
                total_lines,
                line_template: "This is a simulated line number {} with substantial content to represent realistic data size including Unicode characters: 你好世界 🚀🌟⭐ and additional text to make each line approximately 100 bytes long for accurate simulation\n".to_string(),
                pending: Vec::new(),
                pending_pos: 0,
            }
        }
    }

    impl std::io::Read for MassiveDataStreamSimulator {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            if buf.is_empty() {
                return Ok(0);
            }

            // Generate the next line only once the previous one is drained.
            if self.pending_pos >= self.pending.len() {
                if self.current_line >= self.total_lines {
                    return Ok(0); // EOF
                }
                let line_content = self
                    .line_template
                    .replace("{}", &self.current_line.to_string());
                self.current_line += 1;
                self.pending.clear();
                self.pending.extend_from_slice(line_content.as_bytes());
                self.pending_pos = 0;
            }

            // Hand out as much of the pending line as fits; the remainder is
            // kept for the next call.
            let remaining = &self.pending[self.pending_pos..];
            let to_copy = remaining.len().min(buf.len());
            buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
            self.pending_pos += to_copy;

            Ok(to_copy)
        }
    }

    let reader = MassiveDataStreamSimulator::new(test_line_count);
    let mut decoder = DecodeBufReader::new(reader);

    let mut line_count_read = 0u64;
    let mut line = String::new();
    let mut total_bytes_read = 0u64;

    // Read all lines - this demonstrates the streaming capability.
    while decoder.read_line(&mut line).unwrap() > 0 {
        line_count_read += 1;
        total_bytes_read += line.len() as u64;

        // Verify content structure (first few lines only, to keep it fast).
        if line_count_read <= 1000 {
            assert!(line.starts_with("This is a simulated line number "));
            assert!(line.contains("你好世界"));
            assert!(line.contains("🚀🌟⭐"));
            assert!(line.ends_with("\n"));
        }

        // Progress reporting for long-running test.
        if line_count_read % 20000 == 0 {
            println!("Processed {} lines, {} bytes total", line_count_read, total_bytes_read);
        }

        line.clear();
    }

    assert_eq!(line_count_read, test_line_count);
    println!("Successfully processed {} lines representing {} GB+ of simulated data", 
             line_count_read, 
             (total_bytes_read as f64 / (1024.0 * 1024.0 * 1024.0)) as u64);
}

#[test]
fn test_decode_buf_reader_large_chunk_decoding() {
    // Three long lines of repeated characters, far larger than one chunk.
    let content = format!(
        "{}\n{}\n{}",
        "A".repeat(50000),
        "B".repeat(30000),
        "C".repeat(20000)
    );
    let mut decoder = DecodeBufReader::new(Cursor::new(content.into_bytes()));

    // Decode in small chunks to exercise internal buffer management.
    let mut accumulated = String::new();
    let mut chunk_count = 0;
    while decoder.decode_chunk(1000).unwrap() {
        if let Some(piece) = decoder.take_string() {
            accumulated.push_str(&piece);
            chunk_count += 1;
        }
    }

    // We should have received data, in multiple chunks, covering all
    // three sections of the input.
    assert!(!accumulated.is_empty());
    assert!(chunk_count > 0);
    for run in ["AAAAA", "BBBBB", "CCCCC"] {
        assert!(accumulated.contains(run));
    }
}

#[test]
fn test_decode_buf_reader_multibyte_boundaries() {
    // 300 emoji-only lines; 4-byte code points will straddle internal
    // buffer boundaries somewhere in a stream this long.
    let content = "🚀🌟⭐\n🌍🌎🌏\n😀😃😄\n".repeat(100);
    let mut decoder = DecodeBufReader::new(Cursor::new(content.into_bytes()));

    let mut buf = String::new();
    let mut count = 0;
    while decoder.read_line(&mut buf).unwrap() > 0 {
        count += 1;
        // Every line must decode to one of the three emoji rows intact.
        let recognized = buf.contains("🚀") || buf.contains("🌍") || buf.contains("😀");
        assert!(recognized);
        buf.clear();
    }

    assert_eq!(count, 300); // 3 distinct lines × 100 repeats
}

#[test]
fn test_decode_buf_reader_mixed_encodings() {
    // Valid UTF-8 text surrounding a short run of undecodable bytes.
    let mut bytes = Vec::new();
    bytes.extend_from_slice(b"Valid UTF-8 text\n");
    bytes.extend_from_slice(&[0xFF, 0xFE, 0xFD]); // never valid in UTF-8
    bytes.extend_from_slice(b"\nMore valid text\n");

    // Replace mode substitutes for undecodable bytes instead of erroring out.
    let mut decoder = DecodeBufReader::with_encoding_and_strategy(
        Cursor::new(bytes),
        encoding_rs::UTF_8,
        EncodingErrorStrategy::Replace,
    );

    let mut all = String::new();
    let mut buf = String::new();
    while decoder.read_line(&mut buf).unwrap() > 0 {
        all.push_str(&buf);
        buf.clear();
    }

    // Both valid segments must survive decoding around the bad bytes.
    assert!(all.contains("Valid UTF-8 text"));
    assert!(all.contains("More valid text"));
}

#[test]
fn test_decode_buf_reader_buffer_boundary_conditions() {
    // Size two lines just under the decoder's internal buffer so the line
    // break lands right in the boundary region.
    let buffer_size = 8192; // matches the decoder's current buffer size
    let content = format!(
        "{}\n{}",
        "X".repeat(buffer_size - 10),
        "Y".repeat(buffer_size - 5)
    );
    let mut decoder = DecodeBufReader::new(Cursor::new(content.into_bytes()));

    let mut buf = String::new();

    // First line: terminated by the newline.
    assert!(decoder.read_line(&mut buf).unwrap() > 0);
    assert!(buf.starts_with('X'));
    assert!(buf.ends_with('\n'));
    buf.clear();

    // Second line: runs to EOF without a terminator.
    assert!(decoder.read_line(&mut buf).unwrap() > 0);
    assert!(buf.starts_with('Y'));
    buf.clear();

    // Nothing left.
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
}

#[test]
fn test_decode_buf_reader_zero_sized_reads() {
    let mut decoder = DecodeBufReader::new(Cursor::new("Test data".as_bytes()));

    // Requesting a zero-character chunk succeeds but yields no decoded text.
    assert!(decoder.decode_chunk(0).unwrap());
    assert!(decoder.take_string().is_none());
}

#[test]
fn test_decode_buf_reader_consume_behavior() {
    // Two short terminated lines: reading them back in order verifies that
    // consumed buffer data is not re-served on subsequent reads.
    let mut decoder =
        DecodeBufReader::new(Cursor::new("Short line 1\nShort line 2\n".as_bytes()));

    let mut buf = String::new();
    for expected in ["Short line 1\n", "Short line 2\n"] {
        assert!(decoder.read_line(&mut buf).unwrap() > 0);
        assert_eq!(buf, expected);
        buf.clear();
    }

    // After both lines the stream is exhausted.
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
    assert_eq!(buf, "");
}

#[test]
fn test_decode_buf_reader_extremely_large_stream() {
    // Stream 100,000 generated lines through the decoder without ever
    // materializing the whole input in memory.
    let line_count = 100000;
    println!("Testing extremely large stream with {} lines...", line_count);

    // Reader that synthesizes its content on demand, one line at a time.
    struct LargeDataStream {
        current_line: usize,
        total_lines: usize,
        // Bytes of the current line not yet delivered to the caller.
        // BUG FIX: the original advanced `current_line` after copying only
        // `min(buf.len(), line.len())` bytes, silently truncating any line
        // longer than the caller's buffer. Buffering the remainder ensures
        // every byte is delivered exactly once.
        pending: Vec<u8>,
        pending_pos: usize,
    }

    impl LargeDataStream {
        fn new(total_lines: usize) -> Self {
            Self {
                current_line: 0,
                total_lines,
                pending: Vec::new(),
                pending_pos: 0,
            }
        }
    }

    impl std::io::Read for LargeDataStream {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            if buf.is_empty() {
                return Ok(0);
            }

            // Generate the next line only once the previous one is drained.
            if self.pending_pos >= self.pending.len() {
                if self.current_line >= self.total_lines {
                    return Ok(0); // EOF
                }
                let line = format!(
                    "Line number {} with substantial content including Unicode: 你好世界 🚀🌟⭐ and numbers: {}\n",
                    self.current_line,
                    "ABC".repeat(50) // substantial repeated content per line
                );
                self.current_line += 1;
                self.pending.clear();
                self.pending.extend_from_slice(line.as_bytes());
                self.pending_pos = 0;
            }

            // Hand out as much of the pending line as fits; keep the rest.
            let remaining = &self.pending[self.pending_pos..];
            let to_copy = remaining.len().min(buf.len());
            buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
            self.pending_pos += to_copy;

            Ok(to_copy)
        }
    }

    let reader = LargeDataStream::new(line_count);
    let mut decoder = DecodeBufReader::new(reader);

    let mut lines_read = 0;
    let mut line = String::new();
    let mut total_chars_read = 0;

    // Read all lines without loading everything into memory.
    while decoder.read_line(&mut line).unwrap() > 0 {
        lines_read += 1;
        total_chars_read += line.len();

        // Verify content structure.
        assert!(line.contains("Line number"));
        assert!(line.contains("你好世界"));
        assert!(line.contains("🚀🌟⭐"));
        assert!(line.ends_with('\n'));

        // Progress reporting for very long tests.
        if lines_read % 10000 == 0 {
            println!("Processed {} lines, {} characters total", lines_read, total_chars_read);
        }

        line.clear();
    }

    assert_eq!(lines_read, line_count);
    assert!(total_chars_read > 0);
    println!("Successfully processed {} lines with {} total characters", lines_read, total_chars_read);
}

#[test]
fn test_decode_buf_reader_memory_efficient_processing() {
    // Ensure the decoder processes large streams incrementally: the reader
    // below deliberately hands out at most 1 KiB per read call so the
    // decoder must stream rather than slurp.
    let large_line_count = 50000;
    println!("Testing memory-efficient processing with {} lines...", large_line_count);

    struct MemoryEfficientReader {
        current_line: usize,
        total_lines: usize,
        // Upper bound on bytes returned per `read` call, forcing streaming.
        max_buffer_size: usize,
        // Bytes of the current line not yet delivered to the caller.
        // BUG FIX: the original advanced `current_line` after copying only
        // part of the line whenever the caller's buffer (or the cap) was
        // smaller than the line, silently dropping the remainder. The lines
        // here are short so it never fired, but the latent data loss is
        // fixed by buffering the undelivered tail.
        pending: Vec<u8>,
        pending_pos: usize,
    }

    impl MemoryEfficientReader {
        fn new(total_lines: usize) -> Self {
            Self {
                current_line: 0,
                total_lines,
                max_buffer_size: 1024, // small cap to guarantee streaming
                pending: Vec::new(),
                pending_pos: 0,
            }
        }
    }

    impl std::io::Read for MemoryEfficientReader {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            // Never provide more per call than the configured cap.
            let cap = std::cmp::min(self.max_buffer_size, buf.len());
            if cap == 0 {
                return Ok(0);
            }

            // Generate the next line only once the previous one is drained.
            if self.pending_pos >= self.pending.len() {
                if self.current_line >= self.total_lines {
                    return Ok(0); // EOF
                }
                let line_content = format!("Memory test line {}\n", self.current_line);
                self.current_line += 1;
                self.pending.clear();
                self.pending.extend_from_slice(line_content.as_bytes());
                self.pending_pos = 0;
            }

            // Hand out as much of the pending line as the cap allows.
            let remaining = &self.pending[self.pending_pos..];
            let to_copy = std::cmp::min(cap, remaining.len());
            buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
            self.pending_pos += to_copy;

            Ok(to_copy)
        }
    }

    let reader = MemoryEfficientReader::new(large_line_count);
    let mut decoder = DecodeBufReader::new(reader);

    let mut lines_read = 0;
    let mut line = String::new();

    // Process lines one by one, ensuring memory efficiency.
    while decoder.read_line(&mut line).unwrap() > 0 {
        lines_read += 1;

        // Verify each line is processed correctly.
        assert!(line.starts_with("Memory test line "));
        assert!(line.ends_with('\n'));

        line.clear();

        // This test should run without memory issues even with large line counts.
        if lines_read % 10000 == 0 {
            println!("Memory-efficient processing: {} lines completed", lines_read);
        }
    }

    assert_eq!(lines_read, large_line_count);
    println!("Memory-efficient processing completed successfully with {} lines", lines_read);
}