koicore 0.2.3

core KoiLang module
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
//! Tests for the DecodeBufReader functionality

use koicore::parser::decode_buf_reader::DecodeBufReader;
use koicore::parser::input::EncodingErrorStrategy;
use std::io::{BufRead, Cursor};

#[test]
fn test_decode_buf_reader_utf8() {
    let input = "Hello, 世界!\n测试数据\nLine 3".as_bytes();
    let mut decoder = DecodeBufReader::new(Cursor::new(input));

    // Each line should come back exactly as written, newline included
    // (the final line has no terminator and runs to EOF).
    let expected = ["Hello, 世界!\n", "测试数据\n", "Line 3"];
    let mut buf = String::new();
    for want in expected {
        buf.clear();
        let n = decoder.read_line(&mut buf).unwrap();
        assert!(n > 0);
        assert_eq!(buf, want);
    }

    // A further read at EOF yields zero bytes and leaves the buffer empty.
    buf.clear();
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
    assert_eq!(buf, "");
}

#[test]
fn test_decode_buf_reader_chunk_decoding() {
    let input = Cursor::new("Hello, 世界!\n测试数据\nLine 3".as_bytes());
    let mut decoder = DecodeBufReader::new(input);

    // One generously-sized chunk request should succeed...
    assert!(decoder.decode_chunk(100).unwrap());

    // ...and the decoded text should be retrievable and non-trivial.
    let decoded = decoder.take_string().unwrap();
    assert!(!decoded.is_empty());
    assert!(decoded.contains("Hello"));
}

#[test]
fn test_decode_buf_reader_empty_input() {
    // An empty stream is immediately at EOF: zero bytes read, buffer untouched.
    let empty: &[u8] = &[];
    let mut decoder = DecodeBufReader::new(Cursor::new(empty));

    let mut buf = String::new();
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
    assert!(buf.is_empty());
}

#[test]
fn test_decode_buf_reader_with_encoding() {
    // "你好\n世界" encoded as GBK (two bytes per Han character, 0x0A newline).
    let data = vec![0xC4, 0xE3, 0xBA, 0xC3, 0x0A, 0xCA, 0xC0, 0xBD, 0xE7];
    let cursor = Cursor::new(data);
    let mut decoder = DecodeBufReader::with_encoding(cursor, encoding_rs::GBK);

    let mut line = String::new();
    let bytes_read = decoder.read_line(&mut line).unwrap();
    assert!(bytes_read > 0);
    // BUG FIX: the original asserted `line.contains("")`, which is trivially
    // true for every string and verified nothing. Assert the actual decoded
    // text: the first line must contain the GBK-decoded "你好".
    assert!(line.contains("你好"));
}

#[test]
fn test_decode_buf_reader_error_handling() {
    // This byte sequence is not valid UTF-8, so strict decoding must fail
    // rather than substitute replacement characters.
    let bytes = vec![0xC4, 0xE3, 0xBA, 0xC3, 0x0A, 0xFF, 0xFF];
    let mut decoder = DecodeBufReader::with_encoding_and_strategy(
        Cursor::new(bytes),
        encoding_rs::UTF_8,
        EncodingErrorStrategy::Strict,
    );

    let mut buf = String::new();
    assert!(decoder.read_line(&mut buf).is_err());
}

#[test]
fn test_decode_buf_reader_ultra_long_stream() {
    // Test with ultra-long input stream (simulating 256GB+ data).
    // 256GB = 274,877,906,944 bytes; at ~100 bytes/line that is roughly
    // 2.7 billion lines. We generate lines on the fly so nothing near that
    // size is ever allocated, and run a smaller count to keep the test fast.
    let simulated_line_count = 2_700_000_000u64; // what 256GB+ would represent
    let test_line_count = 100_000u64; // actual number of lines streamed

    println!("Simulating ultra-long stream processing (would represent {} lines for 256GB+)", simulated_line_count);

    // Reader that synthesizes its content one line at a time.
    struct MassiveDataStreamSimulator {
        current_line: u64,
        total_lines: u64,
        line_template: String,
        // Bytes of the current line not yet handed to the caller.
        // BUG FIX: the original advanced `current_line` after copying only
        // `min(buf.len(), line.len())` bytes, silently dropping the tail of
        // any line longer than the caller's buffer. Buffering the remainder
        // guarantees every byte is delivered exactly once.
        pending: Vec<u8>,
        pending_pos: usize,
    }

    impl MassiveDataStreamSimulator {
        fn new(total_lines: u64) -> Self {
            Self {
                current_line: 0,
                total_lines,
                line_template: "This is a simulated line number {} with substantial content to represent realistic data size including Unicode characters: 你好世界 🚀🌟⭐ and additional text to make each line approximately 100 bytes long for accurate simulation\n".to_string(),
                pending: Vec::new(),
                pending_pos: 0,
            }
        }
    }

    impl std::io::Read for MassiveDataStreamSimulator {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            if buf.is_empty() {
                return Ok(0);
            }

            // Generate the next line only once the previous one is drained.
            if self.pending_pos >= self.pending.len() {
                if self.current_line >= self.total_lines {
                    return Ok(0); // EOF
                }
                let line_content = self
                    .line_template
                    .replace("{}", &self.current_line.to_string());
                self.current_line += 1;
                self.pending.clear();
                self.pending.extend_from_slice(line_content.as_bytes());
                self.pending_pos = 0;
            }

            // Hand out as much of the pending line as fits; the remainder is
            // kept for the next call.
            let remaining = &self.pending[self.pending_pos..];
            let to_copy = remaining.len().min(buf.len());
            buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
            self.pending_pos += to_copy;

            Ok(to_copy)
        }
    }

    let reader = MassiveDataStreamSimulator::new(test_line_count);
    let mut decoder = DecodeBufReader::new(reader);

    let mut line_count_read = 0u64;
    let mut line = String::new();
    let mut total_bytes_read = 0u64;

    // Read all lines - this demonstrates the streaming capability.
    while decoder.read_line(&mut line).unwrap() > 0 {
        line_count_read += 1;
        total_bytes_read += line.len() as u64;

        // Verify content structure (first few lines only, to keep it fast).
        if line_count_read <= 1000 {
            assert!(line.starts_with("This is a simulated line number "));
            assert!(line.contains("你好世界"));
            assert!(line.contains("🚀🌟⭐"));
            assert!(line.ends_with("\n"));
        }

        // Progress reporting for long-running test.
        if line_count_read % 20000 == 0 {
            println!("Processed {} lines, {} bytes total", line_count_read, total_bytes_read);
        }

        line.clear();
    }

    assert_eq!(line_count_read, test_line_count);
    println!("Successfully processed {} lines representing {} GB+ of simulated data", 
             line_count_read, 
             (total_bytes_read as f64 / (1024.0 * 1024.0 * 1024.0)) as u64);
}

#[test]
fn test_decode_buf_reader_large_chunk_decoding() {
    // Three long lines of repeated characters, far larger than one chunk.
    let content = format!(
        "{}\n{}\n{}",
        "A".repeat(50000),
        "B".repeat(30000),
        "C".repeat(20000)
    );
    let mut decoder = DecodeBufReader::new(Cursor::new(content.into_bytes()));

    // Decode in small chunks to exercise internal buffer management.
    let mut accumulated = String::new();
    let mut chunk_count = 0;
    while decoder.decode_chunk(1000).unwrap() {
        if let Some(piece) = decoder.take_string() {
            accumulated.push_str(&piece);
            chunk_count += 1;
        }
    }

    // We should have received data, in multiple chunks, covering all
    // three sections of the input.
    assert!(!accumulated.is_empty());
    assert!(chunk_count > 0);
    for run in ["AAAAA", "BBBBB", "CCCCC"] {
        assert!(accumulated.contains(run));
    }
}

#[test]
fn test_decode_buf_reader_multibyte_boundaries() {
    // 300 emoji-only lines; 4-byte code points will straddle internal
    // buffer boundaries somewhere in a stream this long.
    let content = "🚀🌟⭐\n🌍🌎🌏\n😀😃😄\n".repeat(100);
    let mut decoder = DecodeBufReader::new(Cursor::new(content.into_bytes()));

    let mut buf = String::new();
    let mut count = 0;
    while decoder.read_line(&mut buf).unwrap() > 0 {
        count += 1;
        // Every line must decode to one of the three emoji rows intact.
        let recognized = buf.contains("🚀") || buf.contains("🌍") || buf.contains("😀");
        assert!(recognized);
        buf.clear();
    }

    assert_eq!(count, 300); // 3 distinct lines × 100 repeats
}

#[test]
fn test_decode_buf_reader_mixed_encodings() {
    // Valid UTF-8 text surrounding a short run of undecodable bytes.
    let mut bytes = Vec::new();
    bytes.extend_from_slice(b"Valid UTF-8 text\n");
    bytes.extend_from_slice(&[0xFF, 0xFE, 0xFD]); // never valid in UTF-8
    bytes.extend_from_slice(b"\nMore valid text\n");

    // Replace mode substitutes for undecodable bytes instead of erroring out.
    let mut decoder = DecodeBufReader::with_encoding_and_strategy(
        Cursor::new(bytes),
        encoding_rs::UTF_8,
        EncodingErrorStrategy::Replace,
    );

    let mut all = String::new();
    let mut buf = String::new();
    while decoder.read_line(&mut buf).unwrap() > 0 {
        all.push_str(&buf);
        buf.clear();
    }

    // Both valid segments must survive decoding around the bad bytes.
    assert!(all.contains("Valid UTF-8 text"));
    assert!(all.contains("More valid text"));
}

#[test]
fn test_decode_buf_reader_buffer_boundary_conditions() {
    // Size two lines just under the decoder's internal buffer so the line
    // break lands right in the boundary region.
    let buffer_size = 8192; // matches the decoder's current buffer size
    let content = format!(
        "{}\n{}",
        "X".repeat(buffer_size - 10),
        "Y".repeat(buffer_size - 5)
    );
    let mut decoder = DecodeBufReader::new(Cursor::new(content.into_bytes()));

    let mut buf = String::new();

    // First line: terminated by the newline.
    assert!(decoder.read_line(&mut buf).unwrap() > 0);
    assert!(buf.starts_with('X'));
    assert!(buf.ends_with('\n'));
    buf.clear();

    // Second line: runs to EOF without a terminator.
    assert!(decoder.read_line(&mut buf).unwrap() > 0);
    assert!(buf.starts_with('Y'));
    buf.clear();

    // Nothing left.
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
}

#[test]
fn test_decode_buf_reader_zero_sized_reads() {
    let mut decoder = DecodeBufReader::new(Cursor::new("Test data".as_bytes()));

    // Requesting a zero-character chunk succeeds but yields no decoded text.
    assert!(decoder.decode_chunk(0).unwrap());
    assert!(decoder.take_string().is_none());
}

#[test]
fn test_decode_buf_reader_consume_behavior() {
    // Two short terminated lines: reading them back in order verifies that
    // consumed buffer data is not re-served on subsequent reads.
    let mut decoder =
        DecodeBufReader::new(Cursor::new("Short line 1\nShort line 2\n".as_bytes()));

    let mut buf = String::new();
    for expected in ["Short line 1\n", "Short line 2\n"] {
        assert!(decoder.read_line(&mut buf).unwrap() > 0);
        assert_eq!(buf, expected);
        buf.clear();
    }

    // After both lines the stream is exhausted.
    assert_eq!(decoder.read_line(&mut buf).unwrap(), 0);
    assert_eq!(buf, "");
}

#[test]
fn test_decode_buf_reader_extremely_large_stream() {
    // Stream 100,000 generated lines through the decoder without ever
    // materializing the whole input in memory.
    let line_count = 100000;
    println!("Testing extremely large stream with {} lines...", line_count);

    // Reader that synthesizes its content on demand, one line at a time.
    struct LargeDataStream {
        current_line: usize,
        total_lines: usize,
        // Bytes of the current line not yet delivered to the caller.
        // BUG FIX: the original advanced `current_line` after copying only
        // `min(buf.len(), line.len())` bytes, silently truncating any line
        // longer than the caller's buffer. Buffering the remainder ensures
        // every byte is delivered exactly once.
        pending: Vec<u8>,
        pending_pos: usize,
    }

    impl LargeDataStream {
        fn new(total_lines: usize) -> Self {
            Self {
                current_line: 0,
                total_lines,
                pending: Vec::new(),
                pending_pos: 0,
            }
        }
    }

    impl std::io::Read for LargeDataStream {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            if buf.is_empty() {
                return Ok(0);
            }

            // Generate the next line only once the previous one is drained.
            if self.pending_pos >= self.pending.len() {
                if self.current_line >= self.total_lines {
                    return Ok(0); // EOF
                }
                let line = format!(
                    "Line number {} with substantial content including Unicode: 你好世界 🚀🌟⭐ and numbers: {}\n",
                    self.current_line,
                    "ABC".repeat(50) // substantial repeated content per line
                );
                self.current_line += 1;
                self.pending.clear();
                self.pending.extend_from_slice(line.as_bytes());
                self.pending_pos = 0;
            }

            // Hand out as much of the pending line as fits; keep the rest.
            let remaining = &self.pending[self.pending_pos..];
            let to_copy = remaining.len().min(buf.len());
            buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
            self.pending_pos += to_copy;

            Ok(to_copy)
        }
    }

    let reader = LargeDataStream::new(line_count);
    let mut decoder = DecodeBufReader::new(reader);

    let mut lines_read = 0;
    let mut line = String::new();
    let mut total_chars_read = 0;

    // Read all lines without loading everything into memory.
    while decoder.read_line(&mut line).unwrap() > 0 {
        lines_read += 1;
        total_chars_read += line.len();

        // Verify content structure.
        assert!(line.contains("Line number"));
        assert!(line.contains("你好世界"));
        assert!(line.contains("🚀🌟⭐"));
        assert!(line.ends_with('\n'));

        // Progress reporting for very long tests.
        if lines_read % 10000 == 0 {
            println!("Processed {} lines, {} characters total", lines_read, total_chars_read);
        }

        line.clear();
    }

    assert_eq!(lines_read, line_count);
    assert!(total_chars_read > 0);
    println!("Successfully processed {} lines with {} total characters", lines_read, total_chars_read);
}

#[test]
fn test_decode_buf_reader_memory_efficient_processing() {
    // Ensure the decoder processes large streams incrementally: the reader
    // below deliberately hands out at most 1 KiB per read call so the
    // decoder must stream rather than slurp.
    let large_line_count = 50000;
    println!("Testing memory-efficient processing with {} lines...", large_line_count);

    struct MemoryEfficientReader {
        current_line: usize,
        total_lines: usize,
        // Upper bound on bytes returned per `read` call, forcing streaming.
        max_buffer_size: usize,
        // Bytes of the current line not yet delivered to the caller.
        // BUG FIX: the original advanced `current_line` after copying only
        // part of the line whenever the caller's buffer (or the cap) was
        // smaller than the line, silently dropping the remainder. The lines
        // here are short so it never fired, but the latent data loss is
        // fixed by buffering the undelivered tail.
        pending: Vec<u8>,
        pending_pos: usize,
    }

    impl MemoryEfficientReader {
        fn new(total_lines: usize) -> Self {
            Self {
                current_line: 0,
                total_lines,
                max_buffer_size: 1024, // small cap to guarantee streaming
                pending: Vec::new(),
                pending_pos: 0,
            }
        }
    }

    impl std::io::Read for MemoryEfficientReader {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            // Never provide more per call than the configured cap.
            let cap = std::cmp::min(self.max_buffer_size, buf.len());
            if cap == 0 {
                return Ok(0);
            }

            // Generate the next line only once the previous one is drained.
            if self.pending_pos >= self.pending.len() {
                if self.current_line >= self.total_lines {
                    return Ok(0); // EOF
                }
                let line_content = format!("Memory test line {}\n", self.current_line);
                self.current_line += 1;
                self.pending.clear();
                self.pending.extend_from_slice(line_content.as_bytes());
                self.pending_pos = 0;
            }

            // Hand out as much of the pending line as the cap allows.
            let remaining = &self.pending[self.pending_pos..];
            let to_copy = std::cmp::min(cap, remaining.len());
            buf[..to_copy].copy_from_slice(&remaining[..to_copy]);
            self.pending_pos += to_copy;

            Ok(to_copy)
        }
    }

    let reader = MemoryEfficientReader::new(large_line_count);
    let mut decoder = DecodeBufReader::new(reader);

    let mut lines_read = 0;
    let mut line = String::new();

    // Process lines one by one, ensuring memory efficiency.
    while decoder.read_line(&mut line).unwrap() > 0 {
        lines_read += 1;

        // Verify each line is processed correctly.
        assert!(line.starts_with("Memory test line "));
        assert!(line.ends_with('\n'));

        line.clear();

        // This test should run without memory issues even with large line counts.
        if lines_read % 10000 == 0 {
            println!("Memory-efficient processing: {} lines completed", lines_read);
        }
    }

    assert_eq!(lines_read, large_line_count);
    println!("Memory-efficient processing completed successfully with {} lines", lines_read);
}