Skip to main content

coreutils_rs/split/
core.rs

1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
3use std::path::{Path, PathBuf};
4use std::process::{Command, Stdio};
5
/// Suffix type for output filenames.
///
/// For the numeric variants the payload is the value of the very first
/// suffix; chunk `i` is named after `start + i`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum SuffixType {
    /// Alphabetic suffixes: aa, ab, ..., zz, aaa, ...
    Alphabetic,
    /// Numeric (decimal) suffixes: 00, 01, ..., 99, 000, ...
    Numeric(u64),
    /// Hexadecimal (lowercase) suffixes: 00, 01, ..., ff, 000, ...
    Hex(u64),
}
16
/// Split mode: how to divide the input.
///
/// Variants carrying two numbers hold `(K, N)`: the 1-indexed chunk to
/// extract and the total number of chunks.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum SplitMode {
    /// Split every N lines (default 1000).
    Lines(u64),
    /// Split every N bytes.
    Bytes(u64),
    /// Split at line boundaries, at most N bytes per file.
    LineBytes(u64),
    /// Split into exactly N output files (by byte count).
    Number(u64),
    /// Extract Kth chunk of N total (K/N format, 1-indexed).
    NumberExtract(u64, u64),
    /// Split into N output files by line boundaries (l/N format).
    LineChunks(u64),
    /// Extract Kth line-based chunk of N total (l/K/N format).
    LineChunkExtract(u64, u64),
    /// Round-robin distribute lines across N output files (r/N format).
    RoundRobin(u64),
    /// Extract Kth round-robin chunk of N total (r/K/N format).
    RoundRobinExtract(u64, u64),
}
39
40/// Configuration for the split command.
41#[derive(Clone, Debug)]
42pub struct SplitConfig {
43    pub mode: SplitMode,
44    pub suffix_type: SuffixType,
45    pub suffix_length: usize,
46    pub additional_suffix: String,
47    pub prefix: String,
48    pub elide_empty: bool,
49    pub verbose: bool,
50    pub filter: Option<String>,
51    pub separator: u8,
52}
53
54impl Default for SplitConfig {
55    fn default() -> Self {
56        Self {
57            mode: SplitMode::Lines(1000),
58            suffix_type: SuffixType::Alphabetic,
59            suffix_length: 2,
60            additional_suffix: String::new(),
61            prefix: "x".to_string(),
62            elide_empty: false,
63            verbose: false,
64            filter: None,
65            separator: b'\n',
66        }
67    }
68}
69
/// Parse a SIZE string with an optional multiplier suffix.
///
/// Accepted suffixes:
/// - `b` = 512
/// - `K`/`k`/`KiB`/`kiB` = 1024, `M`/`MiB` = 1024^2, ... up to `E`/`EiB` = 1024^6
/// - `KB`/`kB` = 1000, `MB` = 1000^2, ... up to `EB` = 1000^6
/// - `Z`, `Y`, `R`, `Q` (and their `B`/`iB` forms) exceed `u64`; they
///   saturate to `u64::MAX` for a non-zero count and yield 0 for a zero count.
///
/// Surrounding whitespace is ignored. A leading `+` is accepted; a leading
/// `-` is consumed as part of the number and then rejected as invalid.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, has no valid
/// numeric part, carries an unknown suffix, or overflows `u64` after the
/// multiplier is applied.
pub fn parse_size(s: &str) -> Result<u64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty size".to_string());
    }

    // The numeric part is an optional sign followed by ASCII digits; it ends
    // at the first character that is neither.
    let num_end = s
        .char_indices()
        .find(|&(i, c)| !(c.is_ascii_digit() || (i == 0 && (c == '+' || c == '-'))))
        .map_or(s.len(), |(i, _)| i);

    if num_end == 0 {
        return Err(format!("invalid number: '{}'", s));
    }

    let (num_str, suffix) = s.split_at(num_end);

    let num: u64 = num_str
        .parse()
        .map_err(|_| format!("invalid number: '{}'", num_str))?;

    let multiplier: u64 = match suffix {
        "" => 1,
        "b" => 512,
        // Both spellings of the kilo prefix are accepted, for the decimal
        // and the binary unit alike.
        "kB" | "KB" => 1000,
        "k" | "K" | "kiB" | "KiB" => 1024,
        "MB" => 1_000_000,
        "M" | "MiB" => 1_048_576,
        "GB" => 1_000_000_000,
        "G" | "GiB" => 1_073_741_824,
        "TB" => 1_000_000_000_000,
        "T" | "TiB" => 1_099_511_627_776,
        "PB" => 1_000_000_000_000_000,
        "P" | "PiB" => 1_125_899_906_842_624,
        "EB" => 1_000_000_000_000_000_000,
        "E" | "EiB" => 1_152_921_504_606_846_976,
        // These multipliers do not fit in 64 bits: saturate instead of failing.
        "ZB" | "Z" | "ZiB" | "YB" | "Y" | "YiB" | "RB" | "R" | "RiB" | "QB" | "Q" | "QiB" => {
            return Ok(if num > 0 { u64::MAX } else { 0 });
        }
        _ => return Err(format!("invalid suffix in '{}'", s)),
    };

    num.checked_mul(multiplier)
        .ok_or_else(|| format!("number too large: '{}'", s))
}
128
129/// Generate the suffix string for a given chunk index.
130pub fn generate_suffix(index: u64, suffix_type: &SuffixType, suffix_length: usize) -> String {
131    match suffix_type {
132        SuffixType::Alphabetic => {
133            let mut result = Vec::with_capacity(suffix_length);
134            let mut remaining = index;
135            for _ in 0..suffix_length {
136                result.push(b'a' + (remaining % 26) as u8);
137                remaining /= 26;
138            }
139            result.reverse();
140            String::from_utf8(result).unwrap()
141        }
142        SuffixType::Numeric(start) => {
143            let val = start + index;
144            format!("{:0>width$}", val, width = suffix_length)
145        }
146        SuffixType::Hex(start) => {
147            let val = start + index;
148            format!("{:0>width$x}", val, width = suffix_length)
149        }
150    }
151}
152
153/// Compute the maximum number of chunks supported for a given suffix configuration.
154pub fn max_chunks(suffix_type: &SuffixType, suffix_length: usize) -> u64 {
155    match suffix_type {
156        SuffixType::Alphabetic => 26u64.saturating_pow(suffix_length as u32),
157        SuffixType::Numeric(_) | SuffixType::Hex(_) => 10u64.saturating_pow(suffix_length as u32),
158    }
159}
160
161/// Build the output file path for a given chunk index.
162fn output_path(config: &SplitConfig, index: u64) -> String {
163    let suffix = generate_suffix(index, &config.suffix_type, config.suffix_length);
164    format!("{}{}{}", config.prefix, suffix, config.additional_suffix)
165}
166
/// Output sink for a single chunk: either a file on disk or the stdin pipe
/// of a filter command.
trait ChunkWriter: Write {
    /// Completes the chunk after its last byte has been written.
    ///
    /// Implementations flush buffered data and release or wait on the
    /// underlying resource; any failure is reported to the caller.
    fn finish(&mut self) -> io::Result<()>;
}
171
172/// Writes chunks to files on disk.
173struct FileChunkWriter {
174    writer: BufWriter<File>,
175}
176
177impl FileChunkWriter {
178    fn create(path: &str) -> io::Result<Self> {
179        let file = File::create(path)?;
180        Ok(Self {
181            writer: BufWriter::with_capacity(1024 * 1024, file), // 1MB output buffer
182        })
183    }
184}
185
186impl Write for FileChunkWriter {
187    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
188        self.writer.write(buf)
189    }
190
191    fn flush(&mut self) -> io::Result<()> {
192        self.writer.flush()
193    }
194}
195
196impl ChunkWriter for FileChunkWriter {
197    fn finish(&mut self) -> io::Result<()> {
198        self.writer.flush()
199    }
200}
201
202/// Writes chunks to a filter command via pipe.
203struct FilterChunkWriter {
204    child: std::process::Child,
205    _stdin_taken: bool,
206}
207
208impl FilterChunkWriter {
209    fn create(filter_cmd: &str, output_path: &str) -> io::Result<Self> {
210        let child = Command::new("sh")
211            .arg("-c")
212            .arg(filter_cmd)
213            .env("FILE", output_path)
214            .stdin(Stdio::piped())
215            .spawn()?;
216        Ok(Self {
217            child,
218            _stdin_taken: false,
219        })
220    }
221}
222
223impl Write for FilterChunkWriter {
224    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
225        if let Some(ref mut stdin) = self.child.stdin {
226            stdin.write(buf)
227        } else {
228            Err(io::Error::new(io::ErrorKind::BrokenPipe, "stdin closed"))
229        }
230    }
231
232    fn flush(&mut self) -> io::Result<()> {
233        if let Some(ref mut stdin) = self.child.stdin {
234            stdin.flush()
235        } else {
236            Ok(())
237        }
238    }
239}
240
241impl ChunkWriter for FilterChunkWriter {
242    fn finish(&mut self) -> io::Result<()> {
243        // Close stdin so the child can finish
244        self.child.stdin.take();
245        let status = self.child.wait()?;
246        if !status.success() {
247            return Err(io::Error::other(format!(
248                "filter command exited with status {}",
249                status
250            )));
251        }
252        Ok(())
253    }
254}
255
256/// Create a chunk writer for the given chunk index.
257fn create_writer(config: &SplitConfig, index: u64) -> io::Result<Box<dyn ChunkWriter>> {
258    let path = output_path(config, index);
259    if config.verbose {
260        eprintln!("creating file '{}'", path);
261    }
262    if let Some(ref filter_cmd) = config.filter {
263        Ok(Box::new(FilterChunkWriter::create(filter_cmd, &path)?))
264    } else {
265        Ok(Box::new(FileChunkWriter::create(&path)?))
266    }
267}
268
269/// Split input by line count.
270/// Uses bulk memchr scanning to count lines within large buffer slices,
271/// writing contiguous multi-line slices instead of copying line-by-line.
272fn split_by_lines(
273    reader: &mut dyn BufRead,
274    config: &SplitConfig,
275    lines_per_chunk: u64,
276) -> io::Result<()> {
277    let limit = max_chunks(&config.suffix_type, config.suffix_length);
278    let mut chunk_index: u64 = 0;
279    let mut lines_in_chunk: u64 = 0;
280    let mut writer: Option<Box<dyn ChunkWriter>> = None;
281    let sep = config.separator;
282
283    loop {
284        let available = match reader.fill_buf() {
285            Ok([]) => break,
286            Ok(b) => b,
287            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
288            Err(e) => return Err(e),
289        };
290
291        let mut pos = 0;
292        let buf_len = available.len();
293
294        while pos < buf_len {
295            if writer.is_none() {
296                if chunk_index >= limit {
297                    return Err(io::Error::other("output file suffixes exhausted"));
298                }
299                writer = Some(create_writer(config, chunk_index)?);
300                lines_in_chunk = 0;
301            }
302
303            // How many lines left before we need a new chunk?
304            let lines_needed = lines_per_chunk - lines_in_chunk;
305            let slice = &available[pos..];
306
307            // Use memchr_iter for bulk SIMD scanning — finds all separator
308            // positions in one pass instead of N individual memchr calls.
309            let mut found = 0u64;
310            let mut last_sep_end = 0;
311
312            for offset in memchr::memchr_iter(sep, slice) {
313                found += 1;
314                last_sep_end = offset + 1;
315                if found >= lines_needed {
316                    break;
317                }
318            }
319
320            if found >= lines_needed {
321                // We found enough lines - write the contiguous slice
322                writer.as_mut().unwrap().write_all(&slice[..last_sep_end])?;
323                pos += last_sep_end;
324                // Close this chunk
325                writer.as_mut().unwrap().finish()?;
326                writer = None;
327                chunk_index += 1;
328            } else {
329                // Not enough lines in this buffer - write everything and get more
330                writer.as_mut().unwrap().write_all(slice)?;
331                lines_in_chunk += found;
332                pos = buf_len;
333            }
334        }
335
336        let consumed = buf_len;
337        reader.consume(consumed);
338    }
339
340    // Handle final partial chunk (data without trailing separator)
341    if let Some(ref mut w) = writer {
342        w.finish()?;
343    }
344
345    Ok(())
346}
347
348/// Split input by byte count.
349fn split_by_bytes(
350    reader: &mut dyn Read,
351    config: &SplitConfig,
352    bytes_per_chunk: u64,
353) -> io::Result<()> {
354    let limit = max_chunks(&config.suffix_type, config.suffix_length);
355    let mut chunk_index: u64 = 0;
356    let mut bytes_in_chunk: u64 = 0;
357    let mut writer: Option<Box<dyn ChunkWriter>> = None;
358
359    let mut read_buf = vec![0u8; 1024 * 1024]; // 1MB read buffer for fewer syscalls
360    loop {
361        let bytes_read = match reader.read(&mut read_buf) {
362            Ok(0) => break,
363            Ok(n) => n,
364            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
365            Err(e) => return Err(e),
366        };
367
368        let mut offset = 0usize;
369        while offset < bytes_read {
370            if writer.is_none() {
371                if chunk_index >= limit {
372                    return Err(io::Error::other("output file suffixes exhausted"));
373                }
374                writer = Some(create_writer(config, chunk_index)?);
375                bytes_in_chunk = 0;
376            }
377
378            let remaining_in_chunk = (bytes_per_chunk - bytes_in_chunk) as usize;
379            let remaining_in_buf = bytes_read - offset;
380            let to_write = remaining_in_chunk.min(remaining_in_buf);
381
382            writer
383                .as_mut()
384                .unwrap()
385                .write_all(&read_buf[offset..offset + to_write])?;
386            bytes_in_chunk += to_write as u64;
387            offset += to_write;
388
389            if bytes_in_chunk >= bytes_per_chunk {
390                writer.as_mut().unwrap().finish()?;
391                writer = None;
392                chunk_index += 1;
393            }
394        }
395    }
396
397    if let Some(ref mut w) = writer {
398        if config.elide_empty && bytes_in_chunk == 0 {
399            w.finish()?;
400            // Remove the empty file
401            let path = output_path(config, chunk_index);
402            let _ = fs::remove_file(&path);
403        } else {
404            w.finish()?;
405        }
406    }
407
408    Ok(())
409}
410
411/// Split input by line-bytes: at most N bytes per file, breaking at line boundaries.
412/// GNU split uses a buffer-based approach: for each chunk-sized window, it finds
413/// the last newline using memrchr and breaks there. When no newline exists within
414/// the window (line longer than max_bytes), it breaks at the byte boundary.
415fn split_by_line_bytes(
416    reader: &mut dyn Read,
417    config: &SplitConfig,
418    max_bytes: u64,
419) -> io::Result<()> {
420    let limit = max_chunks(&config.suffix_type, config.suffix_length);
421    let max = max_bytes as usize;
422    let sep = config.separator;
423
424    // Read all input data for simplicity (matches other modes)
425    let mut data = Vec::new();
426    reader.read_to_end(&mut data)?;
427
428    if data.is_empty() {
429        return Ok(());
430    }
431
432    let total = data.len();
433    let mut chunk_index: u64 = 0;
434    let mut offset = 0;
435
436    while offset < total {
437        if chunk_index >= limit {
438            return Err(io::Error::other("output file suffixes exhausted"));
439        }
440
441        let remaining = total - offset;
442        let window = remaining.min(max);
443        let slice = &data[offset..offset + window];
444
445        // Find the last separator in this window.
446        // GNU split uses memrchr to find the last newline within the window,
447        // breaking there. If no separator exists, write the full window.
448        // When remaining data is strictly smaller than max_bytes, take everything
449        // as the final chunk (matches GNU behavior).
450        let end = if remaining < max {
451            offset + window
452        } else if let Some(pos) = memchr::memrchr(sep, slice) {
453            // Break at the last separator within the window
454            offset + pos + 1
455        } else {
456            // No separator found: write the full window (line > max_bytes)
457            offset + window
458        };
459
460        let chunk_data = &data[offset..end];
461
462        let mut writer = create_writer(config, chunk_index)?;
463        writer.write_all(chunk_data)?;
464        writer.finish()?;
465
466        offset = end;
467        chunk_index += 1;
468    }
469
470    Ok(())
471}
472
473/// Split input into exactly N chunks by byte count.
474/// Reads the whole file to determine size, then distributes bytes evenly.
475fn split_by_number(input_path: &str, config: &SplitConfig, n_chunks: u64) -> io::Result<()> {
476    let limit = max_chunks(&config.suffix_type, config.suffix_length);
477    if n_chunks > limit {
478        return Err(io::Error::other("output file suffixes exhausted"));
479    }
480    if n_chunks == 0 {
481        return Err(io::Error::new(
482            io::ErrorKind::InvalidInput,
483            "invalid number of chunks: 0",
484        ));
485    }
486
487    // Read input data (mmap for regular files, read for stdin)
488    let data: crate::common::io::FileData = if input_path == "-" {
489        let mut buf = Vec::new();
490        io::stdin().lock().read_to_end(&mut buf)?;
491        crate::common::io::FileData::Owned(buf)
492    } else {
493        crate::common::io::read_file(Path::new(input_path))?
494    };
495
496    let total = data.len() as u64;
497    let base_chunk_size = total / n_chunks;
498    let remainder = total % n_chunks;
499
500    let mut offset: u64 = 0;
501    for i in 0..n_chunks {
502        // First `remainder` chunks get one extra byte
503        let chunk_size = base_chunk_size + if i < remainder { 1 } else { 0 };
504
505        if config.elide_empty && chunk_size == 0 {
506            continue;
507        }
508
509        let mut writer = create_writer(config, i)?;
510        if chunk_size > 0 {
511            let start = offset as usize;
512            let end = start + chunk_size as usize;
513            writer.write_all(&data[start..end])?;
514        }
515        writer.finish()?;
516        offset += chunk_size;
517    }
518
519    Ok(())
520}
521
522/// Extract Kth chunk of N from input (K/N format). Output goes to stdout.
523fn split_by_number_extract(input_path: &str, k: u64, n: u64) -> io::Result<()> {
524    let data: crate::common::io::FileData = if input_path == "-" {
525        let mut buf = Vec::new();
526        io::stdin().lock().read_to_end(&mut buf)?;
527        crate::common::io::FileData::Owned(buf)
528    } else {
529        crate::common::io::read_file(Path::new(input_path))?
530    };
531
532    let total = data.len() as u64;
533    let base_chunk_size = total / n;
534    let remainder = total % n;
535
536    let mut offset: u64 = 0;
537    for i in 0..n {
538        let chunk_size = base_chunk_size + if i < remainder { 1 } else { 0 };
539        if i + 1 == k {
540            if chunk_size > 0 {
541                let start = offset as usize;
542                let end = start + chunk_size as usize;
543                let stdout = io::stdout();
544                let mut out = stdout.lock();
545                out.write_all(&data[start..end])?;
546            }
547            return Ok(());
548        }
549        offset += chunk_size;
550    }
551    Ok(())
552}
553
554/// Read all input data into a buffer.
555fn read_input_data(input_path: &str) -> io::Result<Vec<u8>> {
556    if input_path == "-" {
557        let mut buf = Vec::new();
558        io::stdin().lock().read_to_end(&mut buf)?;
559        Ok(buf)
560    } else {
561        let data = crate::common::io::read_file(Path::new(input_path))?;
562        Ok(data.to_vec())
563    }
564}
565
566/// Compute chunk boundary offsets for line-based N-way splitting.
567/// GNU split distributes lines to chunks by reading sequentially:
568/// each line goes to the current chunk until accumulated bytes reach
569/// or exceed the chunk's target end boundary, then the chunk is closed.
570fn compute_line_chunk_boundaries(data: &[u8], n_chunks: u64, sep: u8) -> Vec<u64> {
571    let total = data.len() as u64;
572    let base_chunk_size = total / n_chunks;
573    let remainder = total % n_chunks;
574
575    // Precompute target end boundaries for each chunk
576    let mut boundaries = Vec::with_capacity(n_chunks as usize);
577    let mut target_end: u64 = 0;
578    for i in 0..n_chunks {
579        target_end += base_chunk_size + if i < remainder { 1 } else { 0 };
580        boundaries.push(target_end);
581    }
582
583    // Now read lines and assign to chunks
584    let mut chunk_ends = Vec::with_capacity(n_chunks as usize);
585    let mut pos: u64 = 0;
586    let mut chunk_idx: u64 = 0;
587
588    for sep_pos in memchr::memchr_iter(sep, data) {
589        let line_end = sep_pos as u64 + 1; // inclusive of separator
590        pos = line_end;
591
592        // If we've reached or passed this chunk's target boundary, close it
593        while chunk_idx < n_chunks && pos >= boundaries[chunk_idx as usize] {
594            chunk_ends.push(pos);
595            chunk_idx += 1;
596        }
597    }
598
599    // Handle trailing data without separator
600    if pos < total {
601        pos = total;
602        while chunk_idx < n_chunks && pos >= boundaries[chunk_idx as usize] {
603            chunk_ends.push(pos);
604            chunk_idx += 1;
605        }
606    }
607
608    // Any remaining chunks get the same end position (at end of data or last line)
609    while (chunk_ends.len() as u64) < n_chunks {
610        chunk_ends.push(pos);
611    }
612
613    chunk_ends
614}
615
616/// Split into N output files by line count (l/N format).
617fn split_by_line_chunks(input_path: &str, config: &SplitConfig, n_chunks: u64) -> io::Result<()> {
618    let data = read_input_data(input_path)?;
619    let sep = config.separator;
620
621    let chunk_ends = compute_line_chunk_boundaries(&data, n_chunks, sep);
622
623    let mut offset: u64 = 0;
624    for i in 0..n_chunks {
625        let end = chunk_ends[i as usize];
626        let chunk_size = end - offset;
627
628        if config.elide_empty && chunk_size == 0 {
629            continue;
630        }
631
632        let mut writer = create_writer(config, i)?;
633        if chunk_size > 0 {
634            writer.write_all(&data[offset as usize..end as usize])?;
635        }
636        writer.finish()?;
637        offset = end;
638    }
639    Ok(())
640}
641
642/// Extract Kth line-based chunk of N (l/K/N format). Output goes to stdout.
643fn split_by_line_chunk_extract(
644    input_path: &str,
645    config: &SplitConfig,
646    k: u64,
647    n_chunks: u64,
648) -> io::Result<()> {
649    let data = read_input_data(input_path)?;
650    let sep = config.separator;
651
652    let chunk_ends = compute_line_chunk_boundaries(&data, n_chunks, sep);
653
654    let mut offset: u64 = 0;
655    for i in 0..n_chunks {
656        let end = chunk_ends[i as usize];
657        if i + 1 == k {
658            let chunk_size = end - offset;
659            if chunk_size > 0 {
660                let stdout = io::stdout();
661                let mut out = stdout.lock();
662                out.write_all(&data[offset as usize..end as usize])?;
663            }
664            return Ok(());
665        }
666        offset = end;
667    }
668    Ok(())
669}
670
671/// Round-robin distribute lines across N output files (r/N format).
672fn split_by_round_robin(input_path: &str, config: &SplitConfig, n_chunks: u64) -> io::Result<()> {
673    let data = read_input_data(input_path)?;
674    let sep = config.separator;
675
676    // Collect lines
677    let mut lines: Vec<&[u8]> = Vec::new();
678    let mut start = 0;
679    for pos in memchr::memchr_iter(sep, &data) {
680        lines.push(&data[start..=pos]);
681        start = pos + 1;
682    }
683    if start < data.len() {
684        lines.push(&data[start..]);
685    }
686
687    // Create writers for each chunk
688    let mut writers: Vec<Option<Box<dyn ChunkWriter>>> = (0..n_chunks)
689        .map(|i| {
690            if config.elide_empty && lines.len() as u64 <= i {
691                None
692            } else {
693                Some(create_writer(config, i).unwrap())
694            }
695        })
696        .collect();
697
698    // Distribute lines round-robin
699    for (idx, line) in lines.iter().enumerate() {
700        let chunk_idx = (idx as u64) % n_chunks;
701        if let Some(ref mut writer) = writers[chunk_idx as usize] {
702            writer.write_all(line)?;
703        }
704    }
705
706    // Finish all writers
707    for writer in &mut writers {
708        if let Some(mut w) = writer.take() {
709            w.finish()?;
710        }
711    }
712
713    Ok(())
714}
715
716/// Extract Kth round-robin chunk of N (r/K/N format). Output goes to stdout.
717fn split_by_round_robin_extract(input_path: &str, k: u64, n: u64) -> io::Result<()> {
718    let data = read_input_data(input_path)?;
719    let sep = b'\n';
720
721    let stdout = io::stdout();
722    let mut out = stdout.lock();
723
724    let mut start = 0;
725    let mut line_idx: u64 = 0;
726    for pos in memchr::memchr_iter(sep, &data) {
727        if line_idx % n == k - 1 {
728            out.write_all(&data[start..=pos])?;
729        }
730        start = pos + 1;
731        line_idx += 1;
732    }
733    if start < data.len() && line_idx % n == k - 1 {
734        out.write_all(&data[start..])?;
735    }
736
737    Ok(())
738}
739
740/// Fast pre-loaded line splitting: reads the entire file into a heap buffer and
741/// splits by scanning for separator positions in one pass. Each output chunk is
742/// written with a single write_all() call (no BufWriter needed).
743#[cfg(unix)]
744fn split_lines_preloaded(
745    data: &[u8],
746    config: &SplitConfig,
747    lines_per_chunk: u64,
748) -> io::Result<()> {
749    let limit = max_chunks(&config.suffix_type, config.suffix_length);
750    let sep = config.separator;
751    let mut chunk_index: u64 = 0;
752    let mut chunk_start: usize = 0;
753    let mut lines_in_chunk: u64 = 0;
754
755    for offset in memchr::memchr_iter(sep, data) {
756        lines_in_chunk += 1;
757        if lines_in_chunk >= lines_per_chunk {
758            let chunk_end = offset + 1;
759            if chunk_index >= limit {
760                return Err(io::Error::other("output file suffixes exhausted"));
761            }
762            let path = output_path(config, chunk_index);
763            if config.verbose {
764                eprintln!("creating file '{}'", path);
765            }
766            let mut file = File::create(&path)?;
767            file.write_all(&data[chunk_start..chunk_end])?;
768            chunk_start = chunk_end;
769            chunk_index += 1;
770            lines_in_chunk = 0;
771        }
772    }
773
774    // Write remaining data (partial chunk or data without trailing separator)
775    if chunk_start < data.len() {
776        if chunk_index >= limit {
777            return Err(io::Error::other("output file suffixes exhausted"));
778        }
779        let path = output_path(config, chunk_index);
780        if config.verbose {
781            eprintln!("creating file '{}'", path);
782        }
783        let mut file = File::create(&path)?;
784        file.write_all(&data[chunk_start..])?;
785    }
786
787    Ok(())
788}
789
790/// Main entry point: split a file according to the given configuration.
791/// `input_path` is the path to the input file, or "-" for stdin.
792pub fn split_file(input_path: &str, config: &SplitConfig) -> io::Result<()> {
793    // For number-based splitting, we need to read the whole file to know size.
794    if let SplitMode::Number(n) = config.mode {
795        return split_by_number(input_path, config, n);
796    }
797    if let SplitMode::NumberExtract(k, n) = config.mode {
798        return split_by_number_extract(input_path, k, n);
799    }
800    if let SplitMode::LineChunks(n) = config.mode {
801        return split_by_line_chunks(input_path, config, n);
802    }
803    if let SplitMode::LineChunkExtract(k, n) = config.mode {
804        return split_by_line_chunk_extract(input_path, config, k, n);
805    }
806    if let SplitMode::RoundRobin(n) = config.mode {
807        return split_by_round_robin(input_path, config, n);
808    }
809    if let SplitMode::RoundRobinExtract(k, n) = config.mode {
810        return split_by_round_robin_extract(input_path, k, n);
811    }
812
813    // Fast path: read+memchr line splitting for regular files (no filter).
814    // Intentionally bypasses create_writer for single write_all() per chunk.
815    // Only used for files ≤512 MB to avoid OOM on very large files.
816    // Opens the file once and uses fstat on the fd (not stat on the path) to
817    // avoid an extra syscall and eliminate the TOCTOU race on the size guard.
818    #[cfg(unix)]
819    if let SplitMode::Lines(n) = config.mode {
820        if input_path != "-" && config.filter.is_none() {
821            const FAST_PATH_LIMIT: u64 = 512 * 1024 * 1024;
822            if let Ok(file) = File::open(input_path) {
823                if let Ok(meta) = file.metadata() {
824                    if meta.file_type().is_file() && meta.len() <= FAST_PATH_LIMIT {
825                        let len = meta.len() as usize;
826                        let data = if len > 0 {
827                            let mut buf = vec![0u8; len];
828                            let mut total = 0;
829                            let mut f = &file;
830                            while total < buf.len() {
831                                match f.read(&mut buf[total..]) {
832                                    Ok(0) => break,
833                                    Ok(n) => total += n,
834                                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {
835                                        continue;
836                                    }
837                                    Err(e) => return Err(e),
838                                }
839                            }
840                            buf.truncate(total);
841                            buf
842                        } else {
843                            Vec::new()
844                        };
845                        return split_lines_preloaded(&data, config, n);
846                    }
847                }
848            }
849        }
850    }
851
852    // Open input
853    let reader: Box<dyn Read> = if input_path == "-" {
854        Box::new(io::stdin().lock())
855    } else {
856        let path = Path::new(input_path);
857        if !path.exists() {
858            return Err(io::Error::new(
859                io::ErrorKind::NotFound,
860                format!(
861                    "cannot open '{}' for reading: No such file or directory",
862                    input_path
863                ),
864            ));
865        }
866        let file = File::open(path)?;
867        // Hint kernel to readahead sequentially for better I/O throughput
868        #[cfg(target_os = "linux")]
869        {
870            use std::os::unix::io::AsRawFd;
871            unsafe {
872                libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
873            }
874        }
875        Box::new(file)
876    };
877
878    match config.mode {
879        SplitMode::Lines(n) => {
880            let mut buf_reader = BufReader::with_capacity(1024 * 1024, reader);
881            split_by_lines(&mut buf_reader, config, n)
882        }
883        SplitMode::Bytes(n) => {
884            let mut reader = reader;
885            split_by_bytes(&mut reader, config, n)
886        }
887        SplitMode::LineBytes(n) => {
888            let mut reader = reader;
889            split_by_line_bytes(&mut reader, config, n)
890        }
891        SplitMode::Number(_)
892        | SplitMode::NumberExtract(_, _)
893        | SplitMode::LineChunks(_)
894        | SplitMode::LineChunkExtract(_, _)
895        | SplitMode::RoundRobin(_)
896        | SplitMode::RoundRobinExtract(_, _) => unreachable!(),
897    }
898}
899
900/// Get the list of output file paths that would be generated for given config and chunk count.
901pub fn output_paths(config: &SplitConfig, count: u64) -> Vec<PathBuf> {
902    (0..count)
903        .map(|i| PathBuf::from(output_path(config, i)))
904        .collect()
905}