Skip to main content

coreutils_rs/split/
core.rs

1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
3use std::path::{Path, PathBuf};
4use std::process::{Command, Stdio};
5
/// How the per-chunk suffix appended to the output prefix is rendered.
///
/// The payload of the numeric and hexadecimal variants is the value added to
/// the chunk index before formatting (see `generate_suffix`).
#[derive(Debug, Clone, PartialEq)]
pub enum SuffixType {
    /// Lowercase letters: aa, ab, ..., zz, aaa, ...
    Alphabetic,
    /// Zero-padded decimal digits: 00, 01, ..., 99, 000, ...
    Numeric(u64),
    /// Zero-padded lowercase hexadecimal digits: 00, 01, ..., ff, 000, ...
    Hex(u64),
}
16
/// Strategy used to divide the input into output chunks.
#[derive(Debug, Clone)]
pub enum SplitMode {
    /// Start a new output after every N lines (the default configuration uses 1000).
    Lines(u64),
    /// Start a new output after every N bytes.
    Bytes(u64),
    /// Break at line boundaries while aiming for at most N bytes per output.
    LineBytes(u64),
    /// Produce exactly N outputs, dividing the input by byte count.
    Number(u64),
}
29
30/// Configuration for the split command.
31#[derive(Clone, Debug)]
32pub struct SplitConfig {
33    pub mode: SplitMode,
34    pub suffix_type: SuffixType,
35    pub suffix_length: usize,
36    pub additional_suffix: String,
37    pub prefix: String,
38    pub elide_empty: bool,
39    pub verbose: bool,
40    pub filter: Option<String>,
41    pub separator: u8,
42}
43
44impl Default for SplitConfig {
45    fn default() -> Self {
46        Self {
47            mode: SplitMode::Lines(1000),
48            suffix_type: SuffixType::Alphabetic,
49            suffix_length: 2,
50            additional_suffix: String::new(),
51            prefix: "x".to_string(),
52            elide_empty: false,
53            verbose: false,
54            filter: None,
55            separator: b'\n',
56        }
57    }
58}
59
/// Parse a SIZE string: a decimal integer followed by an optional unit suffix.
///
/// Surrounding whitespace is ignored. Recognized suffixes:
/// - none: bytes; `b`: 512-byte blocks
/// - `K`, `M`, `G`, `T`, `P`, `E` (and `KiB`, `MiB`, ...): powers of 1024
/// - `KB` (also spelled `kB`), `MB`, `GB`, `TB`, `PB`, `EB`: powers of 1000
/// - `Z`/`Y` families: larger than `u64` can hold, so any non-zero count
///   saturates to `u64::MAX` and zero stays zero
///
/// # Errors
///
/// Returns a message when the input is empty, has no leading number, the
/// number does not parse as `u64` (this includes negative values), the suffix
/// is unknown, or the multiplied value overflows `u64`.
pub fn parse_size(s: &str) -> Result<u64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty size".to_string());
    }

    // The numeric part is an optional leading sign followed by ASCII digits.
    // A '-' is accepted here only so that the error below names the number.
    let sign_len = usize::from(s.starts_with('+') || s.starts_with('-'));
    let digit_len = s[sign_len..]
        .bytes()
        .take_while(u8::is_ascii_digit)
        .count();
    let num_end = sign_len + digit_len;

    if num_end == 0 {
        return Err(format!("invalid number: '{}'", s));
    }

    let (num_str, suffix) = s.split_at(num_end);

    let num: u64 = num_str
        .parse()
        .map_err(|_| format!("invalid number: '{}'", num_str))?;

    let multiplier: u64 = match suffix {
        "" => 1,
        "b" => 512,
        // "KB" is the documented spelling; "kB" is kept for compatibility.
        "KB" | "kB" => 1000,
        "K" | "KiB" => 1024,
        "MB" => 1_000_000,
        "M" | "MiB" => 1_048_576,
        "GB" => 1_000_000_000,
        "G" | "GiB" => 1_073_741_824,
        "TB" => 1_000_000_000_000,
        "T" | "TiB" => 1_099_511_627_776,
        "PB" => 1_000_000_000_000_000,
        "P" | "PiB" => 1_125_899_906_842_624,
        "EB" => 1_000_000_000_000_000_000,
        "E" | "EiB" => 1_152_921_504_606_846_976,
        // Zetta/yotta multipliers do not fit in u64: saturate instead.
        "ZB" | "Z" | "ZiB" | "YB" | "Y" | "YiB" => {
            return Ok(if num > 0 { u64::MAX } else { 0 });
        }
        _ => return Err(format!("invalid suffix in '{}'", s)),
    };

    num.checked_mul(multiplier)
        .ok_or_else(|| format!("number too large: '{}'", s))
}
118
119/// Generate the suffix string for a given chunk index.
120pub fn generate_suffix(index: u64, suffix_type: &SuffixType, suffix_length: usize) -> String {
121    match suffix_type {
122        SuffixType::Alphabetic => {
123            let mut result = Vec::with_capacity(suffix_length);
124            let mut remaining = index;
125            for _ in 0..suffix_length {
126                result.push(b'a' + (remaining % 26) as u8);
127                remaining /= 26;
128            }
129            result.reverse();
130            String::from_utf8(result).unwrap()
131        }
132        SuffixType::Numeric(start) => {
133            let val = start + index;
134            format!("{:0>width$}", val, width = suffix_length)
135        }
136        SuffixType::Hex(start) => {
137            let val = start + index;
138            format!("{:0>width$x}", val, width = suffix_length)
139        }
140    }
141}
142
143/// Compute the maximum number of chunks supported for a given suffix configuration.
144pub fn max_chunks(suffix_type: &SuffixType, suffix_length: usize) -> u64 {
145    match suffix_type {
146        SuffixType::Alphabetic => 26u64.saturating_pow(suffix_length as u32),
147        SuffixType::Numeric(_) | SuffixType::Hex(_) => 10u64.saturating_pow(suffix_length as u32),
148    }
149}
150
151/// Build the output file path for a given chunk index.
152fn output_path(config: &SplitConfig, index: u64) -> String {
153    let suffix = generate_suffix(index, &config.suffix_type, config.suffix_length);
154    format!("{}{}{}", config.prefix, suffix, config.additional_suffix)
155}
156
/// A destination for one output chunk: either a file on disk or the stdin of
/// a filter command.
trait ChunkWriter: Write {
    /// Complete the chunk once all of its bytes have been written, reporting
    /// any error that only surfaces at that point.
    fn finish(&mut self) -> io::Result<()>;
}
161
/// Chunk sink backed by a regular file, written through a buffer.
struct FileChunkWriter {
    /// Buffered handle to the chunk's output file.
    writer: BufWriter<File>,
}
166
167impl FileChunkWriter {
168    fn create(path: &str) -> io::Result<Self> {
169        let file = File::create(path)?;
170        Ok(Self {
171            writer: BufWriter::with_capacity(1024 * 1024, file), // 1MB output buffer
172        })
173    }
174}
175
176impl Write for FileChunkWriter {
177    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
178        self.writer.write(buf)
179    }
180
181    fn flush(&mut self) -> io::Result<()> {
182        self.writer.flush()
183    }
184}
185
186impl ChunkWriter for FileChunkWriter {
187    fn finish(&mut self) -> io::Result<()> {
188        self.writer.flush()
189    }
190}
191
/// Chunk sink that pipes the data into the stdin of a spawned filter command.
struct FilterChunkWriter {
    /// The running filter process; its `stdin` handle is the write target.
    child: std::process::Child,
    /// Initialized to `false` on creation; not read by the code in this file.
    _stdin_taken: bool,
}
197
198impl FilterChunkWriter {
199    fn create(filter_cmd: &str, output_path: &str) -> io::Result<Self> {
200        let child = Command::new("sh")
201            .arg("-c")
202            .arg(filter_cmd)
203            .env("FILE", output_path)
204            .stdin(Stdio::piped())
205            .spawn()?;
206        Ok(Self {
207            child,
208            _stdin_taken: false,
209        })
210    }
211}
212
213impl Write for FilterChunkWriter {
214    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
215        if let Some(ref mut stdin) = self.child.stdin {
216            stdin.write(buf)
217        } else {
218            Err(io::Error::new(io::ErrorKind::BrokenPipe, "stdin closed"))
219        }
220    }
221
222    fn flush(&mut self) -> io::Result<()> {
223        if let Some(ref mut stdin) = self.child.stdin {
224            stdin.flush()
225        } else {
226            Ok(())
227        }
228    }
229}
230
231impl ChunkWriter for FilterChunkWriter {
232    fn finish(&mut self) -> io::Result<()> {
233        // Close stdin so the child can finish
234        self.child.stdin.take();
235        let status = self.child.wait()?;
236        if !status.success() {
237            return Err(io::Error::other(format!(
238                "filter command exited with status {}",
239                status
240            )));
241        }
242        Ok(())
243    }
244}
245
246/// Create a chunk writer for the given chunk index.
247fn create_writer(config: &SplitConfig, index: u64) -> io::Result<Box<dyn ChunkWriter>> {
248    let path = output_path(config, index);
249    if config.verbose {
250        eprintln!("creating file '{}'", path);
251    }
252    if let Some(ref filter_cmd) = config.filter {
253        Ok(Box::new(FilterChunkWriter::create(filter_cmd, &path)?))
254    } else {
255        Ok(Box::new(FileChunkWriter::create(&path)?))
256    }
257}
258
259/// Split input by line count.
260/// Uses bulk memchr scanning to count lines within large buffer slices,
261/// writing contiguous multi-line slices instead of copying line-by-line.
262fn split_by_lines(
263    reader: &mut dyn BufRead,
264    config: &SplitConfig,
265    lines_per_chunk: u64,
266) -> io::Result<()> {
267    let limit = max_chunks(&config.suffix_type, config.suffix_length);
268    let mut chunk_index: u64 = 0;
269    let mut lines_in_chunk: u64 = 0;
270    let mut writer: Option<Box<dyn ChunkWriter>> = None;
271    let sep = config.separator;
272
273    loop {
274        let available = match reader.fill_buf() {
275            Ok([]) => break,
276            Ok(b) => b,
277            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
278            Err(e) => return Err(e),
279        };
280
281        let mut pos = 0;
282        let buf_len = available.len();
283
284        while pos < buf_len {
285            if writer.is_none() {
286                if chunk_index >= limit {
287                    return Err(io::Error::other("output file suffixes exhausted"));
288                }
289                writer = Some(create_writer(config, chunk_index)?);
290                lines_in_chunk = 0;
291            }
292
293            // How many lines left before we need a new chunk?
294            let lines_needed = lines_per_chunk - lines_in_chunk;
295            let slice = &available[pos..];
296
297            // Use memchr_iter for bulk SIMD scanning — finds all separator
298            // positions in one pass instead of N individual memchr calls.
299            let mut found = 0u64;
300            let mut last_sep_end = 0;
301
302            for offset in memchr::memchr_iter(sep, slice) {
303                found += 1;
304                last_sep_end = offset + 1;
305                if found >= lines_needed {
306                    break;
307                }
308            }
309
310            if found >= lines_needed {
311                // We found enough lines - write the contiguous slice
312                writer.as_mut().unwrap().write_all(&slice[..last_sep_end])?;
313                pos += last_sep_end;
314                // Close this chunk
315                writer.as_mut().unwrap().finish()?;
316                writer = None;
317                chunk_index += 1;
318            } else {
319                // Not enough lines in this buffer - write everything and get more
320                writer.as_mut().unwrap().write_all(slice)?;
321                lines_in_chunk += found;
322                pos = buf_len;
323            }
324        }
325
326        let consumed = buf_len;
327        reader.consume(consumed);
328    }
329
330    // Handle final partial chunk (data without trailing separator)
331    if let Some(ref mut w) = writer {
332        w.finish()?;
333    }
334
335    Ok(())
336}
337
/// Read bytes from `reader` up to and including the next `sep` byte (or up to
/// end of input if no separator remains) and append them to `buf`.
///
/// Returns the number of bytes appended; 0 means end of input.
///
/// `BufRead::read_until` accepts any delimiter byte and already retries on
/// `ErrorKind::Interrupted`, so no separate code path is needed for
/// separators other than newline.
///
/// # Errors
///
/// Propagates any I/O error from the underlying reader.
fn read_until_sep(reader: &mut dyn BufRead, sep: u8, buf: &mut Vec<u8>) -> io::Result<usize> {
    reader.read_until(sep, buf)
}
368
369/// Split input by byte count.
370fn split_by_bytes(
371    reader: &mut dyn Read,
372    config: &SplitConfig,
373    bytes_per_chunk: u64,
374) -> io::Result<()> {
375    let limit = max_chunks(&config.suffix_type, config.suffix_length);
376    let mut chunk_index: u64 = 0;
377    let mut bytes_in_chunk: u64 = 0;
378    let mut writer: Option<Box<dyn ChunkWriter>> = None;
379
380    let mut read_buf = vec![0u8; 1024 * 1024]; // 1MB read buffer for fewer syscalls
381    loop {
382        let bytes_read = match reader.read(&mut read_buf) {
383            Ok(0) => break,
384            Ok(n) => n,
385            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
386            Err(e) => return Err(e),
387        };
388
389        let mut offset = 0usize;
390        while offset < bytes_read {
391            if writer.is_none() {
392                if chunk_index >= limit {
393                    return Err(io::Error::other("output file suffixes exhausted"));
394                }
395                writer = Some(create_writer(config, chunk_index)?);
396                bytes_in_chunk = 0;
397            }
398
399            let remaining_in_chunk = (bytes_per_chunk - bytes_in_chunk) as usize;
400            let remaining_in_buf = bytes_read - offset;
401            let to_write = remaining_in_chunk.min(remaining_in_buf);
402
403            writer
404                .as_mut()
405                .unwrap()
406                .write_all(&read_buf[offset..offset + to_write])?;
407            bytes_in_chunk += to_write as u64;
408            offset += to_write;
409
410            if bytes_in_chunk >= bytes_per_chunk {
411                writer.as_mut().unwrap().finish()?;
412                writer = None;
413                chunk_index += 1;
414            }
415        }
416    }
417
418    if let Some(ref mut w) = writer {
419        if config.elide_empty && bytes_in_chunk == 0 {
420            w.finish()?;
421            // Remove the empty file
422            let path = output_path(config, chunk_index);
423            let _ = fs::remove_file(&path);
424        } else {
425            w.finish()?;
426        }
427    }
428
429    Ok(())
430}
431
432/// Split input by line-bytes: at most N bytes per file, breaking at line boundaries.
433fn split_by_line_bytes(
434    reader: &mut dyn BufRead,
435    config: &SplitConfig,
436    max_bytes: u64,
437) -> io::Result<()> {
438    let limit = max_chunks(&config.suffix_type, config.suffix_length);
439    let mut chunk_index: u64 = 0;
440    let mut bytes_in_chunk: u64 = 0;
441    let mut writer: Option<Box<dyn ChunkWriter>> = None;
442    let sep = config.separator;
443
444    let mut buf = Vec::with_capacity(8192);
445    loop {
446        buf.clear();
447        let bytes_read = read_until_sep(reader, sep, &mut buf)?;
448        if bytes_read == 0 {
449            break;
450        }
451
452        let line_len = buf.len() as u64;
453
454        // If this line alone exceeds the max, we must write it (possibly to its own chunk).
455        // If adding this line would exceed the max and we've already written something,
456        // start a new chunk.
457        if bytes_in_chunk > 0 && bytes_in_chunk + line_len > max_bytes {
458            if let Some(ref mut w) = writer {
459                w.finish()?;
460            }
461            writer = None;
462            chunk_index += 1;
463            bytes_in_chunk = 0;
464        }
465
466        if writer.is_none() {
467            if chunk_index >= limit {
468                return Err(io::Error::other("output file suffixes exhausted"));
469            }
470            writer = Some(create_writer(config, chunk_index)?);
471            bytes_in_chunk = 0;
472        }
473
474        // If the line itself is longer than max_bytes, we still write the whole line
475        // to this chunk (GNU split behavior: -C never splits a line).
476        writer.as_mut().unwrap().write_all(&buf)?;
477        bytes_in_chunk += line_len;
478
479        if bytes_in_chunk >= max_bytes {
480            if let Some(ref mut w) = writer {
481                w.finish()?;
482            }
483            writer = None;
484            chunk_index += 1;
485            bytes_in_chunk = 0;
486        }
487    }
488
489    if let Some(ref mut w) = writer {
490        w.finish()?;
491    }
492
493    Ok(())
494}
495
496/// Split input into exactly N chunks by byte count.
497/// Reads the whole file to determine size, then distributes bytes evenly.
498fn split_by_number(input_path: &str, config: &SplitConfig, n_chunks: u64) -> io::Result<()> {
499    let limit = max_chunks(&config.suffix_type, config.suffix_length);
500    if n_chunks > limit {
501        return Err(io::Error::other("output file suffixes exhausted"));
502    }
503    if n_chunks == 0 {
504        return Err(io::Error::new(
505            io::ErrorKind::InvalidInput,
506            "invalid number of chunks: 0",
507        ));
508    }
509
510    // Read input data (mmap for regular files, read for stdin)
511    let data: crate::common::io::FileData = if input_path == "-" {
512        let mut buf = Vec::new();
513        io::stdin().lock().read_to_end(&mut buf)?;
514        crate::common::io::FileData::Owned(buf)
515    } else {
516        crate::common::io::read_file(Path::new(input_path))?
517    };
518
519    let total = data.len() as u64;
520    let base_chunk_size = total / n_chunks;
521    let remainder = total % n_chunks;
522
523    let mut offset: u64 = 0;
524    for i in 0..n_chunks {
525        // First `remainder` chunks get one extra byte
526        let chunk_size = base_chunk_size + if i < remainder { 1 } else { 0 };
527
528        if config.elide_empty && chunk_size == 0 {
529            continue;
530        }
531
532        let mut writer = create_writer(config, i)?;
533        if chunk_size > 0 {
534            let start = offset as usize;
535            let end = start + chunk_size as usize;
536            writer.write_all(&data[start..end])?;
537        }
538        writer.finish()?;
539        offset += chunk_size;
540    }
541
542    Ok(())
543}
544
545/// Fast pre-loaded line splitting: reads the entire file into a heap buffer and
546/// splits by scanning for separator positions in one pass. Each output chunk is
547/// written with a single write_all() call (no BufWriter needed).
548#[cfg(unix)]
549fn split_lines_preloaded(
550    data: &[u8],
551    config: &SplitConfig,
552    lines_per_chunk: u64,
553) -> io::Result<()> {
554    let limit = max_chunks(&config.suffix_type, config.suffix_length);
555    let sep = config.separator;
556    let mut chunk_index: u64 = 0;
557    let mut chunk_start: usize = 0;
558    let mut lines_in_chunk: u64 = 0;
559
560    for offset in memchr::memchr_iter(sep, data) {
561        lines_in_chunk += 1;
562        if lines_in_chunk >= lines_per_chunk {
563            let chunk_end = offset + 1;
564            if chunk_index >= limit {
565                return Err(io::Error::other("output file suffixes exhausted"));
566            }
567            let path = output_path(config, chunk_index);
568            if config.verbose {
569                eprintln!("creating file '{}'", path);
570            }
571            let mut file = File::create(&path)?;
572            file.write_all(&data[chunk_start..chunk_end])?;
573            chunk_start = chunk_end;
574            chunk_index += 1;
575            lines_in_chunk = 0;
576        }
577    }
578
579    // Write remaining data (partial chunk or data without trailing separator)
580    if chunk_start < data.len() {
581        if chunk_index >= limit {
582            return Err(io::Error::other("output file suffixes exhausted"));
583        }
584        let path = output_path(config, chunk_index);
585        if config.verbose {
586            eprintln!("creating file '{}'", path);
587        }
588        let mut file = File::create(&path)?;
589        file.write_all(&data[chunk_start..])?;
590    }
591
592    Ok(())
593}
594
595/// Main entry point: split a file according to the given configuration.
596/// `input_path` is the path to the input file, or "-" for stdin.
597pub fn split_file(input_path: &str, config: &SplitConfig) -> io::Result<()> {
598    // For number-based splitting, we need to read the whole file to know size.
599    if let SplitMode::Number(n) = config.mode {
600        return split_by_number(input_path, config, n);
601    }
602
603    // Fast path: read+memchr line splitting for regular files (no filter).
604    // Intentionally bypasses create_writer for single write_all() per chunk.
605    // Only used for files ≤512 MB to avoid OOM on very large files.
606    // Opens the file once and uses fstat on the fd (not stat on the path) to
607    // avoid an extra syscall and eliminate the TOCTOU race on the size guard.
608    #[cfg(unix)]
609    if let SplitMode::Lines(n) = config.mode {
610        if input_path != "-" && config.filter.is_none() {
611            const FAST_PATH_LIMIT: u64 = 512 * 1024 * 1024;
612            if let Ok(file) = File::open(input_path) {
613                if let Ok(meta) = file.metadata() {
614                    if meta.file_type().is_file() && meta.len() <= FAST_PATH_LIMIT {
615                        let len = meta.len() as usize;
616                        let data = if len > 0 {
617                            let mut buf = vec![0u8; len];
618                            let mut total = 0;
619                            let mut f = &file;
620                            while total < buf.len() {
621                                match f.read(&mut buf[total..]) {
622                                    Ok(0) => break,
623                                    Ok(n) => total += n,
624                                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {
625                                        continue;
626                                    }
627                                    Err(e) => return Err(e),
628                                }
629                            }
630                            buf.truncate(total);
631                            buf
632                        } else {
633                            Vec::new()
634                        };
635                        return split_lines_preloaded(&data, config, n);
636                    }
637                }
638            }
639        }
640    }
641
642    // Open input
643    let reader: Box<dyn Read> = if input_path == "-" {
644        Box::new(io::stdin().lock())
645    } else {
646        let path = Path::new(input_path);
647        if !path.exists() {
648            return Err(io::Error::new(
649                io::ErrorKind::NotFound,
650                format!(
651                    "cannot open '{}' for reading: No such file or directory",
652                    input_path
653                ),
654            ));
655        }
656        let file = File::open(path)?;
657        // Hint kernel to readahead sequentially for better I/O throughput
658        #[cfg(target_os = "linux")]
659        {
660            use std::os::unix::io::AsRawFd;
661            unsafe {
662                libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
663            }
664        }
665        Box::new(file)
666    };
667
668    match config.mode {
669        SplitMode::Lines(n) => {
670            let mut buf_reader = BufReader::with_capacity(1024 * 1024, reader);
671            split_by_lines(&mut buf_reader, config, n)
672        }
673        SplitMode::Bytes(n) => {
674            let mut reader = reader;
675            split_by_bytes(&mut reader, config, n)
676        }
677        SplitMode::LineBytes(n) => {
678            let mut buf_reader = BufReader::with_capacity(1024 * 1024, reader);
679            split_by_line_bytes(&mut buf_reader, config, n)
680        }
681        SplitMode::Number(_) => unreachable!(),
682    }
683}
684
685/// Get the list of output file paths that would be generated for given config and chunk count.
686pub fn output_paths(config: &SplitConfig, count: u64) -> Vec<PathBuf> {
687    (0..count)
688        .map(|i| PathBuf::from(output_path(config, i)))
689        .collect()
690}