Skip to main content

coreutils_rs/split/
core.rs

1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
3use std::path::{Path, PathBuf};
4use std::process::{Command, Stdio};
5
/// How the per-chunk suffix of an output filename is rendered.
#[derive(Debug, Clone, PartialEq)]
pub enum SuffixType {
    /// Lowercase ASCII letters, e.g. `aa`, `ab`, ..., `zz` for a width of two.
    Alphabetic,
    /// Zero-padded decimal digits; the payload is the value given to the first chunk.
    Numeric(u64),
    /// Zero-padded lowercase hexadecimal digits; the payload is the value given to the
    /// first chunk.
    Hex(u64),
}
16
/// Strategy that decides where one output chunk ends and the next one starts.
#[derive(Debug, Clone)]
pub enum SplitMode {
    /// Start a new chunk after every N separator-terminated records (the default
    /// configuration uses 1000).
    Lines(u64),
    /// Start a new chunk after every N bytes.
    Bytes(u64),
    /// Pack whole records into chunks of at most N bytes, breaking only at record
    /// boundaries.
    LineBytes(u64),
    /// Produce exactly N chunks, dividing the input by byte count.
    Number(u64),
}
29
30/// Configuration for the split command.
31#[derive(Clone, Debug)]
32pub struct SplitConfig {
33    pub mode: SplitMode,
34    pub suffix_type: SuffixType,
35    pub suffix_length: usize,
36    pub additional_suffix: String,
37    pub prefix: String,
38    pub elide_empty: bool,
39    pub verbose: bool,
40    pub filter: Option<String>,
41    pub separator: u8,
42}
43
44impl Default for SplitConfig {
45    fn default() -> Self {
46        Self {
47            mode: SplitMode::Lines(1000),
48            suffix_type: SuffixType::Alphabetic,
49            suffix_length: 2,
50            additional_suffix: String::new(),
51            prefix: "x".to_string(),
52            elide_empty: false,
53            verbose: false,
54            filter: None,
55            separator: b'\n',
56        }
57    }
58}
59
/// Parse a SIZE argument: a decimal integer followed by an optional unit suffix.
///
/// Surrounding whitespace is ignored. Recognized suffixes:
///
/// * none — the number is taken as-is
/// * `b` — multiples of 512
/// * `K`, `k`, `KiB` — 1024; `M`/`MiB`, `G`/`GiB`, `T`/`TiB`, `P`/`PiB`, `E`/`EiB` —
///   successive powers of 1024
/// * `KB`, `kB` — 1000; `MB`, `GB`, `TB`, `PB`, `EB` — successive powers of 1000
/// * `Z`, `ZB`, `ZiB`, `Y`, `YB`, `YiB` — too large for `u64`; any non-zero count
///   yields `u64::MAX` and a zero count yields 0
///
/// # Errors
///
/// Returns a message when the input is empty, the numeric part is missing or does not
/// parse as `u64` (a leading `-` is scanned as part of the number and then rejected),
/// the suffix is unknown, or the product overflows `u64`.
pub fn parse_size(s: &str) -> Result<u64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty size".to_string());
    }

    // The numeric part is an optional leading sign followed by a run of ASCII digits.
    // Both are single-byte characters, so byte offsets are valid slice boundaries.
    let sign_len = usize::from(s.starts_with(['+', '-']));
    let digit_len = s[sign_len..]
        .bytes()
        .take_while(u8::is_ascii_digit)
        .count();
    let num_end = sign_len + digit_len;

    if num_end == 0 {
        return Err(format!("invalid number: '{}'", s));
    }

    let (num_str, suffix) = s.split_at(num_end);

    let num: u64 = num_str
        .parse()
        .map_err(|_| format!("invalid number: '{}'", num_str))?;

    let multiplier: u64 = match suffix {
        "" => 1,
        "b" => 512,
        // Both spellings of the decimal kilobyte are accepted; the upper-case form is
        // the one this function's contract has always advertised.
        "KB" | "kB" => 1000,
        "K" | "k" | "KiB" => 1024,
        "MB" => 1_000_000,
        "M" | "MiB" => 1_048_576,
        "GB" => 1_000_000_000,
        "G" | "GiB" => 1_073_741_824,
        "TB" => 1_000_000_000_000,
        "T" | "TiB" => 1_099_511_627_776,
        "PB" => 1_000_000_000_000_000,
        "P" | "PiB" => 1_125_899_906_842_624,
        "EB" => 1_000_000_000_000_000_000,
        "E" | "EiB" => 1_152_921_504_606_846_976,
        // Zetta/yotta multipliers do not fit in u64, so saturate instead of failing.
        "ZB" | "Z" | "ZiB" | "YB" | "Y" | "YiB" => {
            return Ok(if num > 0 { u64::MAX } else { 0 });
        }
        _ => return Err(format!("invalid suffix in '{}'", s)),
    };

    num.checked_mul(multiplier)
        .ok_or_else(|| format!("number too large: '{}'", s))
}
118
119/// Generate the suffix string for a given chunk index.
120pub fn generate_suffix(index: u64, suffix_type: &SuffixType, suffix_length: usize) -> String {
121    match suffix_type {
122        SuffixType::Alphabetic => {
123            let mut result = Vec::with_capacity(suffix_length);
124            let mut remaining = index;
125            for _ in 0..suffix_length {
126                result.push(b'a' + (remaining % 26) as u8);
127                remaining /= 26;
128            }
129            result.reverse();
130            String::from_utf8(result).unwrap()
131        }
132        SuffixType::Numeric(start) => {
133            let val = start + index;
134            format!("{:0>width$}", val, width = suffix_length)
135        }
136        SuffixType::Hex(start) => {
137            let val = start + index;
138            format!("{:0>width$x}", val, width = suffix_length)
139        }
140    }
141}
142
143/// Compute the maximum number of chunks supported for a given suffix configuration.
144pub fn max_chunks(suffix_type: &SuffixType, suffix_length: usize) -> u64 {
145    match suffix_type {
146        SuffixType::Alphabetic => 26u64.saturating_pow(suffix_length as u32),
147        SuffixType::Numeric(_) | SuffixType::Hex(_) => 10u64.saturating_pow(suffix_length as u32),
148    }
149}
150
151/// Build the output file path for a given chunk index.
152fn output_path(config: &SplitConfig, index: u64) -> String {
153    let suffix = generate_suffix(index, &config.suffix_type, config.suffix_length);
154    format!("{}{}{}", config.prefix, suffix, config.additional_suffix)
155}
156
/// A destination for the bytes of one chunk.
///
/// Implemented in this file by a plain-file sink and by a sink that pipes the chunk
/// into a filter command.
trait ChunkWriter: Write {
    /// Complete the chunk once all of its bytes have been written, reporting any
    /// error that occurs while doing so.
    fn finish(&mut self) -> io::Result<()>;
}
161
162/// Writes chunks to files on disk.
163struct FileChunkWriter {
164    writer: BufWriter<File>,
165}
166
167impl FileChunkWriter {
168    fn create(path: &str) -> io::Result<Self> {
169        let file = File::create(path)?;
170        Ok(Self {
171            writer: BufWriter::with_capacity(1024 * 1024, file), // 1MB output buffer
172        })
173    }
174}
175
176impl Write for FileChunkWriter {
177    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
178        self.writer.write(buf)
179    }
180
181    fn flush(&mut self) -> io::Result<()> {
182        self.writer.flush()
183    }
184}
185
186impl ChunkWriter for FileChunkWriter {
187    fn finish(&mut self) -> io::Result<()> {
188        self.writer.flush()
189    }
190}
191
192/// Writes chunks to a filter command via pipe.
193struct FilterChunkWriter {
194    child: std::process::Child,
195    _stdin_taken: bool,
196}
197
198impl FilterChunkWriter {
199    fn create(filter_cmd: &str, output_path: &str) -> io::Result<Self> {
200        let child = Command::new("sh")
201            .arg("-c")
202            .arg(filter_cmd)
203            .env("FILE", output_path)
204            .stdin(Stdio::piped())
205            .spawn()?;
206        Ok(Self {
207            child,
208            _stdin_taken: false,
209        })
210    }
211}
212
213impl Write for FilterChunkWriter {
214    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
215        if let Some(ref mut stdin) = self.child.stdin {
216            stdin.write(buf)
217        } else {
218            Err(io::Error::new(io::ErrorKind::BrokenPipe, "stdin closed"))
219        }
220    }
221
222    fn flush(&mut self) -> io::Result<()> {
223        if let Some(ref mut stdin) = self.child.stdin {
224            stdin.flush()
225        } else {
226            Ok(())
227        }
228    }
229}
230
231impl ChunkWriter for FilterChunkWriter {
232    fn finish(&mut self) -> io::Result<()> {
233        // Close stdin so the child can finish
234        self.child.stdin.take();
235        let status = self.child.wait()?;
236        if !status.success() {
237            return Err(io::Error::other(format!(
238                "filter command exited with status {}",
239                status
240            )));
241        }
242        Ok(())
243    }
244}
245
246/// Create a chunk writer for the given chunk index.
247fn create_writer(config: &SplitConfig, index: u64) -> io::Result<Box<dyn ChunkWriter>> {
248    let path = output_path(config, index);
249    if config.verbose {
250        eprintln!("creating file '{}'", path);
251    }
252    if let Some(ref filter_cmd) = config.filter {
253        Ok(Box::new(FilterChunkWriter::create(filter_cmd, &path)?))
254    } else {
255        Ok(Box::new(FileChunkWriter::create(&path)?))
256    }
257}
258
259/// Split input by line count.
260fn split_by_lines(
261    reader: &mut dyn BufRead,
262    config: &SplitConfig,
263    lines_per_chunk: u64,
264) -> io::Result<()> {
265    let limit = max_chunks(&config.suffix_type, config.suffix_length);
266    let mut chunk_index: u64 = 0;
267    let mut line_count: u64 = 0;
268    let mut writer: Option<Box<dyn ChunkWriter>> = None;
269    let sep = config.separator;
270
271    let mut buf = Vec::with_capacity(8192);
272    loop {
273        buf.clear();
274        let bytes_read = read_until_sep(reader, sep, &mut buf)?;
275        if bytes_read == 0 {
276            break;
277        }
278
279        if writer.is_none() {
280            if chunk_index >= limit {
281                return Err(io::Error::other("output file suffixes exhausted"));
282            }
283            writer = Some(create_writer(config, chunk_index)?);
284        }
285
286        writer.as_mut().unwrap().write_all(&buf)?;
287        line_count += 1;
288
289        if line_count >= lines_per_chunk {
290            writer.as_mut().unwrap().finish()?;
291            writer = None;
292            line_count = 0;
293            chunk_index += 1;
294        }
295    }
296
297    if let Some(ref mut w) = writer {
298        w.finish()?;
299    }
300
301    Ok(())
302}
303
/// Read one record: bytes up to and including the next `sep` byte, appended to `buf`.
///
/// Returns the number of bytes appended. The result is 0 only at end of input; a final
/// record that is not terminated by `sep` is returned as-is.
///
/// `BufRead::read_until` already provides exactly this for any delimiter byte
/// (including retrying interrupted reads), so no separate scanning loop is needed for
/// non-newline separators.
fn read_until_sep(reader: &mut dyn BufRead, sep: u8, buf: &mut Vec<u8>) -> io::Result<usize> {
    reader.read_until(sep, buf)
}
334
335/// Split input by byte count.
336fn split_by_bytes(
337    reader: &mut dyn Read,
338    config: &SplitConfig,
339    bytes_per_chunk: u64,
340) -> io::Result<()> {
341    let limit = max_chunks(&config.suffix_type, config.suffix_length);
342    let mut chunk_index: u64 = 0;
343    let mut bytes_in_chunk: u64 = 0;
344    let mut writer: Option<Box<dyn ChunkWriter>> = None;
345
346    let mut read_buf = vec![0u8; 1024 * 1024]; // 1MB read buffer for fewer syscalls
347    loop {
348        let bytes_read = match reader.read(&mut read_buf) {
349            Ok(0) => break,
350            Ok(n) => n,
351            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
352            Err(e) => return Err(e),
353        };
354
355        let mut offset = 0usize;
356        while offset < bytes_read {
357            if writer.is_none() {
358                if chunk_index >= limit {
359                    return Err(io::Error::other("output file suffixes exhausted"));
360                }
361                writer = Some(create_writer(config, chunk_index)?);
362                bytes_in_chunk = 0;
363            }
364
365            let remaining_in_chunk = (bytes_per_chunk - bytes_in_chunk) as usize;
366            let remaining_in_buf = bytes_read - offset;
367            let to_write = remaining_in_chunk.min(remaining_in_buf);
368
369            writer
370                .as_mut()
371                .unwrap()
372                .write_all(&read_buf[offset..offset + to_write])?;
373            bytes_in_chunk += to_write as u64;
374            offset += to_write;
375
376            if bytes_in_chunk >= bytes_per_chunk {
377                writer.as_mut().unwrap().finish()?;
378                writer = None;
379                chunk_index += 1;
380            }
381        }
382    }
383
384    if let Some(ref mut w) = writer {
385        if config.elide_empty && bytes_in_chunk == 0 {
386            w.finish()?;
387            // Remove the empty file
388            let path = output_path(config, chunk_index);
389            let _ = fs::remove_file(&path);
390        } else {
391            w.finish()?;
392        }
393    }
394
395    Ok(())
396}
397
398/// Split input by line-bytes: at most N bytes per file, breaking at line boundaries.
399fn split_by_line_bytes(
400    reader: &mut dyn BufRead,
401    config: &SplitConfig,
402    max_bytes: u64,
403) -> io::Result<()> {
404    let limit = max_chunks(&config.suffix_type, config.suffix_length);
405    let mut chunk_index: u64 = 0;
406    let mut bytes_in_chunk: u64 = 0;
407    let mut writer: Option<Box<dyn ChunkWriter>> = None;
408    let sep = config.separator;
409
410    let mut buf = Vec::with_capacity(8192);
411    loop {
412        buf.clear();
413        let bytes_read = read_until_sep(reader, sep, &mut buf)?;
414        if bytes_read == 0 {
415            break;
416        }
417
418        let line_len = buf.len() as u64;
419
420        // If this line alone exceeds the max, we must write it (possibly to its own chunk).
421        // If adding this line would exceed the max and we've already written something,
422        // start a new chunk.
423        if bytes_in_chunk > 0 && bytes_in_chunk + line_len > max_bytes {
424            if let Some(ref mut w) = writer {
425                w.finish()?;
426            }
427            writer = None;
428            chunk_index += 1;
429            bytes_in_chunk = 0;
430        }
431
432        if writer.is_none() {
433            if chunk_index >= limit {
434                return Err(io::Error::other("output file suffixes exhausted"));
435            }
436            writer = Some(create_writer(config, chunk_index)?);
437            bytes_in_chunk = 0;
438        }
439
440        // If the line itself is longer than max_bytes, we still write the whole line
441        // to this chunk (GNU split behavior: -C never splits a line).
442        writer.as_mut().unwrap().write_all(&buf)?;
443        bytes_in_chunk += line_len;
444
445        if bytes_in_chunk >= max_bytes {
446            if let Some(ref mut w) = writer {
447                w.finish()?;
448            }
449            writer = None;
450            chunk_index += 1;
451            bytes_in_chunk = 0;
452        }
453    }
454
455    if let Some(ref mut w) = writer {
456        w.finish()?;
457    }
458
459    Ok(())
460}
461
462/// Split input into exactly N chunks by byte count.
463/// Reads the whole file to determine size, then distributes bytes evenly.
464fn split_by_number(input_path: &str, config: &SplitConfig, n_chunks: u64) -> io::Result<()> {
465    let limit = max_chunks(&config.suffix_type, config.suffix_length);
466    if n_chunks > limit {
467        return Err(io::Error::other("output file suffixes exhausted"));
468    }
469    if n_chunks == 0 {
470        return Err(io::Error::new(
471            io::ErrorKind::InvalidInput,
472            "invalid number of chunks: 0",
473        ));
474    }
475
476    // Read input data
477    let data = if input_path == "-" {
478        let mut buf = Vec::new();
479        io::stdin().lock().read_to_end(&mut buf)?;
480        buf
481    } else {
482        fs::read(input_path)?
483    };
484
485    let total = data.len() as u64;
486    let base_chunk_size = total / n_chunks;
487    let remainder = total % n_chunks;
488
489    let mut offset: u64 = 0;
490    for i in 0..n_chunks {
491        // First `remainder` chunks get one extra byte
492        let chunk_size = base_chunk_size + if i < remainder { 1 } else { 0 };
493
494        if config.elide_empty && chunk_size == 0 {
495            continue;
496        }
497
498        let mut writer = create_writer(config, i)?;
499        if chunk_size > 0 {
500            let start = offset as usize;
501            let end = start + chunk_size as usize;
502            writer.write_all(&data[start..end])?;
503        }
504        writer.finish()?;
505        offset += chunk_size;
506    }
507
508    Ok(())
509}
510
511/// Main entry point: split a file according to the given configuration.
512/// `input_path` is the path to the input file, or "-" for stdin.
513pub fn split_file(input_path: &str, config: &SplitConfig) -> io::Result<()> {
514    // For number-based splitting, we need to read the whole file to know size.
515    if let SplitMode::Number(n) = config.mode {
516        return split_by_number(input_path, config, n);
517    }
518
519    // Open input
520    let reader: Box<dyn Read> = if input_path == "-" {
521        Box::new(io::stdin().lock())
522    } else {
523        let path = Path::new(input_path);
524        if !path.exists() {
525            return Err(io::Error::new(
526                io::ErrorKind::NotFound,
527                format!(
528                    "cannot open '{}' for reading: No such file or directory",
529                    input_path
530                ),
531            ));
532        }
533        Box::new(File::open(path)?)
534    };
535
536    match config.mode {
537        SplitMode::Lines(n) => {
538            let mut buf_reader = BufReader::with_capacity(256 * 1024, reader);
539            split_by_lines(&mut buf_reader, config, n)
540        }
541        SplitMode::Bytes(n) => {
542            let mut reader = reader;
543            split_by_bytes(&mut reader, config, n)
544        }
545        SplitMode::LineBytes(n) => {
546            let mut buf_reader = BufReader::with_capacity(256 * 1024, reader);
547            split_by_line_bytes(&mut buf_reader, config, n)
548        }
549        SplitMode::Number(_) => unreachable!(),
550    }
551}
552
553/// Get the list of output file paths that would be generated for given config and chunk count.
554pub fn output_paths(config: &SplitConfig, count: u64) -> Vec<PathBuf> {
555    (0..count)
556        .map(|i| PathBuf::from(output_path(config, i)))
557        .collect()
558}