1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
3use std::path::{Path, PathBuf};
4use std::process::{Command, Stdio};
5
/// Naming scheme for the suffix that distinguishes one output file from the next.
#[derive(Debug, Clone, PartialEq)]
pub enum SuffixType {
    /// Lowercase letters `a`..=`z`, used as base-26 digits.
    Alphabetic,
    /// Zero-padded decimal numbers; the payload is the number given to the first file.
    Numeric(u64),
    /// Zero-padded lowercase hexadecimal numbers; the payload is the number given to
    /// the first file.
    Hex(u64),
}
16
/// Strategy that decides where one output file ends and the next begins.
#[derive(Debug, Clone)]
pub enum SplitMode {
    /// Start a new file after this many records (lines).
    Lines(u64),
    /// Start a new file after this many bytes.
    Bytes(u64),
    /// Pack whole records into a file, starting a new file when the next record
    /// would push the current one past this many bytes.
    LineBytes(u64),
    /// Divide the whole input into this many files of near-equal size.
    Number(u64),
}
29
30#[derive(Clone, Debug)]
32pub struct SplitConfig {
33 pub mode: SplitMode,
34 pub suffix_type: SuffixType,
35 pub suffix_length: usize,
36 pub additional_suffix: String,
37 pub prefix: String,
38 pub elide_empty: bool,
39 pub verbose: bool,
40 pub filter: Option<String>,
41 pub separator: u8,
42}
43
44impl Default for SplitConfig {
45 fn default() -> Self {
46 Self {
47 mode: SplitMode::Lines(1000),
48 suffix_type: SuffixType::Alphabetic,
49 suffix_length: 2,
50 additional_suffix: String::new(),
51 prefix: "x".to_string(),
52 elide_empty: false,
53 verbose: false,
54 filter: None,
55 separator: b'\n',
56 }
57 }
58}
59
/// Parses a size argument such as `10`, `4K`, `2MB` or `1GiB` into a plain count.
///
/// Surrounding whitespace is ignored. The value is a run of ASCII digits (a leading
/// `+` is tolerated) optionally followed by one unit suffix:
///
/// * `b` – 512
/// * `kB` / `KB`, `MB`, `GB`, `TB`, `PB`, `EB` – powers of 1000
/// * `k` / `K` / `KiB`, `m` / `M` / `MiB`, `G` / `GiB`, ... `E` / `EiB` – powers of 1024
/// * `Z*` / `Y*` – do not fit in 64 bits; any non-zero count saturates to `u64::MAX`
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, has no digits, is
/// negative, carries an unknown suffix, or overflows `u64` after scaling.
pub fn parse_size(s: &str) -> Result<u64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty size".to_string());
    }

    // Every accepted character (sign or digit) is one byte long, so the number of
    // accepted characters is also the byte offset at which the suffix starts.
    let num_end = s
        .char_indices()
        .take_while(|&(i, c)| c.is_ascii_digit() || (i == 0 && (c == '+' || c == '-')))
        .count();

    if num_end == 0 {
        return Err(format!("invalid number: '{}'", s));
    }

    let (num_str, suffix) = s.split_at(num_end);

    // A lone sign or a negative value is rejected here by the unsigned parse.
    let num: u64 = num_str
        .parse()
        .map_err(|_| format!("invalid number: '{}'", num_str))?;

    let multiplier: u64 = match suffix {
        "" => 1,
        "b" => 512,
        // GNU documents `KB` and also accepts the lowercase `kB` spelling.
        "kB" | "KB" => 1000,
        "k" | "K" | "KiB" => 1024,
        "MB" => 1_000_000,
        "m" | "M" | "MiB" => 1_048_576,
        "GB" => 1_000_000_000,
        "G" | "GiB" => 1_073_741_824,
        "TB" => 1_000_000_000_000,
        "T" | "TiB" => 1_099_511_627_776,
        "PB" => 1_000_000_000_000_000,
        "P" | "PiB" => 1_125_899_906_842_624,
        "EB" => 1_000_000_000_000_000_000,
        "E" | "EiB" => 1_152_921_504_606_846_976,
        // Zetta/yotta multipliers exceed u64, so saturate instead of failing.
        "ZB" | "Z" | "ZiB" | "YB" | "Y" | "YiB" => {
            return Ok(if num > 0 { u64::MAX } else { 0 });
        }
        _ => return Err(format!("invalid suffix in '{}'", s)),
    };

    num.checked_mul(multiplier)
        .ok_or_else(|| format!("number too large: '{}'", s))
}
118
119pub fn generate_suffix(index: u64, suffix_type: &SuffixType, suffix_length: usize) -> String {
121 match suffix_type {
122 SuffixType::Alphabetic => {
123 let mut result = Vec::with_capacity(suffix_length);
124 let mut remaining = index;
125 for _ in 0..suffix_length {
126 result.push(b'a' + (remaining % 26) as u8);
127 remaining /= 26;
128 }
129 result.reverse();
130 String::from_utf8(result).unwrap()
131 }
132 SuffixType::Numeric(start) => {
133 let val = start + index;
134 format!("{:0>width$}", val, width = suffix_length)
135 }
136 SuffixType::Hex(start) => {
137 let val = start + index;
138 format!("{:0>width$x}", val, width = suffix_length)
139 }
140 }
141}
142
143pub fn max_chunks(suffix_type: &SuffixType, suffix_length: usize) -> u64 {
145 match suffix_type {
146 SuffixType::Alphabetic => 26u64.saturating_pow(suffix_length as u32),
147 SuffixType::Numeric(_) | SuffixType::Hex(_) => 10u64.saturating_pow(suffix_length as u32),
148 }
149}
150
151fn output_path(config: &SplitConfig, index: u64) -> String {
153 let suffix = generate_suffix(index, &config.suffix_type, config.suffix_length);
154 format!("{}{}{}", config.prefix, suffix, config.additional_suffix)
155}
156
/// A byte sink for a single output chunk that has to be completed explicitly.
trait ChunkWriter: Write {
    /// Completes the chunk and reports any error that surfaces while doing so.
    fn finish(&mut self) -> io::Result<()>;
}
161
162struct FileChunkWriter {
164 writer: BufWriter<File>,
165}
166
167impl FileChunkWriter {
168 fn create(path: &str) -> io::Result<Self> {
169 let file = File::create(path)?;
170 Ok(Self {
171 writer: BufWriter::with_capacity(1024 * 1024, file), })
173 }
174}
175
176impl Write for FileChunkWriter {
177 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
178 self.writer.write(buf)
179 }
180
181 fn flush(&mut self) -> io::Result<()> {
182 self.writer.flush()
183 }
184}
185
186impl ChunkWriter for FileChunkWriter {
187 fn finish(&mut self) -> io::Result<()> {
188 self.writer.flush()
189 }
190}
191
192struct FilterChunkWriter {
194 child: std::process::Child,
195 _stdin_taken: bool,
196}
197
198impl FilterChunkWriter {
199 fn create(filter_cmd: &str, output_path: &str) -> io::Result<Self> {
200 let child = Command::new("sh")
201 .arg("-c")
202 .arg(filter_cmd)
203 .env("FILE", output_path)
204 .stdin(Stdio::piped())
205 .spawn()?;
206 Ok(Self {
207 child,
208 _stdin_taken: false,
209 })
210 }
211}
212
213impl Write for FilterChunkWriter {
214 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
215 if let Some(ref mut stdin) = self.child.stdin {
216 stdin.write(buf)
217 } else {
218 Err(io::Error::new(io::ErrorKind::BrokenPipe, "stdin closed"))
219 }
220 }
221
222 fn flush(&mut self) -> io::Result<()> {
223 if let Some(ref mut stdin) = self.child.stdin {
224 stdin.flush()
225 } else {
226 Ok(())
227 }
228 }
229}
230
231impl ChunkWriter for FilterChunkWriter {
232 fn finish(&mut self) -> io::Result<()> {
233 self.child.stdin.take();
235 let status = self.child.wait()?;
236 if !status.success() {
237 return Err(io::Error::other(format!(
238 "filter command exited with status {}",
239 status
240 )));
241 }
242 Ok(())
243 }
244}
245
246fn create_writer(config: &SplitConfig, index: u64) -> io::Result<Box<dyn ChunkWriter>> {
248 let path = output_path(config, index);
249 if config.verbose {
250 eprintln!("creating file '{}'", path);
251 }
252 if let Some(ref filter_cmd) = config.filter {
253 Ok(Box::new(FilterChunkWriter::create(filter_cmd, &path)?))
254 } else {
255 Ok(Box::new(FileChunkWriter::create(&path)?))
256 }
257}
258
259fn split_by_lines(
261 reader: &mut dyn BufRead,
262 config: &SplitConfig,
263 lines_per_chunk: u64,
264) -> io::Result<()> {
265 let limit = max_chunks(&config.suffix_type, config.suffix_length);
266 let mut chunk_index: u64 = 0;
267 let mut line_count: u64 = 0;
268 let mut writer: Option<Box<dyn ChunkWriter>> = None;
269 let sep = config.separator;
270
271 let mut buf = Vec::with_capacity(8192);
272 loop {
273 buf.clear();
274 let bytes_read = read_until_sep(reader, sep, &mut buf)?;
275 if bytes_read == 0 {
276 break;
277 }
278
279 if writer.is_none() {
280 if chunk_index >= limit {
281 return Err(io::Error::other("output file suffixes exhausted"));
282 }
283 writer = Some(create_writer(config, chunk_index)?);
284 }
285
286 writer.as_mut().unwrap().write_all(&buf)?;
287 line_count += 1;
288
289 if line_count >= lines_per_chunk {
290 writer.as_mut().unwrap().finish()?;
291 writer = None;
292 line_count = 0;
293 chunk_index += 1;
294 }
295 }
296
297 if let Some(ref mut w) = writer {
298 w.finish()?;
299 }
300
301 Ok(())
302}
303
/// Appends the next record from `reader` to `buf`, up to and including the
/// separator byte `sep`, or up to end of input if no separator follows.
///
/// Returns the number of bytes appended; `0` means the input is exhausted.
/// Existing contents of `buf` are left in place.
///
/// `BufRead::read_until` already implements exactly this for an arbitrary
/// delimiter byte (including retrying interrupted reads), so no separate code path
/// is needed for separators other than newline.
fn read_until_sep(reader: &mut dyn BufRead, sep: u8, buf: &mut Vec<u8>) -> io::Result<usize> {
    reader.read_until(sep, buf)
}
334
335fn split_by_bytes(
337 reader: &mut dyn Read,
338 config: &SplitConfig,
339 bytes_per_chunk: u64,
340) -> io::Result<()> {
341 let limit = max_chunks(&config.suffix_type, config.suffix_length);
342 let mut chunk_index: u64 = 0;
343 let mut bytes_in_chunk: u64 = 0;
344 let mut writer: Option<Box<dyn ChunkWriter>> = None;
345
346 let mut read_buf = vec![0u8; 1024 * 1024]; loop {
348 let bytes_read = match reader.read(&mut read_buf) {
349 Ok(0) => break,
350 Ok(n) => n,
351 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
352 Err(e) => return Err(e),
353 };
354
355 let mut offset = 0usize;
356 while offset < bytes_read {
357 if writer.is_none() {
358 if chunk_index >= limit {
359 return Err(io::Error::other("output file suffixes exhausted"));
360 }
361 writer = Some(create_writer(config, chunk_index)?);
362 bytes_in_chunk = 0;
363 }
364
365 let remaining_in_chunk = (bytes_per_chunk - bytes_in_chunk) as usize;
366 let remaining_in_buf = bytes_read - offset;
367 let to_write = remaining_in_chunk.min(remaining_in_buf);
368
369 writer
370 .as_mut()
371 .unwrap()
372 .write_all(&read_buf[offset..offset + to_write])?;
373 bytes_in_chunk += to_write as u64;
374 offset += to_write;
375
376 if bytes_in_chunk >= bytes_per_chunk {
377 writer.as_mut().unwrap().finish()?;
378 writer = None;
379 chunk_index += 1;
380 }
381 }
382 }
383
384 if let Some(ref mut w) = writer {
385 if config.elide_empty && bytes_in_chunk == 0 {
386 w.finish()?;
387 let path = output_path(config, chunk_index);
389 let _ = fs::remove_file(&path);
390 } else {
391 w.finish()?;
392 }
393 }
394
395 Ok(())
396}
397
398fn split_by_line_bytes(
400 reader: &mut dyn BufRead,
401 config: &SplitConfig,
402 max_bytes: u64,
403) -> io::Result<()> {
404 let limit = max_chunks(&config.suffix_type, config.suffix_length);
405 let mut chunk_index: u64 = 0;
406 let mut bytes_in_chunk: u64 = 0;
407 let mut writer: Option<Box<dyn ChunkWriter>> = None;
408 let sep = config.separator;
409
410 let mut buf = Vec::with_capacity(8192);
411 loop {
412 buf.clear();
413 let bytes_read = read_until_sep(reader, sep, &mut buf)?;
414 if bytes_read == 0 {
415 break;
416 }
417
418 let line_len = buf.len() as u64;
419
420 if bytes_in_chunk > 0 && bytes_in_chunk + line_len > max_bytes {
424 if let Some(ref mut w) = writer {
425 w.finish()?;
426 }
427 writer = None;
428 chunk_index += 1;
429 bytes_in_chunk = 0;
430 }
431
432 if writer.is_none() {
433 if chunk_index >= limit {
434 return Err(io::Error::other("output file suffixes exhausted"));
435 }
436 writer = Some(create_writer(config, chunk_index)?);
437 bytes_in_chunk = 0;
438 }
439
440 writer.as_mut().unwrap().write_all(&buf)?;
443 bytes_in_chunk += line_len;
444
445 if bytes_in_chunk >= max_bytes {
446 if let Some(ref mut w) = writer {
447 w.finish()?;
448 }
449 writer = None;
450 chunk_index += 1;
451 bytes_in_chunk = 0;
452 }
453 }
454
455 if let Some(ref mut w) = writer {
456 w.finish()?;
457 }
458
459 Ok(())
460}
461
462fn split_by_number(input_path: &str, config: &SplitConfig, n_chunks: u64) -> io::Result<()> {
465 let limit = max_chunks(&config.suffix_type, config.suffix_length);
466 if n_chunks > limit {
467 return Err(io::Error::other("output file suffixes exhausted"));
468 }
469 if n_chunks == 0 {
470 return Err(io::Error::new(
471 io::ErrorKind::InvalidInput,
472 "invalid number of chunks: 0",
473 ));
474 }
475
476 let data = if input_path == "-" {
478 let mut buf = Vec::new();
479 io::stdin().lock().read_to_end(&mut buf)?;
480 buf
481 } else {
482 fs::read(input_path)?
483 };
484
485 let total = data.len() as u64;
486 let base_chunk_size = total / n_chunks;
487 let remainder = total % n_chunks;
488
489 let mut offset: u64 = 0;
490 for i in 0..n_chunks {
491 let chunk_size = base_chunk_size + if i < remainder { 1 } else { 0 };
493
494 if config.elide_empty && chunk_size == 0 {
495 continue;
496 }
497
498 let mut writer = create_writer(config, i)?;
499 if chunk_size > 0 {
500 let start = offset as usize;
501 let end = start + chunk_size as usize;
502 writer.write_all(&data[start..end])?;
503 }
504 writer.finish()?;
505 offset += chunk_size;
506 }
507
508 Ok(())
509}
510
511pub fn split_file(input_path: &str, config: &SplitConfig) -> io::Result<()> {
514 if let SplitMode::Number(n) = config.mode {
516 return split_by_number(input_path, config, n);
517 }
518
519 let reader: Box<dyn Read> = if input_path == "-" {
521 Box::new(io::stdin().lock())
522 } else {
523 let path = Path::new(input_path);
524 if !path.exists() {
525 return Err(io::Error::new(
526 io::ErrorKind::NotFound,
527 format!(
528 "cannot open '{}' for reading: No such file or directory",
529 input_path
530 ),
531 ));
532 }
533 Box::new(File::open(path)?)
534 };
535
536 match config.mode {
537 SplitMode::Lines(n) => {
538 let mut buf_reader = BufReader::with_capacity(256 * 1024, reader);
539 split_by_lines(&mut buf_reader, config, n)
540 }
541 SplitMode::Bytes(n) => {
542 let mut reader = reader;
543 split_by_bytes(&mut reader, config, n)
544 }
545 SplitMode::LineBytes(n) => {
546 let mut buf_reader = BufReader::with_capacity(256 * 1024, reader);
547 split_by_line_bytes(&mut buf_reader, config, n)
548 }
549 SplitMode::Number(_) => unreachable!(),
550 }
551}
552
553pub fn output_paths(config: &SplitConfig, count: u64) -> Vec<PathBuf> {
555 (0..count)
556 .map(|i| PathBuf::from(output_path(config, i)))
557 .collect()
558}