1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, BufWriter, Read, Write};
3use std::path::{Path, PathBuf};
4use std::process::{Command, Stdio};
5
/// Style of the counter that is appended to the output prefix for each chunk.
#[derive(Debug, Clone, PartialEq)]
pub enum SuffixType {
    /// Lowercase letters `a`-`z`, rendered as a fixed-width base-26 counter.
    Alphabetic,
    /// Zero-padded decimal counter; the payload is the value used for the first chunk.
    Numeric(u64),
    /// Zero-padded lowercase hexadecimal counter; the payload is the value used for
    /// the first chunk.
    Hex(u64),
}
16
/// Strategy that decides where one output chunk ends and the next begins.
#[derive(Debug, Clone)]
pub enum SplitMode {
    /// Close a chunk after this many separator-terminated records.
    Lines(u64),
    /// Close a chunk after exactly this many bytes.
    Bytes(u64),
    /// Fill each chunk with whole records up to this byte budget.
    LineBytes(u64),
    /// Divide the whole input into this many chunks of near-equal byte size.
    Number(u64),
}
29
30#[derive(Clone, Debug)]
32pub struct SplitConfig {
33 pub mode: SplitMode,
34 pub suffix_type: SuffixType,
35 pub suffix_length: usize,
36 pub additional_suffix: String,
37 pub prefix: String,
38 pub elide_empty: bool,
39 pub verbose: bool,
40 pub filter: Option<String>,
41 pub separator: u8,
42}
43
44impl Default for SplitConfig {
45 fn default() -> Self {
46 Self {
47 mode: SplitMode::Lines(1000),
48 suffix_type: SuffixType::Alphabetic,
49 suffix_length: 2,
50 additional_suffix: String::new(),
51 prefix: "x".to_string(),
52 elide_empty: false,
53 verbose: false,
54 filter: None,
55 separator: b'\n',
56 }
57 }
58}
59
/// Parses a size argument such as `10`, `4K`, `2MB` or `1GiB` into a byte count.
///
/// Surrounding whitespace is ignored. The numeric part is a run of ASCII digits,
/// optionally preceded by a sign character, followed by an optional unit:
/// `b` (512), `kB`/`MB`/... (powers of 1000) or `K`/`KiB`/`M`/`MiB`/... (powers of 1024).
/// The zetta and yotta units cannot be represented in 64 bits, so any non-zero
/// amount of them yields `u64::MAX`.
///
/// # Errors
///
/// Returns a message when the input is empty, the number cannot be parsed as a
/// `u64`, the unit is unknown, or the product overflows `u64`.
pub fn parse_size(s: &str) -> Result<u64, String> {
    let text = s.trim();
    if text.is_empty() {
        return Err("empty size".to_string());
    }

    // The numeric part ends at the first character that is neither a digit nor a
    // sign in the very first position.
    let split_point = text
        .char_indices()
        .find(|&(i, c)| !(c.is_ascii_digit() || (i == 0 && matches!(c, '+' | '-'))))
        .map_or(text.len(), |(i, _)| i);
    if split_point == 0 {
        return Err(format!("invalid number: '{}'", text));
    }
    let (digits, unit) = text.split_at(split_point);

    let value = digits
        .parse::<u64>()
        .map_err(|_| format!("invalid number: '{}'", digits))?;

    let factor: u64 = match unit {
        "" => 1,
        "b" => 512,
        "kB" => 10u64.pow(3),
        "K" | "KiB" => 1 << 10,
        "MB" => 10u64.pow(6),
        "M" | "MiB" => 1 << 20,
        "GB" => 10u64.pow(9),
        "G" | "GiB" => 1 << 30,
        "TB" => 10u64.pow(12),
        "T" | "TiB" => 1 << 40,
        "PB" => 10u64.pow(15),
        "P" | "PiB" => 1 << 50,
        "EB" => 10u64.pow(18),
        "E" | "EiB" => 1 << 60,
        // Too large for u64: saturate unless the count itself is zero.
        "ZB" | "Z" | "ZiB" | "YB" | "Y" | "YiB" => {
            return Ok(if value > 0 { u64::MAX } else { 0 });
        }
        _ => return Err(format!("invalid suffix in '{}'", text)),
    };

    match value.checked_mul(factor) {
        Some(bytes) => Ok(bytes),
        None => Err(format!("number too large: '{}'", text)),
    }
}
118
119pub fn generate_suffix(index: u64, suffix_type: &SuffixType, suffix_length: usize) -> String {
121 match suffix_type {
122 SuffixType::Alphabetic => {
123 let mut result = Vec::with_capacity(suffix_length);
124 let mut remaining = index;
125 for _ in 0..suffix_length {
126 result.push(b'a' + (remaining % 26) as u8);
127 remaining /= 26;
128 }
129 result.reverse();
130 String::from_utf8(result).unwrap()
131 }
132 SuffixType::Numeric(start) => {
133 let val = start + index;
134 format!("{:0>width$}", val, width = suffix_length)
135 }
136 SuffixType::Hex(start) => {
137 let val = start + index;
138 format!("{:0>width$x}", val, width = suffix_length)
139 }
140 }
141}
142
143pub fn max_chunks(suffix_type: &SuffixType, suffix_length: usize) -> u64 {
145 match suffix_type {
146 SuffixType::Alphabetic => 26u64.saturating_pow(suffix_length as u32),
147 SuffixType::Numeric(_) | SuffixType::Hex(_) => 10u64.saturating_pow(suffix_length as u32),
148 }
149}
150
151fn output_path(config: &SplitConfig, index: u64) -> String {
153 let suffix = generate_suffix(index, &config.suffix_type, config.suffix_length);
154 format!("{}{}{}", config.prefix, suffix, config.additional_suffix)
155}
156
/// A byte sink for a single output chunk.
///
/// Besides the usual `Write` operations, a chunk must be explicitly completed
/// so that errors raised while finalising it can be reported to the caller.
trait ChunkWriter: io::Write {
    /// Completes the chunk, returning any error hit while finalising it.
    fn finish(&mut self) -> io::Result<()>;
}
161
/// Chunk sink that writes straight to a file through a large in-memory buffer.
struct FileChunkWriter {
    writer: BufWriter<File>,
}

impl FileChunkWriter {
    /// Creates (or truncates) the file at `path` and wraps it in a 1 MiB buffer.
    fn create(path: &str) -> io::Result<Self> {
        const BUFFER_BYTES: usize = 1024 * 1024;

        File::create(path).map(|file| Self {
            writer: BufWriter::with_capacity(BUFFER_BYTES, file),
        })
    }
}
175
176impl Write for FileChunkWriter {
177 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
178 self.writer.write(buf)
179 }
180
181 fn flush(&mut self) -> io::Result<()> {
182 self.writer.flush()
183 }
184}
185
186impl ChunkWriter for FileChunkWriter {
187 fn finish(&mut self) -> io::Result<()> {
188 self.writer.flush()
189 }
190}
191
/// Chunk sink that pipes the chunk into the standard input of a shell command.
struct FilterChunkWriter {
    /// The running filter process; its `stdin` handle is the write target.
    child: std::process::Child,
}

impl FilterChunkWriter {
    /// Starts `sh -c <filter_cmd>` with a piped stdin.
    ///
    /// The name the chunk would have had on disk is exposed to the command
    /// through the `FILE` environment variable. Only stdin is redirected.
    ///
    /// # Errors
    ///
    /// Returns the spawn error if the shell cannot be started.
    fn create(filter_cmd: &str, output_path: &str) -> io::Result<Self> {
        let child = Command::new("sh")
            .arg("-c")
            .arg(filter_cmd)
            .env("FILE", output_path)
            .stdin(Stdio::piped())
            .spawn()?;
        Ok(Self { child })
    }
}
212
213impl Write for FilterChunkWriter {
214 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
215 if let Some(ref mut stdin) = self.child.stdin {
216 stdin.write(buf)
217 } else {
218 Err(io::Error::new(io::ErrorKind::BrokenPipe, "stdin closed"))
219 }
220 }
221
222 fn flush(&mut self) -> io::Result<()> {
223 if let Some(ref mut stdin) = self.child.stdin {
224 stdin.flush()
225 } else {
226 Ok(())
227 }
228 }
229}
230
231impl ChunkWriter for FilterChunkWriter {
232 fn finish(&mut self) -> io::Result<()> {
233 self.child.stdin.take();
235 let status = self.child.wait()?;
236 if !status.success() {
237 return Err(io::Error::other(format!(
238 "filter command exited with status {}",
239 status
240 )));
241 }
242 Ok(())
243 }
244}
245
246fn create_writer(config: &SplitConfig, index: u64) -> io::Result<Box<dyn ChunkWriter>> {
248 let path = output_path(config, index);
249 if config.verbose {
250 eprintln!("creating file '{}'", path);
251 }
252 if let Some(ref filter_cmd) = config.filter {
253 Ok(Box::new(FilterChunkWriter::create(filter_cmd, &path)?))
254 } else {
255 Ok(Box::new(FileChunkWriter::create(&path)?))
256 }
257}
258
259fn split_by_lines(
263 reader: &mut dyn BufRead,
264 config: &SplitConfig,
265 lines_per_chunk: u64,
266) -> io::Result<()> {
267 let limit = max_chunks(&config.suffix_type, config.suffix_length);
268 let mut chunk_index: u64 = 0;
269 let mut lines_in_chunk: u64 = 0;
270 let mut writer: Option<Box<dyn ChunkWriter>> = None;
271 let sep = config.separator;
272
273 loop {
274 let available = match reader.fill_buf() {
275 Ok([]) => break,
276 Ok(b) => b,
277 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
278 Err(e) => return Err(e),
279 };
280
281 let mut pos = 0;
282 let buf_len = available.len();
283
284 while pos < buf_len {
285 if writer.is_none() {
286 if chunk_index >= limit {
287 return Err(io::Error::other("output file suffixes exhausted"));
288 }
289 writer = Some(create_writer(config, chunk_index)?);
290 lines_in_chunk = 0;
291 }
292
293 let lines_needed = lines_per_chunk - lines_in_chunk;
295 let slice = &available[pos..];
296
297 let mut found = 0u64;
300 let mut last_sep_end = 0;
301
302 for offset in memchr::memchr_iter(sep, slice) {
303 found += 1;
304 last_sep_end = offset + 1;
305 if found >= lines_needed {
306 break;
307 }
308 }
309
310 if found >= lines_needed {
311 writer.as_mut().unwrap().write_all(&slice[..last_sep_end])?;
313 pos += last_sep_end;
314 writer.as_mut().unwrap().finish()?;
316 writer = None;
317 chunk_index += 1;
318 } else {
319 writer.as_mut().unwrap().write_all(slice)?;
321 lines_in_chunk += found;
322 pos = buf_len;
323 }
324 }
325
326 let consumed = buf_len;
327 reader.consume(consumed);
328 }
329
330 if let Some(ref mut w) = writer {
332 w.finish()?;
333 }
334
335 Ok(())
336}
337
/// Reads one record from `reader` and appends it to `buf`.
///
/// Bytes are consumed up to and including the first occurrence of `sep`, or up
/// to end of input if no separator remains. Existing contents of `buf` are
/// kept; the return value is the number of bytes appended, which is 0 only at
/// end of input.
///
/// This delegates to [`BufRead::read_until`], which handles arbitrary
/// delimiter bytes and retries interrupted reads.
///
/// # Errors
///
/// Propagates any I/O error other than `Interrupted` from the reader.
fn read_until_sep(reader: &mut dyn BufRead, sep: u8, buf: &mut Vec<u8>) -> io::Result<usize> {
    reader.read_until(sep, buf)
}
368
369fn split_by_bytes(
371 reader: &mut dyn Read,
372 config: &SplitConfig,
373 bytes_per_chunk: u64,
374) -> io::Result<()> {
375 let limit = max_chunks(&config.suffix_type, config.suffix_length);
376 let mut chunk_index: u64 = 0;
377 let mut bytes_in_chunk: u64 = 0;
378 let mut writer: Option<Box<dyn ChunkWriter>> = None;
379
380 let mut read_buf = vec![0u8; 1024 * 1024]; loop {
382 let bytes_read = match reader.read(&mut read_buf) {
383 Ok(0) => break,
384 Ok(n) => n,
385 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
386 Err(e) => return Err(e),
387 };
388
389 let mut offset = 0usize;
390 while offset < bytes_read {
391 if writer.is_none() {
392 if chunk_index >= limit {
393 return Err(io::Error::other("output file suffixes exhausted"));
394 }
395 writer = Some(create_writer(config, chunk_index)?);
396 bytes_in_chunk = 0;
397 }
398
399 let remaining_in_chunk = (bytes_per_chunk - bytes_in_chunk) as usize;
400 let remaining_in_buf = bytes_read - offset;
401 let to_write = remaining_in_chunk.min(remaining_in_buf);
402
403 writer
404 .as_mut()
405 .unwrap()
406 .write_all(&read_buf[offset..offset + to_write])?;
407 bytes_in_chunk += to_write as u64;
408 offset += to_write;
409
410 if bytes_in_chunk >= bytes_per_chunk {
411 writer.as_mut().unwrap().finish()?;
412 writer = None;
413 chunk_index += 1;
414 }
415 }
416 }
417
418 if let Some(ref mut w) = writer {
419 if config.elide_empty && bytes_in_chunk == 0 {
420 w.finish()?;
421 let path = output_path(config, chunk_index);
423 let _ = fs::remove_file(&path);
424 } else {
425 w.finish()?;
426 }
427 }
428
429 Ok(())
430}
431
432fn split_by_line_bytes(
434 reader: &mut dyn BufRead,
435 config: &SplitConfig,
436 max_bytes: u64,
437) -> io::Result<()> {
438 let limit = max_chunks(&config.suffix_type, config.suffix_length);
439 let mut chunk_index: u64 = 0;
440 let mut bytes_in_chunk: u64 = 0;
441 let mut writer: Option<Box<dyn ChunkWriter>> = None;
442 let sep = config.separator;
443
444 let mut buf = Vec::with_capacity(8192);
445 loop {
446 buf.clear();
447 let bytes_read = read_until_sep(reader, sep, &mut buf)?;
448 if bytes_read == 0 {
449 break;
450 }
451
452 let line_len = buf.len() as u64;
453
454 if bytes_in_chunk > 0 && bytes_in_chunk + line_len > max_bytes {
458 if let Some(ref mut w) = writer {
459 w.finish()?;
460 }
461 writer = None;
462 chunk_index += 1;
463 bytes_in_chunk = 0;
464 }
465
466 if writer.is_none() {
467 if chunk_index >= limit {
468 return Err(io::Error::other("output file suffixes exhausted"));
469 }
470 writer = Some(create_writer(config, chunk_index)?);
471 bytes_in_chunk = 0;
472 }
473
474 writer.as_mut().unwrap().write_all(&buf)?;
477 bytes_in_chunk += line_len;
478
479 if bytes_in_chunk >= max_bytes {
480 if let Some(ref mut w) = writer {
481 w.finish()?;
482 }
483 writer = None;
484 chunk_index += 1;
485 bytes_in_chunk = 0;
486 }
487 }
488
489 if let Some(ref mut w) = writer {
490 w.finish()?;
491 }
492
493 Ok(())
494}
495
496fn split_by_number(input_path: &str, config: &SplitConfig, n_chunks: u64) -> io::Result<()> {
499 let limit = max_chunks(&config.suffix_type, config.suffix_length);
500 if n_chunks > limit {
501 return Err(io::Error::other("output file suffixes exhausted"));
502 }
503 if n_chunks == 0 {
504 return Err(io::Error::new(
505 io::ErrorKind::InvalidInput,
506 "invalid number of chunks: 0",
507 ));
508 }
509
510 let data = if input_path == "-" {
512 let mut buf = Vec::new();
513 io::stdin().lock().read_to_end(&mut buf)?;
514 buf
515 } else {
516 fs::read(input_path)?
517 };
518
519 let total = data.len() as u64;
520 let base_chunk_size = total / n_chunks;
521 let remainder = total % n_chunks;
522
523 let mut offset: u64 = 0;
524 for i in 0..n_chunks {
525 let chunk_size = base_chunk_size + if i < remainder { 1 } else { 0 };
527
528 if config.elide_empty && chunk_size == 0 {
529 continue;
530 }
531
532 let mut writer = create_writer(config, i)?;
533 if chunk_size > 0 {
534 let start = offset as usize;
535 let end = start + chunk_size as usize;
536 writer.write_all(&data[start..end])?;
537 }
538 writer.finish()?;
539 offset += chunk_size;
540 }
541
542 Ok(())
543}
544
545pub fn split_file(input_path: &str, config: &SplitConfig) -> io::Result<()> {
548 if let SplitMode::Number(n) = config.mode {
550 return split_by_number(input_path, config, n);
551 }
552
553 let reader: Box<dyn Read> = if input_path == "-" {
555 Box::new(io::stdin().lock())
556 } else {
557 let path = Path::new(input_path);
558 if !path.exists() {
559 return Err(io::Error::new(
560 io::ErrorKind::NotFound,
561 format!(
562 "cannot open '{}' for reading: No such file or directory",
563 input_path
564 ),
565 ));
566 }
567 Box::new(File::open(path)?)
568 };
569
570 match config.mode {
571 SplitMode::Lines(n) => {
572 let mut buf_reader = BufReader::with_capacity(1024 * 1024, reader);
573 split_by_lines(&mut buf_reader, config, n)
574 }
575 SplitMode::Bytes(n) => {
576 let mut reader = reader;
577 split_by_bytes(&mut reader, config, n)
578 }
579 SplitMode::LineBytes(n) => {
580 let mut buf_reader = BufReader::with_capacity(1024 * 1024, reader);
581 split_by_line_bytes(&mut buf_reader, config, n)
582 }
583 SplitMode::Number(_) => unreachable!(),
584 }
585}
586
587pub fn output_paths(config: &SplitConfig, count: u64) -> Vec<PathBuf> {
589 (0..count)
590 .map(|i| PathBuf::from(output_path(config, i)))
591 .collect()
592}