1use memchr::memchr_iter;
2use regex::Regex;
3use std::fs;
4use std::io;
5
/// One parsed csplit pattern argument, as produced by [`parse_pattern`].
///
/// Comparison traits are derived so patterns can be checked with `==` and
/// `assert_eq!`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Pattern {
    /// `/REGEX/[OFFSET]`: split at the first line matching `regex`, shifted by
    /// `offset` lines; the lines before the split point go to an output file.
    Regex { regex: String, offset: i64 },
    /// `%REGEX%[OFFSET]`: like [`Pattern::Regex`], but the lines before the
    /// split point are skipped instead of being written.
    SkipTo { regex: String, offset: i64 },
    /// A line number; [`parse_pattern`] only produces values greater than zero.
    LineNumber(usize),
    /// `{N}`: repeat the preceding pattern `N` more times.
    Repeat(usize),
    /// `{*}`: repeat the preceding pattern for as long as it applies.
    RepeatForever,
}
21
/// Options controlling how output files are named and written.
#[derive(Clone, Debug)]
pub struct CsplitConfig {
    /// Text placed in front of every output file name.
    pub prefix: String,
    /// printf-like suffix template handed to `format_suffix`; when empty the
    /// suffix is the file index zero-padded to `digits` characters.
    pub suffix_format: String,
    /// Minimum width of the numeric suffix when `suffix_format` is empty.
    pub digits: usize,
    /// When `false`, files created so far are removed if splitting fails.
    pub keep_files: bool,
    /// Not read by the code in this file; presumably the caller uses it to
    /// decide whether to print the sizes (TODO confirm).
    pub quiet: bool,
    /// When `true`, empty chunks produce no output file.
    pub elide_empty: bool,
}

impl Default for CsplitConfig {
    /// Prefix `xx`, two-digit numeric suffix, every flag off.
    fn default() -> Self {
        let prefix = String::from("xx");
        let suffix_format = String::new();
        CsplitConfig {
            prefix,
            suffix_format,
            digits: 2,
            keep_files: false,
            quiet: false,
            elide_empty: false,
        }
    }
}
45
46pub fn parse_pattern(s: &str) -> Result<Pattern, String> {
48 let s = s.trim();
49
50 if s == "{*}" {
52 return Ok(Pattern::RepeatForever);
53 }
54
55 if s.starts_with('{') && s.ends_with('}') {
57 let inner = &s[1..s.len() - 1];
58 let n: usize = inner
59 .parse()
60 .map_err(|_| format!("invalid repeat count: '{}'", s))?;
61 return Ok(Pattern::Repeat(n));
62 }
63
64 if s.starts_with('/') {
66 let rest = &s[1..];
67 if let Some(end_pos) = rest.rfind('/') {
68 let regex_str = &rest[..end_pos];
69 let after = rest[end_pos + 1..].trim();
70 let offset = if after.is_empty() {
71 0
72 } else {
73 after
74 .parse::<i64>()
75 .map_err(|_| format!("invalid offset: '{}'", after))?
76 };
77 Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
79 return Ok(Pattern::Regex {
80 regex: regex_str.to_string(),
81 offset,
82 });
83 }
84 return Err(format!("unmatched '/' in pattern: '{}'", s));
85 }
86
87 if s.starts_with('%') {
89 let rest = &s[1..];
90 if let Some(end_pos) = rest.rfind('%') {
91 let regex_str = &rest[..end_pos];
92 let after = rest[end_pos + 1..].trim();
93 let offset = if after.is_empty() {
94 0
95 } else {
96 after
97 .parse::<i64>()
98 .map_err(|_| format!("invalid offset: '{}'", after))?
99 };
100 Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
102 return Ok(Pattern::SkipTo {
103 regex: regex_str.to_string(),
104 offset,
105 });
106 }
107 return Err(format!("unmatched '%' in pattern: '{}'", s));
108 }
109
110 let n: usize = s.parse().map_err(|_| format!("invalid pattern: '{}'", s))?;
112 if n == 0 {
113 return Err("line number must be positive".to_string());
114 }
115 Ok(Pattern::LineNumber(n))
116}
117
118pub fn output_filename(config: &CsplitConfig, index: usize) -> String {
120 if config.suffix_format.is_empty() {
121 format!("{}{:0>width$}", config.prefix, index, width = config.digits)
122 } else {
123 let suffix = format_suffix(&config.suffix_format, index);
125 format!("{}{}", config.prefix, suffix)
126 }
127}
128
/// Expands a printf-like suffix template with `value`.
///
/// Supported directives:
/// * `%d`, `%Nd`, `%0Nd`: `value` in decimal, optionally right-aligned to
///   width `N` with spaces, or with zeros when the `0` flag is given;
/// * `%%`: a literal `%`.
///
/// Any other `%` sequence is copied through as written. That includes the `0`
/// flag and the width digits that were consumed while looking for a supported
/// directive, so an unsupported `"%05x"` stays `"%05x"`.
pub fn format_suffix(fmt: &str, value: usize) -> String {
    let mut result = String::with_capacity(fmt.len());
    let mut chars = fmt.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '%' {
            result.push(ch);
            continue;
        }

        let zero_pad = chars.next_if_eq(&'0').is_some();
        let mut width_str = String::new();
        while let Some(digit) = chars.next_if(char::is_ascii_digit) {
            width_str.push(digit);
        }

        match chars.peek().copied() {
            Some('d') => {
                chars.next();
                // An absent or unparsable width means "no padding".
                let width: usize = width_str.parse().unwrap_or(0);
                let rendered = if width == 0 {
                    value.to_string()
                } else if zero_pad {
                    format!("{:0>width$}", value, width = width)
                } else {
                    format!("{:>width$}", value, width = width)
                };
                result.push_str(&rendered);
            }
            Some('%') => {
                chars.next();
                result.push('%');
            }
            _ => {
                // Not a directive we understand: reproduce exactly what was
                // consumed, including the zero flag.
                result.push('%');
                if zero_pad {
                    result.push('0');
                }
                result.push_str(&width_str);
            }
        }
    }

    result
}
181
182fn build_line_offsets(data: &[u8]) -> Vec<usize> {
186 let mut offsets = Vec::with_capacity(data.len() / 40 + 2);
187 offsets.push(0);
188 for pos in memchr_iter(b'\n', data) {
189 offsets.push(pos + 1);
190 }
191 if !data.is_empty() && *data.last().unwrap() != b'\n' {
193 offsets.push(data.len());
194 }
195 offsets
196}
197
/// Returns line `idx` of `data` without its trailing `\n`.
///
/// The line occupies `offsets[idx]..offsets[idx + 1]` in `data`.
///
/// # Panics
/// Panics when `idx + 1` is not a valid index into `offsets` or when the
/// offsets do not describe a valid range of `data`. Debug builds also panic
/// when the selected bytes are not valid UTF-8.
#[inline]
fn line_content<'a>(data: &'a [u8], offsets: &[usize], idx: usize) -> &'a str {
    let raw = &data[offsets[idx]..offsets[idx + 1]];
    let line = raw.strip_suffix(b"\n").unwrap_or(raw);

    debug_assert!(
        std::str::from_utf8(line).is_ok(),
        "line_content: line {} is not valid UTF-8",
        idx
    );
    // SAFETY: callers must pass the bytes of a valid UTF-8 string together
    // with offsets that sit at 0, at `data.len()` or right after a `\n` byte,
    // which is what `csplit_file` does (`input.as_bytes()` plus
    // `build_line_offsets`). `\n` is never part of a multi-byte sequence, so
    // cutting at those positions, or dropping a final `\n`, keeps the slice
    // valid UTF-8. The check is skipped in release builds because this runs
    // once per line per regex search.
    unsafe { std::str::from_utf8_unchecked(line) }
}
210
211fn write_chunk_range(
214 data: &[u8],
215 offsets: &[usize],
216 start_line: usize,
217 end_line: usize,
218 filename: &str,
219 config: &CsplitConfig,
220) -> Result<u64, String> {
221 let is_empty = start_line >= end_line;
222 if config.elide_empty && is_empty {
223 return Ok(0);
224 }
225
226 if is_empty {
227 fs::write(filename, b"").map_err(|e| format!("cannot write '{}': {}", filename, e))?;
228 return Ok(0);
229 }
230
231 let byte_start = offsets[start_line];
232 let byte_end = offsets[end_line];
233 let chunk = &data[byte_start..byte_end];
234 let bytes = chunk.len() as u64;
235
236 fs::write(filename, chunk).map_err(|e| format!("cannot write '{}': {}", filename, e))?;
237 Ok(bytes)
238}
239
240fn find_match(
243 data: &[u8],
244 offsets: &[usize],
245 total_lines: usize,
246 regex: &Regex,
247 start: usize,
248) -> Option<usize> {
249 (start..total_lines).find(|&idx| regex.is_match(line_content(data, offsets, idx)))
250}
251
252fn apply_regex_pattern(
255 data: &[u8],
256 offsets: &[usize],
257 total_lines: usize,
258 regex: &str,
259 offset: i64,
260 is_skip: bool,
261 current_line: &mut usize,
262 skip_current: &mut bool,
263 sizes: &mut Vec<u64>,
264 created_files: &mut Vec<String>,
265 file_index: &mut usize,
266 config: &CsplitConfig,
267 graceful_no_match: bool,
268) -> Result<bool, String> {
269 let re = Regex::new(regex).map_err(|e| format!("invalid regex: {}", e))?;
270
271 let search_start = if *skip_current
274 && *current_line < total_lines
275 && re.is_match(line_content(data, offsets, *current_line))
276 {
277 *current_line + 1
278 } else {
279 *current_line
280 };
281
282 let match_idx = match find_match(data, offsets, total_lines, &re, search_start) {
283 Some(idx) => idx,
284 None => {
285 if graceful_no_match {
286 return Ok(false);
287 }
288 return Err(format!("{}: no match", regex));
289 }
290 };
291
292 let target = match_idx as i64 + offset;
293 let split_at = if target < *current_line as i64 {
294 *current_line
295 } else if target as usize > total_lines {
296 total_lines
297 } else {
298 target as usize
299 };
300
301 if is_skip {
302 *current_line = split_at;
304 *skip_current = false;
305 } else {
306 let is_empty = *current_line >= split_at;
308 let filename = output_filename(config, *file_index);
309 let bytes = write_chunk_range(data, offsets, *current_line, split_at, &filename, config)?;
310
311 if !(config.elide_empty && is_empty) {
312 created_files.push(filename);
313 sizes.push(bytes);
314 *file_index += 1;
315 }
316
317 *current_line = split_at;
318 *skip_current = offset == 0;
320 }
321
322 Ok(true)
323}
324
325pub fn csplit_file(
329 input: &str,
330 patterns: &[Pattern],
331 config: &CsplitConfig,
332) -> Result<Vec<u64>, String> {
333 let data = input.as_bytes();
334 let offsets = build_line_offsets(data);
335 let total_lines = if offsets.len() <= 1 {
337 0
338 } else {
339 offsets.len() - 1
340 };
341
342 let mut sizes: Vec<u64> = Vec::new();
343 let mut created_files: Vec<String> = Vec::new();
344 let mut file_index: usize = 0;
345 let mut current_line: usize = 0; let mut skip_current = false; let do_cleanup = |files: &[String], config: &CsplitConfig| {
349 if !config.keep_files {
350 for f in files {
351 let _ = fs::remove_file(f);
352 }
353 }
354 };
355
356 let mut pat_idx = 0;
357 while pat_idx < patterns.len() {
358 match &patterns[pat_idx] {
359 Pattern::LineNumber(n) => {
360 let split_at = *n;
362 if split_at <= current_line {
363 let msg = format!("{}: line number out of range", split_at);
364 do_cleanup(&created_files, config);
365 return Err(msg);
366 }
367
368 let end = if split_at > total_lines {
369 total_lines
370 } else {
371 split_at - 1
372 };
373
374 let is_empty = current_line >= end;
375 let filename = output_filename(config, file_index);
376
377 let bytes = write_chunk_range(data, &offsets, current_line, end, &filename, config)
378 .inspect_err(|_| {
379 do_cleanup(&created_files, config);
380 })?;
381
382 if !(config.elide_empty && is_empty) {
383 created_files.push(filename);
384 sizes.push(bytes);
385 file_index += 1;
386 }
387
388 current_line = end;
389 skip_current = false;
390 pat_idx += 1;
391 }
392 Pattern::Regex { regex, offset } => {
393 let regex = regex.clone();
394 let offset = *offset;
395 if let Err(e) = apply_regex_pattern(
396 data,
397 &offsets,
398 total_lines,
399 ®ex,
400 offset,
401 false,
402 &mut current_line,
403 &mut skip_current,
404 &mut sizes,
405 &mut created_files,
406 &mut file_index,
407 config,
408 false,
409 ) {
410 do_cleanup(&created_files, config);
411 return Err(e);
412 }
413 pat_idx += 1;
414 }
415 Pattern::SkipTo { regex, offset } => {
416 let regex = regex.clone();
417 let offset = *offset;
418 if let Err(e) = apply_regex_pattern(
419 data,
420 &offsets,
421 total_lines,
422 ®ex,
423 offset,
424 true,
425 &mut current_line,
426 &mut skip_current,
427 &mut sizes,
428 &mut created_files,
429 &mut file_index,
430 config,
431 false,
432 ) {
433 do_cleanup(&created_files, config);
434 return Err(e);
435 }
436 pat_idx += 1;
437 }
438 Pattern::Repeat(n) => {
439 let n = *n;
440 if pat_idx == 0 {
441 do_cleanup(&created_files, config);
442 return Err("{N}: no preceding pattern to repeat".to_string());
443 }
444 let prev_pat = find_prev_pattern(patterns, pat_idx);
446 let prev_pat = match prev_pat {
447 Some(p) => p.clone(),
448 None => {
449 do_cleanup(&created_files, config);
450 return Err("{N}: no preceding pattern to repeat".to_string());
451 }
452 };
453 for _ in 0..n {
454 match &prev_pat {
455 Pattern::LineNumber(ln) => {
456 let end = if *ln > total_lines {
457 total_lines
458 } else {
459 *ln - 1
460 };
461 if end <= current_line {
462 let msg = format!("{}: line number out of range", ln);
463 do_cleanup(&created_files, config);
464 return Err(msg);
465 }
466 let is_empty = current_line >= end;
467 let filename = output_filename(config, file_index);
468 let bytes = write_chunk_range(
469 data,
470 &offsets,
471 current_line,
472 end,
473 &filename,
474 config,
475 )
476 .inspect_err(|_| {
477 do_cleanup(&created_files, config);
478 })?;
479 if !(config.elide_empty && is_empty) {
480 created_files.push(filename);
481 sizes.push(bytes);
482 file_index += 1;
483 }
484 current_line = end;
485 skip_current = false;
486 }
487 Pattern::Regex { regex, offset } => {
488 if let Err(e) = apply_regex_pattern(
489 data,
490 &offsets,
491 total_lines,
492 regex,
493 *offset,
494 false,
495 &mut current_line,
496 &mut skip_current,
497 &mut sizes,
498 &mut created_files,
499 &mut file_index,
500 config,
501 false,
502 ) {
503 do_cleanup(&created_files, config);
504 return Err(e);
505 }
506 }
507 Pattern::SkipTo { regex, offset } => {
508 if let Err(e) = apply_regex_pattern(
509 data,
510 &offsets,
511 total_lines,
512 regex,
513 *offset,
514 true,
515 &mut current_line,
516 &mut skip_current,
517 &mut sizes,
518 &mut created_files,
519 &mut file_index,
520 config,
521 false,
522 ) {
523 do_cleanup(&created_files, config);
524 return Err(e);
525 }
526 }
527 _ => {}
528 }
529 }
530 pat_idx += 1;
531 }
532 Pattern::RepeatForever => {
533 if pat_idx == 0 {
534 do_cleanup(&created_files, config);
535 return Err("{*}: no preceding pattern to repeat".to_string());
536 }
537 let prev_pat = find_prev_pattern(patterns, pat_idx);
538 let prev_pat = match prev_pat {
539 Some(p) => p.clone(),
540 None => {
541 do_cleanup(&created_files, config);
542 return Err("{*}: no preceding pattern to repeat".to_string());
543 }
544 };
545 loop {
547 match &prev_pat {
548 Pattern::Regex { regex, offset } => {
549 match apply_regex_pattern(
550 data,
551 &offsets,
552 total_lines,
553 regex,
554 *offset,
555 false,
556 &mut current_line,
557 &mut skip_current,
558 &mut sizes,
559 &mut created_files,
560 &mut file_index,
561 config,
562 true, ) {
564 Ok(true) => continue,
565 Ok(false) => break,
566 Err(e) => {
567 do_cleanup(&created_files, config);
568 return Err(e);
569 }
570 }
571 }
572 Pattern::SkipTo { regex, offset } => {
573 match apply_regex_pattern(
574 data,
575 &offsets,
576 total_lines,
577 regex,
578 *offset,
579 true,
580 &mut current_line,
581 &mut skip_current,
582 &mut sizes,
583 &mut created_files,
584 &mut file_index,
585 config,
586 true,
587 ) {
588 Ok(true) => continue,
589 Ok(false) => break,
590 Err(e) => {
591 do_cleanup(&created_files, config);
592 return Err(e);
593 }
594 }
595 }
596 _ => break,
597 }
598 }
599 pat_idx += 1;
600 }
601 }
602 }
603
604 if current_line < total_lines {
606 let filename = output_filename(config, file_index);
607
608 let bytes = write_chunk_range(data, &offsets, current_line, total_lines, &filename, config)
609 .inspect_err(|_| {
610 do_cleanup(&created_files, config);
611 })?;
612
613 if !(config.elide_empty && current_line >= total_lines) {
614 created_files.push(filename);
615 sizes.push(bytes);
616 }
617 } else if !config.elide_empty {
618 let filename = output_filename(config, file_index);
620 let bytes =
621 write_chunk_range(data, &offsets, 0, 0, &filename, config).inspect_err(|_| {
622 do_cleanup(&created_files, config);
623 })?;
624 created_files.push(filename);
625 sizes.push(bytes);
626 }
627
628 Ok(sizes)
629}
630
631fn find_prev_pattern(patterns: &[Pattern], idx: usize) -> Option<&Pattern> {
633 let mut i = idx;
634 while i > 0 {
635 i -= 1;
636 match &patterns[i] {
637 Pattern::Repeat(_) | Pattern::RepeatForever => continue,
638 other => return Some(other),
639 }
640 }
641 None
642}
643
644pub fn csplit_from_path(
646 path: &str,
647 patterns: &[Pattern],
648 config: &CsplitConfig,
649) -> Result<Vec<u64>, String> {
650 let input = if path == "-" {
651 let mut buf = String::new();
652 io::Read::read_to_string(&mut io::stdin().lock(), &mut buf)
653 .map_err(|e| format!("read error: {}", e))?;
654 buf
655 } else {
656 std::fs::read_to_string(path).map_err(|e| format!("cannot open '{}': {}", path, e))?
657 };
658
659 csplit_file(&input, patterns, config)
660}
661
/// Prints every size in `sizes` on its own line on standard output.
pub fn print_sizes(sizes: &[u64]) {
    sizes.iter().for_each(|size| println!("{}", size));
}