coreutils_rs/csplit/
core.rs

1use memchr::memchr_iter;
2use regex::Regex;
3use std::fs;
4use std::io;
5
/// One positional csplit argument after parsing.
#[derive(Debug, Clone)]
pub enum Pattern {
    /// `/REGEX/[OFFSET]`: end the current output file just before the next
    /// line matching `regex`, with the split point shifted by `offset` lines.
    Regex { regex: String, offset: i64 },
    /// `%REGEX%[OFFSET]`: locate the split point like `Regex`, but discard the
    /// lines in front of it instead of writing them to an output file.
    SkipTo { regex: String, offset: i64 },
    /// `N`: end the current output file just before line `N` (1-based).
    LineNumber(usize),
    /// `{N}`: apply the preceding pattern `N` more times.
    Repeat(usize),
    /// `{*}`: apply the preceding pattern as many times as possible.
    RepeatForever,
}
21
/// Options controlling how output files are named and which ones are kept.
#[derive(Debug, Clone)]
pub struct CsplitConfig {
    /// Text placed in front of every output file's numeric suffix.
    pub prefix: String,
    /// printf-style suffix template; when empty, `digits` decides the suffix.
    pub suffix_format: String,
    /// Minimum width of the zero-padded suffix used when `suffix_format` is empty.
    pub digits: usize,
    /// Leave already-created output files in place when splitting fails.
    pub keep_files: bool,
    /// Not consulted anywhere in this file; presumably tells the caller not to
    /// print the output file sizes (confirm against the command-line front end).
    pub quiet: bool,
    /// Do not create output files for chunks that contain no lines.
    pub elide_empty: bool,
}

impl Default for CsplitConfig {
    /// Prefix `xx`, two-digit suffixes, no custom suffix format, all flags off.
    fn default() -> Self {
        CsplitConfig {
            prefix: String::from("xx"),
            suffix_format: String::new(),
            digits: 2,
            keep_files: false,
            quiet: false,
            elide_empty: false,
        }
    }
}
45
46/// Parse a pattern string into a Pattern enum.
47pub fn parse_pattern(s: &str) -> Result<Pattern, String> {
48    let s = s.trim();
49
50    // {*} - repeat forever
51    if s == "{*}" {
52        return Ok(Pattern::RepeatForever);
53    }
54
55    // {N} - repeat N times
56    if s.starts_with('{') && s.ends_with('}') {
57        let inner = &s[1..s.len() - 1];
58        let n: usize = inner
59            .parse()
60            .map_err(|_| format!("invalid repeat count: '{}'", s))?;
61        return Ok(Pattern::Repeat(n));
62    }
63
64    // /REGEX/[OFFSET] - split before matching line
65    if s.starts_with('/') {
66        let rest = &s[1..];
67        if let Some(end_pos) = rest.rfind('/') {
68            let regex_str = &rest[..end_pos];
69            let after = rest[end_pos + 1..].trim();
70            let offset = if after.is_empty() {
71                0
72            } else {
73                after
74                    .parse::<i64>()
75                    .map_err(|_| format!("invalid offset: '{}'", after))?
76            };
77            // Validate regex
78            Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
79            return Ok(Pattern::Regex {
80                regex: regex_str.to_string(),
81                offset,
82            });
83        }
84        return Err(format!("unmatched '/' in pattern: '{}'", s));
85    }
86
87    // %REGEX%[OFFSET] - skip to matching line
88    if s.starts_with('%') {
89        let rest = &s[1..];
90        if let Some(end_pos) = rest.rfind('%') {
91            let regex_str = &rest[..end_pos];
92            let after = rest[end_pos + 1..].trim();
93            let offset = if after.is_empty() {
94                0
95            } else {
96                after
97                    .parse::<i64>()
98                    .map_err(|_| format!("invalid offset: '{}'", after))?
99            };
100            // Validate regex
101            Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
102            return Ok(Pattern::SkipTo {
103                regex: regex_str.to_string(),
104                offset,
105            });
106        }
107        return Err(format!("unmatched '%' in pattern: '{}'", s));
108    }
109
110    // LINE_NUMBER - split at line number
111    let n: usize = s.parse().map_err(|_| format!("invalid pattern: '{}'", s))?;
112    if n == 0 {
113        return Err("line number must be positive".to_string());
114    }
115    Ok(Pattern::LineNumber(n))
116}
117
118/// Generate the output filename for a given file index.
119pub fn output_filename(config: &CsplitConfig, index: usize) -> String {
120    if config.suffix_format.is_empty() {
121        format!("{}{:0>width$}", config.prefix, index, width = config.digits)
122    } else {
123        // Simple sprintf-like formatting: support %02d, %03d, etc.
124        let suffix = format_suffix(&config.suffix_format, index);
125        format!("{}{}", config.prefix, suffix)
126    }
127}
128
/// Expand a printf-like suffix template with `value`.
///
/// Supported directives are `%[0][WIDTH]C`, where `C` is one of `d`, `i`, `u`
/// (decimal), `o` (octal), `x` (lower-case hex) or `X` (upper-case hex), plus
/// `%%` for a literal percent sign. A leading `0` pads with zeros instead of
/// spaces; `WIDTH` is the minimum field width and the number is
/// right-aligned in it.
///
/// Any other directive is copied to the output unchanged, including its `0`
/// flag and width, so an unsupported template stays recognisable instead of
/// being silently mangled. Text outside directives is copied verbatim.
pub fn format_suffix(fmt: &str, value: usize) -> String {
    let mut result = String::with_capacity(fmt.len() + 8);
    let mut chars = fmt.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '%' {
            result.push(ch);
            continue;
        }

        // Optional zero flag, then optional decimal width.
        let zero_pad = chars.next_if_eq(&'0').is_some();
        let mut width_str = String::new();
        while let Some(digit) = chars.next_if(char::is_ascii_digit) {
            width_str.push(digit);
        }

        // Render the number if the conversion character is one we know.
        let rendered = match chars.peek().copied() {
            Some('d' | 'i' | 'u') => Some(value.to_string()),
            Some('o') => Some(format!("{:o}", value)),
            Some('x') => Some(format!("{:x}", value)),
            Some('X') => Some(format!("{:X}", value)),
            _ => None,
        };

        match rendered {
            Some(digits) => {
                chars.next(); // consume the conversion character
                // A missing or unparsable width means "no minimum width".
                let width: usize = width_str.parse().unwrap_or(0);
                let fill = if zero_pad { '0' } else { ' ' };
                let padding = width.saturating_sub(digits.len());
                result.extend(std::iter::repeat(fill).take(padding));
                result.push_str(&digits);
            }
            None if chars.peek() == Some(&'%') => {
                chars.next();
                result.push('%');
            }
            None => {
                // Unknown directive: put back everything consumed so far. The
                // unrecognised character itself is copied by the next loop
                // iteration.
                result.push('%');
                if zero_pad {
                    result.push('0');
                }
                result.push_str(&width_str);
            }
        }
    }

    result
}
181
182/// Build a line offset table from input bytes using SIMD-accelerated newline scan.
183/// Returns offsets where offsets[i] is the byte position of the start of line i,
184/// and offsets[total_lines] is the byte position past the end of line total_lines-1.
185fn build_line_offsets(data: &[u8]) -> Vec<usize> {
186    let mut offsets = Vec::with_capacity(data.len() / 40 + 2);
187    offsets.push(0);
188    for pos in memchr_iter(b'\n', data) {
189        offsets.push(pos + 1);
190    }
191    // If data doesn't end with \n, add end sentinel
192    if !data.is_empty() && *data.last().unwrap() != b'\n' {
193        offsets.push(data.len());
194    }
195    offsets
196}
197
/// Return line `idx` as text, without its trailing newline (if any).
///
/// `offsets` must hold the line boundaries of `data`, so that
/// `offsets[idx]..offsets[idx + 1]` is the byte range of the line.
///
/// # Panics
///
/// Panics if `idx + 1` is not a valid index into `offsets` or the range lies
/// outside `data`.
#[inline]
fn line_content<'a>(data: &'a [u8], offsets: &[usize], idx: usize) -> &'a str {
    let raw = &data[offsets[idx]..offsets[idx + 1]];
    let without_newline = raw.strip_suffix(b"\n").unwrap_or(raw);
    // SAFETY: callers pass data from csplit_file which takes &str input,
    // guaranteeing the bytes are valid UTF-8.
    unsafe { std::str::from_utf8_unchecked(without_newline) }
}
210
211/// Write the byte range for lines [start_line..end_line) to a file.
212/// Returns the number of bytes written.
213fn write_chunk_range(
214    data: &[u8],
215    offsets: &[usize],
216    start_line: usize,
217    end_line: usize,
218    filename: &str,
219    config: &CsplitConfig,
220) -> Result<u64, String> {
221    let is_empty = start_line >= end_line;
222    if config.elide_empty && is_empty {
223        return Ok(0);
224    }
225
226    if is_empty {
227        fs::write(filename, b"").map_err(|e| format!("cannot write '{}': {}", filename, e))?;
228        return Ok(0);
229    }
230
231    let byte_start = offsets[start_line];
232    let byte_end = offsets[end_line];
233    let chunk = &data[byte_start..byte_end];
234    let bytes = chunk.len() as u64;
235
236    fs::write(filename, chunk).map_err(|e| format!("cannot write '{}': {}", filename, e))?;
237    Ok(bytes)
238}
239
240/// Find the first line matching a regex starting from `start`, returning its index.
241/// Matches per-line to replicate GNU csplit behavior (regex sees individual lines).
242fn find_match(
243    data: &[u8],
244    offsets: &[usize],
245    total_lines: usize,
246    regex: &Regex,
247    start: usize,
248) -> Option<usize> {
249    (start..total_lines).find(|&idx| regex.is_match(line_content(data, offsets, idx)))
250}
251
252/// Apply a single regex or skip-to pattern. Returns Ok(true) if matched,
253/// Ok(false) if no match (only used for repeat-forever graceful stop).
254fn apply_regex_pattern(
255    data: &[u8],
256    offsets: &[usize],
257    total_lines: usize,
258    regex: &str,
259    offset: i64,
260    is_skip: bool,
261    current_line: &mut usize,
262    skip_current: &mut bool,
263    sizes: &mut Vec<u64>,
264    created_files: &mut Vec<String>,
265    file_index: &mut usize,
266    config: &CsplitConfig,
267    graceful_no_match: bool,
268) -> Result<bool, String> {
269    let re = Regex::new(regex).map_err(|e| format!("invalid regex: {}", e))?;
270
271    // When skip_current is set, the line at current_line was the match boundary
272    // from a previous regex split — skip it to find the NEXT occurrence.
273    let search_start = if *skip_current
274        && *current_line < total_lines
275        && re.is_match(line_content(data, offsets, *current_line))
276    {
277        *current_line + 1
278    } else {
279        *current_line
280    };
281
282    let match_idx = match find_match(data, offsets, total_lines, &re, search_start) {
283        Some(idx) => idx,
284        None => {
285            if graceful_no_match {
286                return Ok(false);
287            }
288            return Err(format!("{}: no match", regex));
289        }
290    };
291
292    let target = match_idx as i64 + offset;
293    let split_at = if target < *current_line as i64 {
294        *current_line
295    } else if target as usize > total_lines {
296        total_lines
297    } else {
298        target as usize
299    };
300
301    if is_skip {
302        // SkipTo: discard lines from current_line to split_at
303        *current_line = split_at;
304        *skip_current = false;
305    } else {
306        // Regex: write chunk from current_line to split_at
307        let is_empty = *current_line >= split_at;
308        let filename = output_filename(config, *file_index);
309        let bytes = write_chunk_range(data, offsets, *current_line, split_at, &filename, config)?;
310
311        if !(config.elide_empty && is_empty) {
312            created_files.push(filename);
313            sizes.push(bytes);
314            *file_index += 1;
315        }
316
317        *current_line = split_at;
318        // After a regex match with offset 0, current_line is AT the match line
319        *skip_current = offset == 0;
320    }
321
322    Ok(true)
323}
324
325/// Split a file based on patterns.
326///
327/// Returns the sizes (in bytes) of each created output file.
328pub fn csplit_file(
329    input: &str,
330    patterns: &[Pattern],
331    config: &CsplitConfig,
332) -> Result<Vec<u64>, String> {
333    let data = input.as_bytes();
334    let offsets = build_line_offsets(data);
335    // total_lines = number of lines (offsets has total_lines + 1 entries)
336    let total_lines = if offsets.len() <= 1 {
337        0
338    } else {
339        offsets.len() - 1
340    };
341
342    let mut sizes: Vec<u64> = Vec::new();
343    let mut created_files: Vec<String> = Vec::new();
344    let mut file_index: usize = 0;
345    let mut current_line: usize = 0; // 0-based index into lines
346    let mut skip_current = false; // true when current_line is a regex match boundary
347
348    let do_cleanup = |files: &[String], config: &CsplitConfig| {
349        if !config.keep_files {
350            for f in files {
351                let _ = fs::remove_file(f);
352            }
353        }
354    };
355
356    let mut pat_idx = 0;
357    while pat_idx < patterns.len() {
358        match &patterns[pat_idx] {
359            Pattern::LineNumber(n) => {
360                // Split at line number n (1-based).
361                let split_at = *n;
362                if split_at <= current_line {
363                    let msg = format!("{}: line number out of range", split_at);
364                    do_cleanup(&created_files, config);
365                    return Err(msg);
366                }
367
368                let end = if split_at > total_lines {
369                    total_lines
370                } else {
371                    split_at - 1
372                };
373
374                let is_empty = current_line >= end;
375                let filename = output_filename(config, file_index);
376
377                let bytes = write_chunk_range(data, &offsets, current_line, end, &filename, config)
378                    .inspect_err(|_| {
379                        do_cleanup(&created_files, config);
380                    })?;
381
382                if !(config.elide_empty && is_empty) {
383                    created_files.push(filename);
384                    sizes.push(bytes);
385                    file_index += 1;
386                }
387
388                current_line = end;
389                skip_current = false;
390                pat_idx += 1;
391            }
392            Pattern::Regex { regex, offset } => {
393                let regex = regex.clone();
394                let offset = *offset;
395                if let Err(e) = apply_regex_pattern(
396                    data,
397                    &offsets,
398                    total_lines,
399                    &regex,
400                    offset,
401                    false,
402                    &mut current_line,
403                    &mut skip_current,
404                    &mut sizes,
405                    &mut created_files,
406                    &mut file_index,
407                    config,
408                    false,
409                ) {
410                    do_cleanup(&created_files, config);
411                    return Err(e);
412                }
413                pat_idx += 1;
414            }
415            Pattern::SkipTo { regex, offset } => {
416                let regex = regex.clone();
417                let offset = *offset;
418                if let Err(e) = apply_regex_pattern(
419                    data,
420                    &offsets,
421                    total_lines,
422                    &regex,
423                    offset,
424                    true,
425                    &mut current_line,
426                    &mut skip_current,
427                    &mut sizes,
428                    &mut created_files,
429                    &mut file_index,
430                    config,
431                    false,
432                ) {
433                    do_cleanup(&created_files, config);
434                    return Err(e);
435                }
436                pat_idx += 1;
437            }
438            Pattern::Repeat(n) => {
439                let n = *n;
440                if pat_idx == 0 {
441                    do_cleanup(&created_files, config);
442                    return Err("{N}: no preceding pattern to repeat".to_string());
443                }
444                // Find the preceding non-repeat pattern
445                let prev_pat = find_prev_pattern(patterns, pat_idx);
446                let prev_pat = match prev_pat {
447                    Some(p) => p.clone(),
448                    None => {
449                        do_cleanup(&created_files, config);
450                        return Err("{N}: no preceding pattern to repeat".to_string());
451                    }
452                };
453                for _ in 0..n {
454                    match &prev_pat {
455                        Pattern::LineNumber(ln) => {
456                            let end = if *ln > total_lines {
457                                total_lines
458                            } else {
459                                *ln - 1
460                            };
461                            if end <= current_line {
462                                let msg = format!("{}: line number out of range", ln);
463                                do_cleanup(&created_files, config);
464                                return Err(msg);
465                            }
466                            let is_empty = current_line >= end;
467                            let filename = output_filename(config, file_index);
468                            let bytes = write_chunk_range(
469                                data,
470                                &offsets,
471                                current_line,
472                                end,
473                                &filename,
474                                config,
475                            )
476                            .inspect_err(|_| {
477                                do_cleanup(&created_files, config);
478                            })?;
479                            if !(config.elide_empty && is_empty) {
480                                created_files.push(filename);
481                                sizes.push(bytes);
482                                file_index += 1;
483                            }
484                            current_line = end;
485                            skip_current = false;
486                        }
487                        Pattern::Regex { regex, offset } => {
488                            if let Err(e) = apply_regex_pattern(
489                                data,
490                                &offsets,
491                                total_lines,
492                                regex,
493                                *offset,
494                                false,
495                                &mut current_line,
496                                &mut skip_current,
497                                &mut sizes,
498                                &mut created_files,
499                                &mut file_index,
500                                config,
501                                false,
502                            ) {
503                                do_cleanup(&created_files, config);
504                                return Err(e);
505                            }
506                        }
507                        Pattern::SkipTo { regex, offset } => {
508                            if let Err(e) = apply_regex_pattern(
509                                data,
510                                &offsets,
511                                total_lines,
512                                regex,
513                                *offset,
514                                true,
515                                &mut current_line,
516                                &mut skip_current,
517                                &mut sizes,
518                                &mut created_files,
519                                &mut file_index,
520                                config,
521                                false,
522                            ) {
523                                do_cleanup(&created_files, config);
524                                return Err(e);
525                            }
526                        }
527                        _ => {}
528                    }
529                }
530                pat_idx += 1;
531            }
532            Pattern::RepeatForever => {
533                if pat_idx == 0 {
534                    do_cleanup(&created_files, config);
535                    return Err("{*}: no preceding pattern to repeat".to_string());
536                }
537                let prev_pat = find_prev_pattern(patterns, pat_idx);
538                let prev_pat = match prev_pat {
539                    Some(p) => p.clone(),
540                    None => {
541                        do_cleanup(&created_files, config);
542                        return Err("{*}: no preceding pattern to repeat".to_string());
543                    }
544                };
545                // Repeat until the pattern fails to match (graceful stop)
546                loop {
547                    match &prev_pat {
548                        Pattern::Regex { regex, offset } => {
549                            match apply_regex_pattern(
550                                data,
551                                &offsets,
552                                total_lines,
553                                regex,
554                                *offset,
555                                false,
556                                &mut current_line,
557                                &mut skip_current,
558                                &mut sizes,
559                                &mut created_files,
560                                &mut file_index,
561                                config,
562                                true, // graceful no-match
563                            ) {
564                                Ok(true) => continue,
565                                Ok(false) => break,
566                                Err(e) => {
567                                    do_cleanup(&created_files, config);
568                                    return Err(e);
569                                }
570                            }
571                        }
572                        Pattern::SkipTo { regex, offset } => {
573                            match apply_regex_pattern(
574                                data,
575                                &offsets,
576                                total_lines,
577                                regex,
578                                *offset,
579                                true,
580                                &mut current_line,
581                                &mut skip_current,
582                                &mut sizes,
583                                &mut created_files,
584                                &mut file_index,
585                                config,
586                                true,
587                            ) {
588                                Ok(true) => continue,
589                                Ok(false) => break,
590                                Err(e) => {
591                                    do_cleanup(&created_files, config);
592                                    return Err(e);
593                                }
594                            }
595                        }
596                        _ => break,
597                    }
598                }
599                pat_idx += 1;
600            }
601        }
602    }
603
604    // Write remaining lines as the final chunk
605    if current_line < total_lines {
606        let filename = output_filename(config, file_index);
607
608        let bytes = write_chunk_range(data, &offsets, current_line, total_lines, &filename, config)
609            .inspect_err(|_| {
610                do_cleanup(&created_files, config);
611            })?;
612
613        if !(config.elide_empty && current_line >= total_lines) {
614            created_files.push(filename);
615            sizes.push(bytes);
616        }
617    } else if !config.elide_empty {
618        // Write an empty final file
619        let filename = output_filename(config, file_index);
620        let bytes =
621            write_chunk_range(data, &offsets, 0, 0, &filename, config).inspect_err(|_| {
622                do_cleanup(&created_files, config);
623            })?;
624        created_files.push(filename);
625        sizes.push(bytes);
626    }
627
628    Ok(sizes)
629}
630
631/// Find the preceding non-repeat pattern.
632fn find_prev_pattern(patterns: &[Pattern], idx: usize) -> Option<&Pattern> {
633    let mut i = idx;
634    while i > 0 {
635        i -= 1;
636        match &patterns[i] {
637            Pattern::Repeat(_) | Pattern::RepeatForever => continue,
638            other => return Some(other),
639        }
640    }
641    None
642}
643
644/// Split a file by reading from a path or stdin ("-").
645pub fn csplit_from_path(
646    path: &str,
647    patterns: &[Pattern],
648    config: &CsplitConfig,
649) -> Result<Vec<u64>, String> {
650    let input = if path == "-" {
651        let mut buf = String::new();
652        io::Read::read_to_string(&mut io::stdin().lock(), &mut buf)
653            .map_err(|e| format!("read error: {}", e))?;
654        buf
655    } else {
656        std::fs::read_to_string(path).map_err(|e| format!("cannot open '{}': {}", path, e))?
657    };
658
659    csplit_file(&input, patterns, config)
660}
661
/// Write each size to standard output, one per line, in the given order.
pub fn print_sizes(sizes: &[u64]) {
    sizes.iter().for_each(|bytes| println!("{}", bytes));
}
coreutils_rs/csplit/core.rs

coreutils_rs/csplit/
core.rs