Skip to main content

coreutils_rs/csplit/
core.rs

1use regex::Regex;
2use std::fs;
3use std::io;
4
/// A single parsed csplit pattern argument.
///
/// Values are produced by [`parse_pattern`] and consumed, in order, by
/// [`csplit_file`]. The type is comparable (`PartialEq`/`Eq`) so parsed
/// results can be checked directly in assertions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Pattern {
    /// `/REGEX/[OFFSET]`: end the current chunk before the next line that
    /// matches `regex`, shifted by `offset` lines (may be negative).
    Regex { regex: String, offset: i64 },
    /// `%REGEX%[OFFSET]`: advance to the next line that matches `regex`
    /// (shifted by `offset`) without writing the skipped lines to any file.
    SkipTo { regex: String, offset: i64 },
    /// `N`: end the current chunk before the 1-based line number `N`.
    LineNumber(usize),
    /// `{N}`: apply the preceding non-repeat pattern `N` more times.
    Repeat(usize),
    /// `{*}`: apply the preceding non-repeat pattern until it stops matching.
    RepeatForever,
}
20
/// Options controlling how csplit names, writes and cleans up output files.
///
/// The type is comparable (`PartialEq`/`Eq`) so configurations can be checked
/// directly in assertions.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CsplitConfig {
    /// Text placed at the start of every output file name.
    pub prefix: String,
    /// printf-style template for the numeric part of the file name; when
    /// empty, a zero-padded index of `digits` width is used instead.
    pub suffix_format: String,
    /// Minimum width of the zero-padded index when `suffix_format` is empty.
    pub digits: usize,
    /// When `false`, files created so far are removed if splitting fails.
    pub keep_files: bool,
    /// NOTE(review): not read by any function visible in this file —
    /// presumably lets the caller suppress the size report; confirm.
    pub quiet: bool,
    /// When `true`, chunks that contain no lines are not written or reported.
    pub elide_empty: bool,
}
31
32impl Default for CsplitConfig {
33    fn default() -> Self {
34        Self {
35            prefix: "xx".to_string(),
36            suffix_format: String::new(),
37            digits: 2,
38            keep_files: false,
39            quiet: false,
40            elide_empty: false,
41        }
42    }
43}
44
45/// Parse a pattern string into a Pattern enum.
46pub fn parse_pattern(s: &str) -> Result<Pattern, String> {
47    let s = s.trim();
48
49    // {*} - repeat forever
50    if s == "{*}" {
51        return Ok(Pattern::RepeatForever);
52    }
53
54    // {N} - repeat N times
55    if s.starts_with('{') && s.ends_with('}') {
56        let inner = &s[1..s.len() - 1];
57        let n: usize = inner
58            .parse()
59            .map_err(|_| format!("invalid repeat count: '{}'", s))?;
60        return Ok(Pattern::Repeat(n));
61    }
62
63    // /REGEX/[OFFSET] - split before matching line
64    if s.starts_with('/') {
65        let rest = &s[1..];
66        if let Some(end_pos) = rest.rfind('/') {
67            let regex_str = &rest[..end_pos];
68            let after = rest[end_pos + 1..].trim();
69            let offset = if after.is_empty() {
70                0
71            } else {
72                after
73                    .parse::<i64>()
74                    .map_err(|_| format!("invalid offset: '{}'", after))?
75            };
76            // Validate regex
77            Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
78            return Ok(Pattern::Regex {
79                regex: regex_str.to_string(),
80                offset,
81            });
82        }
83        return Err(format!("unmatched '/' in pattern: '{}'", s));
84    }
85
86    // %REGEX%[OFFSET] - skip to matching line
87    if s.starts_with('%') {
88        let rest = &s[1..];
89        if let Some(end_pos) = rest.rfind('%') {
90            let regex_str = &rest[..end_pos];
91            let after = rest[end_pos + 1..].trim();
92            let offset = if after.is_empty() {
93                0
94            } else {
95                after
96                    .parse::<i64>()
97                    .map_err(|_| format!("invalid offset: '{}'", after))?
98            };
99            // Validate regex
100            Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
101            return Ok(Pattern::SkipTo {
102                regex: regex_str.to_string(),
103                offset,
104            });
105        }
106        return Err(format!("unmatched '%' in pattern: '{}'", s));
107    }
108
109    // LINE_NUMBER - split at line number
110    let n: usize = s.parse().map_err(|_| format!("invalid pattern: '{}'", s))?;
111    if n == 0 {
112        return Err("line number must be positive".to_string());
113    }
114    Ok(Pattern::LineNumber(n))
115}
116
117/// Generate the output filename for a given file index.
118pub fn output_filename(config: &CsplitConfig, index: usize) -> String {
119    if config.suffix_format.is_empty() {
120        format!("{}{:0>width$}", config.prefix, index, width = config.digits)
121    } else {
122        // Simple sprintf-like formatting: support %02d, %03d, etc.
123        let suffix = format_suffix(&config.suffix_format, index);
124        format!("{}{}", config.prefix, suffix)
125    }
126}
127
/// Minimal printf-style formatter for output-file suffixes.
///
/// Supported directives:
///
/// * `%d` — the value in decimal
/// * `%Nd` — right-aligned in a field of `N` characters, space padded
/// * `%0Nd` — right-aligned in a field of `N` characters, zero padded
/// * `%%` — a literal percent sign
///
/// Any other directive is not interpreted: the `%`, the `0` flag and the
/// width digits that were scanned are copied to the output unchanged, and the
/// following characters are then treated as ordinary text. All characters
/// outside directives are copied verbatim.
pub fn format_suffix(fmt: &str, value: usize) -> String {
    let mut result = String::with_capacity(fmt.len());
    let mut chars = fmt.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '%' {
            result.push(ch);
            continue;
        }

        // Optional zero-padding flag.
        let zero_pad = chars.peek() == Some(&'0');
        if zero_pad {
            chars.next();
        }

        // Optional field width.
        let mut width_str = String::new();
        while let Some(&c) = chars.peek() {
            if !c.is_ascii_digit() {
                break;
            }
            width_str.push(c);
            chars.next();
        }

        match chars.peek().copied() {
            Some('d') => {
                chars.next();
                // A missing or unparsable width means "no minimum width";
                // a width of 0 makes both paddings a no-op.
                let width: usize = width_str.parse().unwrap_or(0);
                let rendered = if zero_pad {
                    format!("{:0>width$}", value, width = width)
                } else {
                    format!("{:>width$}", value, width = width)
                };
                result.push_str(&rendered);
            }
            Some('%') => {
                chars.next();
                result.push('%');
            }
            _ => {
                // Unknown directive: reproduce exactly what was consumed,
                // including the `0` flag, so the text round-trips intact.
                result.push('%');
                if zero_pad {
                    result.push('0');
                }
                result.push_str(&width_str);
            }
        }
    }

    result
}
180
181/// Write lines to a file, returning the number of bytes written.
182fn write_chunk(lines: &[String], filename: &str, config: &CsplitConfig) -> Result<u64, String> {
183    if config.elide_empty && lines.is_empty() {
184        return Ok(0);
185    }
186
187    let mut content = String::new();
188    for line in lines {
189        content.push_str(line);
190        content.push('\n');
191    }
192    let bytes = content.len() as u64;
193
194    if config.elide_empty && bytes == 0 {
195        return Ok(0);
196    }
197
198    fs::write(filename, &content).map_err(|e| format!("cannot write '{}': {}", filename, e))?;
199
200    Ok(bytes)
201}
202
203/// Find the first line matching a regex starting from `start`, returning its index.
204fn find_match(lines: &[String], regex: &Regex, start: usize) -> Option<usize> {
205    for (idx, line) in lines.iter().enumerate().skip(start) {
206        if regex.is_match(line) {
207            return Some(idx);
208        }
209    }
210    None
211}
212
213/// Apply a single regex or skip-to pattern. Returns Ok(true) if matched,
214/// Ok(false) if no match (only used for repeat-forever graceful stop).
215/// For non-repeat patterns, no match is always an error.
216fn apply_regex_pattern(
217    lines: &[String],
218    total_lines: usize,
219    regex: &str,
220    offset: i64,
221    is_skip: bool,
222    current_line: &mut usize,
223    skip_current: &mut bool,
224    sizes: &mut Vec<u64>,
225    created_files: &mut Vec<String>,
226    file_index: &mut usize,
227    config: &CsplitConfig,
228    graceful_no_match: bool,
229) -> Result<bool, String> {
230    let re = Regex::new(regex).map_err(|e| format!("invalid regex: {}", e))?;
231
232    // When skip_current is set, the line at current_line was the match boundary
233    // from a previous regex split — skip it to find the NEXT occurrence.
234    let search_start =
235        if *skip_current && *current_line < total_lines && re.is_match(&lines[*current_line]) {
236            *current_line + 1
237        } else {
238            *current_line
239        };
240
241    let match_idx = match find_match(lines, &re, search_start) {
242        Some(idx) => idx,
243        None => {
244            if graceful_no_match {
245                return Ok(false);
246            }
247            return Err(format!("{}: no match", regex));
248        }
249    };
250
251    let target = match_idx as i64 + offset;
252    let split_at = if target < *current_line as i64 {
253        *current_line
254    } else if target as usize > total_lines {
255        total_lines
256    } else {
257        target as usize
258    };
259
260    if is_skip {
261        // SkipTo: discard lines from current_line to split_at
262        *current_line = split_at;
263        *skip_current = false;
264    } else {
265        // Regex: write chunk from current_line to split_at
266        let chunk_lines = &lines[*current_line..split_at];
267        let filename = output_filename(config, *file_index);
268        let bytes = write_chunk(chunk_lines, &filename, config)?;
269
270        if !(config.elide_empty && chunk_lines.is_empty()) {
271            created_files.push(filename);
272            sizes.push(bytes);
273            *file_index += 1;
274        }
275
276        *current_line = split_at;
277        // After a regex match with offset 0, current_line is AT the match line
278        *skip_current = offset == 0;
279    }
280
281    Ok(true)
282}
283
284/// Split a file based on patterns.
285///
286/// Returns the sizes (in bytes) of each created output file.
287pub fn csplit_file(
288    input: &str,
289    patterns: &[Pattern],
290    config: &CsplitConfig,
291) -> Result<Vec<u64>, String> {
292    let lines: Vec<String> = input.lines().map(|l| l.to_string()).collect();
293    let total_lines = lines.len();
294
295    let mut sizes: Vec<u64> = Vec::new();
296    let mut created_files: Vec<String> = Vec::new();
297    let mut file_index: usize = 0;
298    let mut current_line: usize = 0; // 0-based index into lines
299    let mut skip_current = false; // true when current_line is a regex match boundary
300
301    let do_cleanup = |files: &[String], config: &CsplitConfig| {
302        if !config.keep_files {
303            for f in files {
304                let _ = fs::remove_file(f);
305            }
306        }
307    };
308
309    let mut pat_idx = 0;
310    while pat_idx < patterns.len() {
311        match &patterns[pat_idx] {
312            Pattern::LineNumber(n) => {
313                // Split at line number n (1-based).
314                let split_at = *n;
315                if split_at <= current_line {
316                    let msg = format!("{}: line number out of range", split_at);
317                    do_cleanup(&created_files, config);
318                    return Err(msg);
319                }
320
321                let end = if split_at > total_lines {
322                    total_lines
323                } else {
324                    split_at - 1
325                };
326
327                let chunk_lines = &lines[current_line..end];
328                let filename = output_filename(config, file_index);
329
330                let bytes = write_chunk(chunk_lines, &filename, config).inspect_err(|_| {
331                    do_cleanup(&created_files, config);
332                })?;
333
334                if !(config.elide_empty && chunk_lines.is_empty()) {
335                    created_files.push(filename);
336                    sizes.push(bytes);
337                    file_index += 1;
338                }
339
340                current_line = end;
341                skip_current = false;
342                pat_idx += 1;
343            }
344            Pattern::Regex { regex, offset } => {
345                let regex = regex.clone();
346                let offset = *offset;
347                if let Err(e) = apply_regex_pattern(
348                    &lines,
349                    total_lines,
350                    &regex,
351                    offset,
352                    false,
353                    &mut current_line,
354                    &mut skip_current,
355                    &mut sizes,
356                    &mut created_files,
357                    &mut file_index,
358                    config,
359                    false,
360                ) {
361                    do_cleanup(&created_files, config);
362                    return Err(e);
363                }
364                pat_idx += 1;
365            }
366            Pattern::SkipTo { regex, offset } => {
367                let regex = regex.clone();
368                let offset = *offset;
369                if let Err(e) = apply_regex_pattern(
370                    &lines,
371                    total_lines,
372                    &regex,
373                    offset,
374                    true,
375                    &mut current_line,
376                    &mut skip_current,
377                    &mut sizes,
378                    &mut created_files,
379                    &mut file_index,
380                    config,
381                    false,
382                ) {
383                    do_cleanup(&created_files, config);
384                    return Err(e);
385                }
386                pat_idx += 1;
387            }
388            Pattern::Repeat(n) => {
389                let n = *n;
390                if pat_idx == 0 {
391                    do_cleanup(&created_files, config);
392                    return Err("{N}: no preceding pattern to repeat".to_string());
393                }
394                // Find the preceding non-repeat pattern
395                let prev_pat = find_prev_pattern(patterns, pat_idx);
396                let prev_pat = match prev_pat {
397                    Some(p) => p.clone(),
398                    None => {
399                        do_cleanup(&created_files, config);
400                        return Err("{N}: no preceding pattern to repeat".to_string());
401                    }
402                };
403                for _ in 0..n {
404                    match &prev_pat {
405                        Pattern::LineNumber(ln) => {
406                            // For repeated line numbers, this doesn't make much sense
407                            // but follow the same logic
408                            let end = if *ln > total_lines {
409                                total_lines
410                            } else {
411                                *ln - 1
412                            };
413                            if end <= current_line {
414                                let msg = format!("{}: line number out of range", ln);
415                                do_cleanup(&created_files, config);
416                                return Err(msg);
417                            }
418                            let chunk_lines = &lines[current_line..end];
419                            let filename = output_filename(config, file_index);
420                            let bytes =
421                                write_chunk(chunk_lines, &filename, config).inspect_err(|_| {
422                                    do_cleanup(&created_files, config);
423                                })?;
424                            if !(config.elide_empty && chunk_lines.is_empty()) {
425                                created_files.push(filename);
426                                sizes.push(bytes);
427                                file_index += 1;
428                            }
429                            current_line = end;
430                            skip_current = false;
431                        }
432                        Pattern::Regex { regex, offset } => {
433                            if let Err(e) = apply_regex_pattern(
434                                &lines,
435                                total_lines,
436                                regex,
437                                *offset,
438                                false,
439                                &mut current_line,
440                                &mut skip_current,
441                                &mut sizes,
442                                &mut created_files,
443                                &mut file_index,
444                                config,
445                                false,
446                            ) {
447                                do_cleanup(&created_files, config);
448                                return Err(e);
449                            }
450                        }
451                        Pattern::SkipTo { regex, offset } => {
452                            if let Err(e) = apply_regex_pattern(
453                                &lines,
454                                total_lines,
455                                regex,
456                                *offset,
457                                true,
458                                &mut current_line,
459                                &mut skip_current,
460                                &mut sizes,
461                                &mut created_files,
462                                &mut file_index,
463                                config,
464                                false,
465                            ) {
466                                do_cleanup(&created_files, config);
467                                return Err(e);
468                            }
469                        }
470                        _ => {}
471                    }
472                }
473                pat_idx += 1;
474            }
475            Pattern::RepeatForever => {
476                if pat_idx == 0 {
477                    do_cleanup(&created_files, config);
478                    return Err("{*}: no preceding pattern to repeat".to_string());
479                }
480                let prev_pat = find_prev_pattern(patterns, pat_idx);
481                let prev_pat = match prev_pat {
482                    Some(p) => p.clone(),
483                    None => {
484                        do_cleanup(&created_files, config);
485                        return Err("{*}: no preceding pattern to repeat".to_string());
486                    }
487                };
488                // Repeat until the pattern fails to match (graceful stop)
489                loop {
490                    match &prev_pat {
491                        Pattern::Regex { regex, offset } => {
492                            match apply_regex_pattern(
493                                &lines,
494                                total_lines,
495                                regex,
496                                *offset,
497                                false,
498                                &mut current_line,
499                                &mut skip_current,
500                                &mut sizes,
501                                &mut created_files,
502                                &mut file_index,
503                                config,
504                                true, // graceful no-match
505                            ) {
506                                Ok(true) => continue,
507                                Ok(false) => break,
508                                Err(e) => {
509                                    do_cleanup(&created_files, config);
510                                    return Err(e);
511                                }
512                            }
513                        }
514                        Pattern::SkipTo { regex, offset } => {
515                            match apply_regex_pattern(
516                                &lines,
517                                total_lines,
518                                regex,
519                                *offset,
520                                true,
521                                &mut current_line,
522                                &mut skip_current,
523                                &mut sizes,
524                                &mut created_files,
525                                &mut file_index,
526                                config,
527                                true,
528                            ) {
529                                Ok(true) => continue,
530                                Ok(false) => break,
531                                Err(e) => {
532                                    do_cleanup(&created_files, config);
533                                    return Err(e);
534                                }
535                            }
536                        }
537                        _ => break,
538                    }
539                }
540                pat_idx += 1;
541            }
542        }
543    }
544
545    // Write remaining lines as the final chunk
546    if current_line < total_lines {
547        let chunk_lines = &lines[current_line..total_lines];
548        let filename = output_filename(config, file_index);
549
550        let bytes = write_chunk(chunk_lines, &filename, config).inspect_err(|_| {
551            do_cleanup(&created_files, config);
552        })?;
553
554        if !(config.elide_empty && chunk_lines.is_empty()) {
555            created_files.push(filename);
556            sizes.push(bytes);
557        }
558    } else if !config.elide_empty {
559        // Write an empty final file
560        let filename = output_filename(config, file_index);
561        let bytes = write_chunk(&[], &filename, config).inspect_err(|_| {
562            do_cleanup(&created_files, config);
563        })?;
564        created_files.push(filename);
565        sizes.push(bytes);
566    }
567
568    Ok(sizes)
569}
570
571/// Find the preceding non-repeat pattern.
572fn find_prev_pattern(patterns: &[Pattern], idx: usize) -> Option<&Pattern> {
573    let mut i = idx;
574    while i > 0 {
575        i -= 1;
576        match &patterns[i] {
577            Pattern::Repeat(_) | Pattern::RepeatForever => continue,
578            other => return Some(other),
579        }
580    }
581    None
582}
583
584/// Split a file by reading from a path or stdin ("-").
585pub fn csplit_from_path(
586    path: &str,
587    patterns: &[Pattern],
588    config: &CsplitConfig,
589) -> Result<Vec<u64>, String> {
590    let input = if path == "-" {
591        let mut buf = String::new();
592        io::stdin()
593            .read_line(&mut buf)
594            .map_err(|e| format!("read error: {}", e))?;
595        // Read all remaining
596        let mut all = buf;
597        let mut line = String::new();
598        while io::stdin()
599            .read_line(&mut line)
600            .map_err(|e| format!("read error: {}", e))?
601            > 0
602        {
603            all.push_str(&line);
604            line.clear();
605        }
606        all
607    } else {
608        fs::read_to_string(path).map_err(|e| format!("cannot open '{}': {}", path, e))?
609    };
610
611    csplit_file(&input, patterns, config)
612}
613
/// Write each file size to standard output, one decimal number per line, in
/// the order given.
pub fn print_sizes(sizes: &[u64]) {
    sizes.iter().for_each(|bytes| println!("{}", bytes));
}