Skip to main content

coreutils_rs/csplit/
core.rs

1use regex::Regex;
2use std::fs;
3use std::io;
4
/// A single parsed csplit pattern argument.
///
/// `Repeat` and `RepeatForever` are modifiers: they carry no split rule of
/// their own and always refer to the pattern that precedes them in the
/// argument list.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Pattern {
    /// `/REGEX/[OFFSET]`: split before the first line matching the regex,
    /// shifted by `offset` lines (0 when no offset was given).
    Regex { regex: String, offset: i64 },
    /// `%REGEX%[OFFSET]`: skip to (but don't include) a line matching the
    /// regex, shifted by `offset` lines.
    /// Lines skipped are not written to any output file.
    SkipTo { regex: String, offset: i64 },
    /// Split before the given 1-based line number.
    LineNumber(usize),
    /// `{N}`: repeat the previous pattern N more times.
    Repeat(usize),
    /// `{*}`: repeat the previous pattern as many times as possible.
    RepeatForever,
}
20
/// Configuration for the csplit command.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CsplitConfig {
    /// Text placed in front of every generated suffix.
    pub prefix: String,
    /// printf-style suffix format; when empty, a zero-padded decimal index
    /// of `digits` width is used instead.
    pub suffix_format: String,
    /// Minimum width of the numeric suffix when `suffix_format` is empty.
    pub digits: usize,
    /// When `true`, already written output files are kept after an error.
    pub keep_files: bool,
    /// Quiet flag; stored here for the caller, not read by the code in view.
    pub quiet: bool,
    /// When `true`, empty chunks produce no output file.
    pub elide_empty: bool,
}

impl Default for CsplitConfig {
    /// Defaults: prefix `xx`, two-digit suffix, all flags off.
    fn default() -> Self {
        Self {
            prefix: String::from("xx"),
            suffix_format: String::new(),
            digits: 2,
            keep_files: false,
            quiet: false,
            elide_empty: false,
        }
    }
}
44
45/// Parse a pattern string into a Pattern enum.
46pub fn parse_pattern(s: &str) -> Result<Pattern, String> {
47    let s = s.trim();
48
49    // {*} - repeat forever
50    if s == "{*}" {
51        return Ok(Pattern::RepeatForever);
52    }
53
54    // {N} - repeat N times
55    if s.starts_with('{') && s.ends_with('}') {
56        let inner = &s[1..s.len() - 1];
57        let n: usize = inner
58            .parse()
59            .map_err(|_| format!("invalid repeat count: '{}'", s))?;
60        return Ok(Pattern::Repeat(n));
61    }
62
63    // /REGEX/[OFFSET] - split before matching line
64    if s.starts_with('/') {
65        let rest = &s[1..];
66        if let Some(end_pos) = rest.rfind('/') {
67            let regex_str = &rest[..end_pos];
68            let after = rest[end_pos + 1..].trim();
69            let offset = if after.is_empty() {
70                0
71            } else {
72                after
73                    .parse::<i64>()
74                    .map_err(|_| format!("invalid offset: '{}'", after))?
75            };
76            // Validate regex
77            Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
78            return Ok(Pattern::Regex {
79                regex: regex_str.to_string(),
80                offset,
81            });
82        }
83        return Err(format!("unmatched '/' in pattern: '{}'", s));
84    }
85
86    // %REGEX%[OFFSET] - skip to matching line
87    if s.starts_with('%') {
88        let rest = &s[1..];
89        if let Some(end_pos) = rest.rfind('%') {
90            let regex_str = &rest[..end_pos];
91            let after = rest[end_pos + 1..].trim();
92            let offset = if after.is_empty() {
93                0
94            } else {
95                after
96                    .parse::<i64>()
97                    .map_err(|_| format!("invalid offset: '{}'", after))?
98            };
99            // Validate regex
100            Regex::new(regex_str).map_err(|e| format!("invalid regex '{}': {}", regex_str, e))?;
101            return Ok(Pattern::SkipTo {
102                regex: regex_str.to_string(),
103                offset,
104            });
105        }
106        return Err(format!("unmatched '%' in pattern: '{}'", s));
107    }
108
109    // LINE_NUMBER - split at line number
110    let n: usize = s.parse().map_err(|_| format!("invalid pattern: '{}'", s))?;
111    if n == 0 {
112        return Err("line number must be positive".to_string());
113    }
114    Ok(Pattern::LineNumber(n))
115}
116
117/// Generate the output filename for a given file index.
118pub fn output_filename(config: &CsplitConfig, index: usize) -> String {
119    if config.suffix_format.is_empty() {
120        format!("{}{:0>width$}", config.prefix, index, width = config.digits)
121    } else {
122        // Simple sprintf-like formatting: support %02d, %03d, etc.
123        let suffix = format_suffix(&config.suffix_format, index);
124        format!("{}{}", config.prefix, suffix)
125    }
126}
127
/// Simple sprintf-like formatter for suffix format strings.
///
/// Supported directives: `%[0][WIDTH]C` where `C` is one of
/// `d`, `i`, `u` (decimal), `x` / `X` (lower/upper-case hexadecimal) or
/// `o` (octal), plus `%%` for a literal percent sign. A leading `0` pads
/// with zeros instead of spaces; values wider than `WIDTH` are never
/// truncated.
///
/// Any other directive is not interpreted: the `%`, the `0` flag and the
/// width digits that were read are copied to the output unchanged, and the
/// following characters are then treated as ordinary text.
pub fn format_suffix(fmt: &str, value: usize) -> String {
    let mut result = String::with_capacity(fmt.len() + 8);
    let mut chars = fmt.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '%' {
            result.push(ch);
            continue;
        }

        // Optional zero-pad flag followed by an optional decimal width.
        let zero_pad = chars.next_if_eq(&'0').is_some();
        let mut width_str = String::new();
        while let Some(digit) = chars.next_if(char::is_ascii_digit) {
            width_str.push(digit);
        }
        // A missing (or unrepresentably large) width means "no padding".
        let width: usize = width_str.parse().unwrap_or(0);

        // Render the number if the conversion character is one we know.
        let rendered = match chars.peek() {
            Some('d' | 'i' | 'u') => Some(value.to_string()),
            Some('x') => Some(format!("{:x}", value)),
            Some('X') => Some(format!("{:X}", value)),
            Some('o') => Some(format!("{:o}", value)),
            _ => None,
        };

        if let Some(digits) = rendered {
            chars.next(); // consume the conversion character
            let fill = if zero_pad { '0' } else { ' ' };
            let padding = width.saturating_sub(digits.len());
            result.extend(std::iter::repeat(fill).take(padding));
            result.push_str(&digits);
        } else if chars.next_if_eq(&'%').is_some() {
            result.push('%');
        } else {
            // Unknown directive: pass through everything consumed so far,
            // including the zero flag, so the text is reproduced faithfully.
            result.push('%');
            if zero_pad {
                result.push('0');
            }
            result.push_str(&width_str);
        }
    }

    result
}
180
181/// Write lines to a file, returning the number of bytes written.
182fn write_chunk(lines: &[String], filename: &str, config: &CsplitConfig) -> Result<u64, String> {
183    if config.elide_empty && lines.is_empty() {
184        return Ok(0);
185    }
186
187    let mut content = String::new();
188    for line in lines {
189        content.push_str(line);
190        content.push('\n');
191    }
192    let bytes = content.len() as u64;
193
194    if config.elide_empty && bytes == 0 {
195        return Ok(0);
196    }
197
198    fs::write(filename, &content).map_err(|e| format!("cannot write '{}': {}", filename, e))?;
199
200    Ok(bytes)
201}
202
203/// Find the first line matching a regex starting from `start`, returning its index.
204fn find_match(lines: &[String], regex: &Regex, start: usize) -> Option<usize> {
205    for (idx, line) in lines.iter().enumerate().skip(start) {
206        if regex.is_match(line) {
207            return Some(idx);
208        }
209    }
210    None
211}
212
213/// Split a file based on patterns.
214///
215/// Returns the sizes (in bytes) of each created output file.
216pub fn csplit_file(
217    input: &str,
218    patterns: &[Pattern],
219    config: &CsplitConfig,
220) -> Result<Vec<u64>, String> {
221    let lines: Vec<String> = input.lines().map(|l| l.to_string()).collect();
222    let total_lines = lines.len();
223
224    // Expand patterns: resolve Repeat and RepeatForever
225    let expanded = expand_patterns(patterns)?;
226
227    let mut sizes: Vec<u64> = Vec::new();
228    let mut created_files: Vec<String> = Vec::new();
229    let mut file_index: usize = 0;
230    let mut current_line: usize = 0; // 0-based index into lines
231
232    let do_cleanup = |files: &[String], config: &CsplitConfig| {
233        if !config.keep_files {
234            for f in files {
235                let _ = fs::remove_file(f);
236            }
237        }
238    };
239
240    for pat in &expanded {
241        match pat {
242            Pattern::LineNumber(n) => {
243                // Split at line number n (1-based).
244                // Everything from current_line to line n-1 goes in this chunk.
245                let split_at = *n; // 1-based line number
246                if split_at <= current_line {
247                    let msg = format!("{}: line number out of range", split_at);
248                    do_cleanup(&created_files, config);
249                    return Err(msg);
250                }
251
252                let end = if split_at > total_lines {
253                    total_lines
254                } else {
255                    split_at - 1 // Convert to 0-based exclusive end
256                };
257
258                let chunk_lines = &lines[current_line..end];
259                let filename = output_filename(config, file_index);
260
261                let bytes = write_chunk(chunk_lines, &filename, config).inspect_err(|_| {
262                    do_cleanup(&created_files, config);
263                })?;
264
265                if !(config.elide_empty && chunk_lines.is_empty()) {
266                    created_files.push(filename);
267                    sizes.push(bytes);
268                    file_index += 1;
269                }
270
271                current_line = end;
272            }
273            Pattern::Regex { regex, offset } => {
274                // Find first line matching regex starting from current_line
275                let re = Regex::new(regex).map_err(|e| {
276                    do_cleanup(&created_files, config);
277                    format!("invalid regex: {}", e)
278                })?;
279
280                // Start searching from current_line, but if the line at
281                // current_line itself matches (which happens after a previous
282                // regex split placed us here), skip it so we find the NEXT
283                // occurrence rather than re-matching the boundary line.
284                let search_start = if current_line > 0
285                    && current_line < total_lines
286                    && re.is_match(&lines[current_line])
287                {
288                    current_line + 1
289                } else {
290                    current_line
291                };
292
293                if let Some(match_idx) = find_match(&lines, &re, search_start) {
294                    // Apply offset
295                    let target = match_idx as i64 + *offset;
296                    let split_at = if target < current_line as i64 {
297                        current_line
298                    } else if target as usize > total_lines {
299                        total_lines
300                    } else {
301                        target as usize
302                    };
303
304                    let chunk_lines = &lines[current_line..split_at];
305                    let filename = output_filename(config, file_index);
306
307                    let bytes = write_chunk(chunk_lines, &filename, config).inspect_err(|_| {
308                        do_cleanup(&created_files, config);
309                    })?;
310
311                    if !(config.elide_empty && chunk_lines.is_empty()) {
312                        created_files.push(filename);
313                        sizes.push(bytes);
314                        file_index += 1;
315                    }
316
317                    current_line = split_at;
318                } else {
319                    let msg = format!("{}: no match", regex);
320                    do_cleanup(&created_files, config);
321                    return Err(msg);
322                }
323            }
324            Pattern::SkipTo { regex, offset } => {
325                // Skip to matching line, discarding lines
326                let re = Regex::new(regex).map_err(|e| {
327                    do_cleanup(&created_files, config);
328                    format!("invalid regex: {}", e)
329                })?;
330
331                if let Some(match_idx) = find_match(&lines, &re, current_line) {
332                    let target = match_idx as i64 + *offset;
333                    let skip_to = if target < current_line as i64 {
334                        current_line
335                    } else if target as usize > total_lines {
336                        total_lines
337                    } else {
338                        target as usize
339                    };
340
341                    // Lines from current_line to skip_to are discarded
342                    current_line = skip_to;
343                } else {
344                    let msg = format!("{}: no match", regex);
345                    do_cleanup(&created_files, config);
346                    return Err(msg);
347                }
348            }
349            Pattern::Repeat(_) | Pattern::RepeatForever => {
350                // These should have been expanded already
351                unreachable!("Repeat patterns should be expanded before processing");
352            }
353        }
354    }
355
356    // Write remaining lines as the final chunk
357    if current_line < total_lines {
358        let chunk_lines = &lines[current_line..total_lines];
359        let filename = output_filename(config, file_index);
360
361        let bytes = write_chunk(chunk_lines, &filename, config).inspect_err(|_| {
362            do_cleanup(&created_files, config);
363        })?;
364
365        if !(config.elide_empty && chunk_lines.is_empty()) {
366            created_files.push(filename);
367            sizes.push(bytes);
368        }
369    } else if !config.elide_empty {
370        // Write an empty final file
371        let filename = output_filename(config, file_index);
372        let bytes = write_chunk(&[], &filename, config).inspect_err(|_| {
373            do_cleanup(&created_files, config);
374        })?;
375        created_files.push(filename);
376        sizes.push(bytes);
377    }
378
379    Ok(sizes)
380}
381
382/// Expand repeat patterns into the underlying patterns they repeat.
383/// Returns a flat list of non-repeat patterns.
384fn expand_patterns(patterns: &[Pattern]) -> Result<Vec<Pattern>, String> {
385    let mut expanded: Vec<Pattern> = Vec::new();
386    let mut i = 0;
387
388    while i < patterns.len() {
389        match &patterns[i] {
390            Pattern::Repeat(n) => {
391                if expanded.is_empty() {
392                    return Err("{N}: no preceding pattern to repeat".to_string());
393                }
394                let prev = expanded.last().unwrap().clone();
395                for _ in 0..*n {
396                    expanded.push(prev.clone());
397                }
398                i += 1;
399            }
400            Pattern::RepeatForever => {
401                if expanded.is_empty() {
402                    return Err("{*}: no preceding pattern to repeat".to_string());
403                }
404                // We can't actually expand forever at parse time.
405                // Mark with a sentinel: we'll repeat the previous pattern
406                // up to a reasonable limit (10000) to prevent infinite loops.
407                let prev = expanded.last().unwrap().clone();
408                for _ in 0..10000 {
409                    expanded.push(prev.clone());
410                }
411                i += 1;
412            }
413            other => {
414                expanded.push(other.clone());
415                i += 1;
416            }
417        }
418    }
419
420    Ok(expanded)
421}
422
423/// Split a file by reading from a path or stdin ("-").
424pub fn csplit_from_path(
425    path: &str,
426    patterns: &[Pattern],
427    config: &CsplitConfig,
428) -> Result<Vec<u64>, String> {
429    let input = if path == "-" {
430        let mut buf = String::new();
431        io::stdin()
432            .read_line(&mut buf)
433            .map_err(|e| format!("read error: {}", e))?;
434        // Read all remaining
435        let mut all = buf;
436        let mut line = String::new();
437        while io::stdin()
438            .read_line(&mut line)
439            .map_err(|e| format!("read error: {}", e))?
440            > 0
441        {
442            all.push_str(&line);
443            line.clear();
444        }
445        all
446    } else {
447        fs::read_to_string(path).map_err(|e| format!("cannot open '{}': {}", path, e))?
448    };
449
450    csplit_file(&input, patterns, config)
451}
452
/// Write each size in `sizes` to stdout, one decimal number per line, in
/// the order given.
pub fn print_sizes(sizes: &[u64]) {
    sizes.iter().for_each(|bytes| println!("{}", bytes));
}