Skip to main content

coreutils_rs/nl/
core.rs

1use std::io::Write;
2
/// Line numbering style: selects which lines of a section receive a number.
#[derive(Clone)]
pub enum NumberingStyle {
    /// Number all lines, including empty ones.
    All,
    /// Number only non-empty lines (default for body).
    NonEmpty,
    /// Don't number lines.
    None,
    /// Number lines matching the contained regular expression
    /// (a compiled `regex::Regex`).
    Regex(regex::Regex),
}
15
/// Number format for line numbers: justification and fill of the number
/// within the configured field width.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum NumberFormat {
    /// Left-justified, no leading zeros (padded with spaces on the right).
    Ln,
    /// Right-justified, no leading zeros (default; padded with spaces on the left).
    Rn,
    /// Right-justified, leading zeros.
    Rz,
}
26
/// Configuration for the nl command.
pub struct NlConfig {
    /// Numbering style applied to lines in the body section.
    pub body_style: NumberingStyle,
    /// Numbering style applied to lines in the header section.
    pub header_style: NumberingStyle,
    /// Numbering style applied to lines in the footer section.
    pub footer_style: NumberingStyle,
    /// Section delimiter bytes. A line consisting of this sequence repeated
    /// three, two or one times switches to the header, body or footer section.
    /// An empty delimiter disables section detection.
    pub section_delimiter: Vec<u8>,
    /// Amount added to the line number after each numbered line.
    pub line_increment: i64,
    /// When greater than 1, an empty line is numbered only once this many
    /// consecutive empty lines have been seen.
    pub join_blank_lines: usize,
    /// Justification / fill used when rendering the number.
    pub number_format: NumberFormat,
    /// When `false`, the line number is reset to `starting_line_number` at
    /// every section delimiter line.
    pub no_renumber: bool,
    /// Bytes written between the number field and the line content.
    pub number_separator: Vec<u8>,
    /// First line number, and the value numbering is reset to at a section
    /// delimiter.
    pub starting_line_number: i64,
    /// Minimum width of the number field; shorter numbers are padded.
    pub number_width: usize,
}
41
42impl Default for NlConfig {
43    fn default() -> Self {
44        Self {
45            body_style: NumberingStyle::NonEmpty,
46            header_style: NumberingStyle::None,
47            footer_style: NumberingStyle::None,
48            section_delimiter: vec![b'\\', b':'],
49            line_increment: 1,
50            join_blank_lines: 1,
51            number_format: NumberFormat::Rn,
52            no_renumber: false,
53            number_separator: vec![b'\t'],
54            starting_line_number: 1,
55            number_width: 6,
56        }
57    }
58}
59
60/// Parse a numbering style string.
61pub fn parse_numbering_style(s: &str) -> Result<NumberingStyle, String> {
62    match s {
63        "a" => Ok(NumberingStyle::All),
64        "t" => Ok(NumberingStyle::NonEmpty),
65        "n" => Ok(NumberingStyle::None),
66        _ if s.starts_with('p') => {
67            let pattern = &s[1..];
68            match regex::Regex::new(pattern) {
69                Ok(re) => Ok(NumberingStyle::Regex(re)),
70                Err(e) => Err(format!("invalid regular expression: {}", e)),
71            }
72        }
73        _ => Err(format!("invalid numbering style: '{}'", s)),
74    }
75}
76
77/// Parse a number format string.
78pub fn parse_number_format(s: &str) -> Result<NumberFormat, String> {
79    match s {
80        "ln" => Ok(NumberFormat::Ln),
81        "rn" => Ok(NumberFormat::Rn),
82        "rz" => Ok(NumberFormat::Rz),
83        _ => Err(format!("invalid line numbering: '{}'", s)),
84    }
85}
86
/// Logical page section types. The current section decides which of the
/// header/body/footer numbering styles is applied to a line.
#[derive(Clone, Copy, PartialEq)]
enum Section {
    /// Entered by a line made of the section delimiter repeated three times.
    Header,
    /// Entered by a line made of the section delimiter repeated twice;
    /// also the section processing starts in.
    Body,
    /// Entered by a line made of the section delimiter on its own.
    Footer,
}
94
95/// Check if a line is a section delimiter.
96#[inline]
97fn check_section_delimiter(line: &[u8], delim: &[u8]) -> Option<Section> {
98    if delim.is_empty() {
99        return None;
100    }
101    let dlen = delim.len();
102
103    // Check header (3x)
104    if line.len() == dlen * 3 {
105        let mut is_header = true;
106        for i in 0..3 {
107            if &line[i * dlen..(i + 1) * dlen] != delim {
108                is_header = false;
109                break;
110            }
111        }
112        if is_header {
113            return Some(Section::Header);
114        }
115    }
116
117    // Check body (2x)
118    if line.len() == dlen * 2 && &line[..dlen] == delim && &line[dlen..] == delim {
119        return Some(Section::Body);
120    }
121
122    // Check footer (1x)
123    if line.len() == dlen && line == delim {
124        return Some(Section::Footer);
125    }
126
127    None
128}
129
130/// Format a line number according to the format and width.
131#[inline]
132fn format_number(num: i64, format: NumberFormat, width: usize, buf: &mut Vec<u8>) {
133    let mut num_buf = itoa::Buffer::new();
134    let num_str = num_buf.format(num);
135
136    match format {
137        NumberFormat::Ln => {
138            buf.extend_from_slice(num_str.as_bytes());
139            let pad = width.saturating_sub(num_str.len());
140            buf.resize(buf.len() + pad, b' ');
141        }
142        NumberFormat::Rn => {
143            let pad = width.saturating_sub(num_str.len());
144            buf.resize(buf.len() + pad, b' ');
145            buf.extend_from_slice(num_str.as_bytes());
146        }
147        NumberFormat::Rz => {
148            if num < 0 {
149                buf.push(b'-');
150                let abs_str = &num_str[1..];
151                let pad = width.saturating_sub(abs_str.len() + 1);
152                buf.resize(buf.len() + pad, b'0');
153                buf.extend_from_slice(abs_str.as_bytes());
154            } else {
155                let pad = width.saturating_sub(num_str.len());
156                buf.resize(buf.len() + pad, b'0');
157                buf.extend_from_slice(num_str.as_bytes());
158            }
159        }
160    }
161}
162
163/// Check if a line should be numbered based on the style.
164#[inline]
165fn should_number(line: &[u8], style: &NumberingStyle) -> bool {
166    match style {
167        NumberingStyle::All => true,
168        NumberingStyle::NonEmpty => !line.is_empty(),
169        NumberingStyle::None => false,
170        NumberingStyle::Regex(re) => match std::str::from_utf8(line) {
171            Ok(s) => re.is_match(s),
172            Err(_) => false,
173        },
174    }
175}
176
177/// Build the nl output into a Vec.
178pub fn nl_to_vec(data: &[u8], config: &NlConfig) -> Vec<u8> {
179    let mut line_number = config.starting_line_number;
180    nl_to_vec_with_state(data, config, &mut line_number)
181}
182
183/// Check if config is the simple "number all lines" case suitable for fast path.
184#[inline]
185fn is_simple_number_all(config: &NlConfig) -> bool {
186    matches!(config.body_style, NumberingStyle::All)
187        && matches!(config.header_style, NumberingStyle::None)
188        && matches!(config.footer_style, NumberingStyle::None)
189        && config.join_blank_lines == 1
190        && config.line_increment == 1
191        && !config.no_renumber
192}
193
194/// Inner write helper: formats number prefix + line content + newline into buffer.
195/// SAFETY: caller ensures output has capacity for total_len bytes at start_pos.
196#[inline(always)]
197unsafe fn write_numbered_line(
198    output: &mut Vec<u8>,
199    fmt: NumberFormat,
200    num_str: &str,
201    pad: usize,
202    sep: &[u8],
203    line_data: *const u8,
204    line_len: usize,
205) {
206    unsafe {
207        let prefix_len = pad + num_str.len() + sep.len();
208        let total_len = prefix_len + line_len + 1;
209        let start_pos = output.len();
210        let dst = output.as_mut_ptr().add(start_pos);
211
212        match fmt {
213            NumberFormat::Rn => {
214                std::ptr::write_bytes(dst, b' ', pad);
215                std::ptr::copy_nonoverlapping(num_str.as_ptr(), dst.add(pad), num_str.len());
216            }
217            NumberFormat::Rz => {
218                std::ptr::write_bytes(dst, b'0', pad);
219                std::ptr::copy_nonoverlapping(num_str.as_ptr(), dst.add(pad), num_str.len());
220            }
221            NumberFormat::Ln => {
222                std::ptr::copy_nonoverlapping(num_str.as_ptr(), dst, num_str.len());
223                std::ptr::write_bytes(dst.add(num_str.len()), b' ', pad);
224            }
225        }
226        std::ptr::copy_nonoverlapping(sep.as_ptr(), dst.add(pad + num_str.len()), sep.len());
227        std::ptr::copy_nonoverlapping(line_data, dst.add(prefix_len), line_len);
228        *dst.add(prefix_len + line_len) = b'\n';
229        output.set_len(start_pos + total_len);
230    }
231}
232
233/// Ultra-fast path for nl -b a: eliminates section delimiter checks and uses raw
234/// buffer writes. Handles all three number formats (Rn, Rz, Ln) in a single
235/// function to avoid code duplication.
236fn nl_number_all_fast(data: &[u8], config: &NlConfig, line_number: &mut i64) -> Vec<u8> {
237    let alloc = (data.len() * 2 + 256).min(128 * 1024 * 1024);
238    let mut output: Vec<u8> = Vec::with_capacity(alloc);
239
240    let width = config.number_width;
241    let sep = &config.number_separator;
242    let fmt = config.number_format;
243    let mut num = *line_number;
244    let mut pos: usize = 0;
245    let mut num_buf = itoa::Buffer::new();
246
247    for nl_pos in memchr::memchr_iter(b'\n', data) {
248        let line_len = nl_pos - pos;
249        let needed = output.len() + line_len + width + sep.len() + 22;
250        if needed > output.capacity() {
251            output.reserve(needed - output.capacity() + 4 * 1024 * 1024);
252        }
253
254        let num_str = num_buf.format(num);
255        let pad = width.saturating_sub(num_str.len());
256
257        unsafe {
258            write_numbered_line(
259                &mut output,
260                fmt,
261                num_str,
262                pad,
263                sep,
264                data.as_ptr().add(pos),
265                line_len,
266            );
267        }
268
269        num += 1;
270        pos = nl_pos + 1;
271    }
272
273    // Handle final line without trailing newline
274    if pos < data.len() {
275        let remaining = data.len() - pos;
276        let needed = output.len() + remaining + width + sep.len() + 22;
277        if needed > output.capacity() {
278            output.reserve(needed - output.capacity() + 1024);
279        }
280        let num_str = num_buf.format(num);
281        let pad = width.saturating_sub(num_str.len());
282
283        unsafe {
284            write_numbered_line(
285                &mut output,
286                fmt,
287                num_str,
288                pad,
289                sep,
290                data.as_ptr().add(pos),
291                remaining,
292            );
293        }
294        num += 1;
295    }
296
297    *line_number = num;
298    output
299}
300
301/// Streaming fast path for nl -b a: writes output in ~1MB batches directly to fd,
302/// dramatically reducing write() syscall count vs writing each line individually,
303/// and avoiding enormous output Vec allocation for large inputs.
304/// Returns Ok(bytes_written) on success.
305#[cfg(unix)]
306fn nl_number_all_stream(
307    data: &[u8],
308    config: &NlConfig,
309    line_number: &mut i64,
310    fd: i32,
311) -> std::io::Result<()> {
312    const BUF_SIZE: usize = 1024 * 1024; // 1MB output buffer
313
314    let width = config.number_width;
315    let sep = &config.number_separator;
316    let fmt = config.number_format;
317    let mut num = *line_number;
318    let mut pos: usize = 0;
319    let mut num_buf = itoa::Buffer::new();
320
321    // Pre-allocated output buffer. We flush when near full.
322    let mut output: Vec<u8> = Vec::with_capacity(BUF_SIZE + 64 * 1024);
323
324    for nl_pos in memchr::memchr_iter(b'\n', data) {
325        let line_len = nl_pos - pos;
326
327        // Flush buffer when it reaches ~1MB to keep syscalls large but bounded
328        if output.len() + line_len + width + sep.len() + 22 > BUF_SIZE {
329            write_all_fd(fd, &output)?;
330            output.clear();
331        }
332
333        // Ensure capacity for this line
334        let needed = output.len() + line_len + width + sep.len() + 22;
335        if needed > output.capacity() {
336            output.reserve(needed - output.capacity());
337        }
338
339        let num_str = num_buf.format(num);
340        let pad = width.saturating_sub(num_str.len());
341
342        unsafe {
343            write_numbered_line(
344                &mut output,
345                fmt,
346                num_str,
347                pad,
348                sep,
349                data.as_ptr().add(pos),
350                line_len,
351            );
352        }
353
354        num += 1;
355        pos = nl_pos + 1;
356    }
357
358    // Handle final line without trailing newline
359    if pos < data.len() {
360        let remaining = data.len() - pos;
361        let needed = output.len() + remaining + width + sep.len() + 22;
362        if needed > output.capacity() {
363            output.reserve(needed - output.capacity());
364        }
365        let num_str = num_buf.format(num);
366        let pad = width.saturating_sub(num_str.len());
367
368        unsafe {
369            write_numbered_line(
370                &mut output,
371                fmt,
372                num_str,
373                pad,
374                sep,
375                data.as_ptr().add(pos),
376                remaining,
377            );
378        }
379        num += 1;
380    }
381
382    // Flush remaining data
383    if !output.is_empty() {
384        write_all_fd(fd, &output)?;
385    }
386
387    *line_number = num;
388    Ok(())
389}
390
391/// Streaming generic path: writes output in ~1MB batches directly to fd.
392/// Handles all numbering styles, section delimiters, and blank line joining.
393#[cfg(unix)]
394fn nl_generic_stream(
395    data: &[u8],
396    config: &NlConfig,
397    line_number: &mut i64,
398    fd: i32,
399) -> std::io::Result<()> {
400    if data.is_empty() {
401        return Ok(());
402    }
403
404    const BUF_SIZE: usize = 1024 * 1024; // 1MB output buffer
405
406    let mut output: Vec<u8> = Vec::with_capacity(BUF_SIZE + 64 * 1024);
407    let mut current_section = Section::Body;
408    let mut consecutive_blanks: usize = 0;
409    let mut start = 0;
410    let mut line_iter = memchr::memchr_iter(b'\n', data);
411
412    loop {
413        let (line, has_newline) = match line_iter.next() {
414            Some(pos) => (&data[start..pos], true),
415            None => {
416                if start < data.len() {
417                    (&data[start..], false)
418                } else {
419                    break;
420                }
421            }
422        };
423
424        // Flush when buffer is near capacity
425        if output.len() > BUF_SIZE {
426            write_all_fd(fd, &output)?;
427            output.clear();
428        }
429
430        // Check for section delimiter
431        if let Some(section) = check_section_delimiter(line, &config.section_delimiter) {
432            if !config.no_renumber {
433                *line_number = config.starting_line_number;
434            }
435            current_section = section;
436            consecutive_blanks = 0;
437            output.push(b'\n');
438            if has_newline {
439                start += line.len() + 1;
440            } else {
441                break;
442            }
443            continue;
444        }
445
446        let style = match current_section {
447            Section::Header => &config.header_style,
448            Section::Body => &config.body_style,
449            Section::Footer => &config.footer_style,
450        };
451
452        let is_blank = line.is_empty();
453
454        if is_blank {
455            consecutive_blanks += 1;
456        } else {
457            consecutive_blanks = 0;
458        }
459
460        let do_number = if is_blank && config.join_blank_lines > 1 {
461            if should_number(line, style) {
462                consecutive_blanks >= config.join_blank_lines
463            } else {
464                false
465            }
466        } else {
467            should_number(line, style)
468        };
469
470        if do_number {
471            if is_blank && config.join_blank_lines > 1 {
472                consecutive_blanks = 0;
473            }
474            format_number(
475                *line_number,
476                config.number_format,
477                config.number_width,
478                &mut output,
479            );
480            output.extend_from_slice(&config.number_separator);
481            output.extend_from_slice(line);
482            *line_number = line_number.wrapping_add(config.line_increment);
483        } else {
484            let total_pad = config.number_width + config.number_separator.len();
485            output.resize(output.len() + total_pad, b' ');
486            output.extend_from_slice(line);
487        }
488
489        if has_newline {
490            output.push(b'\n');
491            start += line.len() + 1;
492        } else {
493            output.push(b'\n');
494            break;
495        }
496    }
497
498    // Flush remaining
499    if !output.is_empty() {
500        write_all_fd(fd, &output)?;
501    }
502
503    Ok(())
504}
505
506/// Write buffer to a file descriptor, retrying on partial/interrupted writes.
507#[cfg(unix)]
508#[inline]
509fn write_all_fd(fd: i32, data: &[u8]) -> std::io::Result<()> {
510    let mut written = 0;
511    while written < data.len() {
512        let ret = unsafe {
513            libc::write(
514                fd,
515                data[written..].as_ptr() as *const libc::c_void,
516                (data.len() - written) as _,
517            )
518        };
519        if ret > 0 {
520            written += ret as usize;
521        } else if ret == 0 {
522            return Err(std::io::Error::new(
523                std::io::ErrorKind::WriteZero,
524                "write returned 0",
525            ));
526        } else {
527            let err = std::io::Error::last_os_error();
528            if err.kind() == std::io::ErrorKind::Interrupted {
529                continue;
530            }
531            return Err(err);
532        }
533    }
534    Ok(())
535}
536
537/// Stream nl output directly to a file descriptor in batched writes.
538/// This is the preferred entry point for the binary — avoids building the entire
539/// output in memory and instead flushes ~1MB chunks. For large files this
540/// dramatically reduces memory usage and write() syscall overhead.
541#[cfg(unix)]
542pub fn nl_stream_with_state(
543    data: &[u8],
544    config: &NlConfig,
545    line_number: &mut i64,
546    fd: i32,
547) -> std::io::Result<()> {
548    if data.is_empty() {
549        return Ok(());
550    }
551
552    // Fast path: number-all without section delimiters
553    let has_section_delims = !config.section_delimiter.is_empty()
554        && memchr::memmem::find(data, &config.section_delimiter).is_some();
555    if is_simple_number_all(config) && !has_section_delims {
556        return nl_number_all_stream(data, config, line_number, fd);
557    }
558
559    nl_generic_stream(data, config, line_number, fd)
560}
561
562/// Build the nl output into a Vec, continuing numbering from `line_number`.
563/// Updates `line_number` in place so callers can continue across multiple files.
564pub fn nl_to_vec_with_state(data: &[u8], config: &NlConfig, line_number: &mut i64) -> Vec<u8> {
565    if data.is_empty() {
566        return Vec::new();
567    }
568
569    // Fast paths for common benchmark cases.
570    // Guard: skip fast path if data contains section delimiters (rare in practice).
571    let has_section_delims = !config.section_delimiter.is_empty()
572        && memchr::memmem::find(data, &config.section_delimiter).is_some();
573    if is_simple_number_all(config) && !has_section_delims {
574        return nl_number_all_fast(data, config, line_number);
575    }
576
577    // Generic path: pre-allocate generously instead of counting newlines
578    let alloc = (data.len() * 2 + 256).min(128 * 1024 * 1024);
579    let mut output: Vec<u8> = Vec::with_capacity(alloc);
580
581    let mut current_section = Section::Body;
582    let mut consecutive_blanks: usize = 0;
583
584    let mut start = 0;
585    let mut line_iter = memchr::memchr_iter(b'\n', data);
586
587    loop {
588        let (line, has_newline) = match line_iter.next() {
589            Some(pos) => (&data[start..pos], true),
590            None => {
591                if start < data.len() {
592                    (&data[start..], false)
593                } else {
594                    break;
595                }
596            }
597        };
598
599        // Check for section delimiter
600        if let Some(section) = check_section_delimiter(line, &config.section_delimiter) {
601            if !config.no_renumber {
602                *line_number = config.starting_line_number;
603            }
604            current_section = section;
605            consecutive_blanks = 0;
606            output.push(b'\n');
607            if has_newline {
608                start += line.len() + 1;
609            } else {
610                break;
611            }
612            continue;
613        }
614
615        let style = match current_section {
616            Section::Header => &config.header_style,
617            Section::Body => &config.body_style,
618            Section::Footer => &config.footer_style,
619        };
620
621        let is_blank = line.is_empty();
622
623        if is_blank {
624            consecutive_blanks += 1;
625        } else {
626            consecutive_blanks = 0;
627        }
628
629        let do_number = if is_blank && config.join_blank_lines > 1 {
630            if should_number(line, style) {
631                consecutive_blanks >= config.join_blank_lines
632            } else {
633                false
634            }
635        } else {
636            should_number(line, style)
637        };
638
639        if do_number {
640            if is_blank && config.join_blank_lines > 1 {
641                consecutive_blanks = 0;
642            }
643            format_number(
644                *line_number,
645                config.number_format,
646                config.number_width,
647                &mut output,
648            );
649            output.extend_from_slice(&config.number_separator);
650            output.extend_from_slice(line);
651            *line_number = line_number.wrapping_add(config.line_increment);
652        } else {
653            // Non-numbered lines: GNU nl outputs width + separator_len total spaces, then content
654            let total_pad = config.number_width + config.number_separator.len();
655            output.resize(output.len() + total_pad, b' ');
656            output.extend_from_slice(line);
657        }
658
659        if has_newline {
660            output.push(b'\n');
661            start += line.len() + 1;
662        } else {
663            // GNU nl always adds a trailing newline, even when the input lacks one
664            // (but has content on the last line). Empty input produces empty output.
665            output.push(b'\n');
666            break;
667        }
668    }
669
670    output
671}
672
673/// Number lines and write to the provided writer.
674pub fn nl(data: &[u8], config: &NlConfig, out: &mut impl Write) -> std::io::Result<()> {
675    let output = nl_to_vec(data, config);
676    out.write_all(&output)
677}