coreutils_rs/nl/
core.rs

1use std::io::Write;
2
3/// Line numbering style.
4#[derive(Clone)]
5pub enum NumberingStyle {
6    /// Number all lines.
7    All,
8    /// Number only non-empty lines (default for body).
9    NonEmpty,
10    /// Don't number lines.
11    None,
12    /// Number lines matching a basic regular expression.
13    Regex(regex::Regex),
14}
15
/// Justification and padding applied to a rendered line number.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the format can be
/// used as a map/set key and in contexts that require total equality.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum NumberFormat {
    /// Left-justified, padded on the right with spaces (no leading zeros).
    Ln,
    /// Right-justified, padded on the left with spaces (the default).
    Rn,
    /// Right-justified, padded on the left with zeros.
    Rz,
}
26
27/// Configuration for the nl command.
28pub struct NlConfig {
29    pub body_style: NumberingStyle,
30    pub header_style: NumberingStyle,
31    pub footer_style: NumberingStyle,
32    pub section_delimiter: Vec<u8>,
33    pub line_increment: i64,
34    pub join_blank_lines: usize,
35    pub number_format: NumberFormat,
36    pub no_renumber: bool,
37    pub number_separator: Vec<u8>,
38    pub starting_line_number: i64,
39    pub number_width: usize,
40}
41
42impl Default for NlConfig {
43    fn default() -> Self {
44        Self {
45            body_style: NumberingStyle::NonEmpty,
46            header_style: NumberingStyle::None,
47            footer_style: NumberingStyle::None,
48            section_delimiter: vec![b'\\', b':'],
49            line_increment: 1,
50            join_blank_lines: 1,
51            number_format: NumberFormat::Rn,
52            no_renumber: false,
53            number_separator: vec![b'\t'],
54            starting_line_number: 1,
55            number_width: 6,
56        }
57    }
58}
59
60/// Parse a numbering style string.
61pub fn parse_numbering_style(s: &str) -> Result<NumberingStyle, String> {
62    match s {
63        "a" => Ok(NumberingStyle::All),
64        "t" => Ok(NumberingStyle::NonEmpty),
65        "n" => Ok(NumberingStyle::None),
66        _ if s.starts_with('p') => {
67            let pattern = &s[1..];
68            match regex::Regex::new(pattern) {
69                Ok(re) => Ok(NumberingStyle::Regex(re)),
70                Err(e) => Err(format!("invalid regular expression: {}", e)),
71            }
72        }
73        _ => Err(format!("invalid numbering style: '{}'", s)),
74    }
75}
76
77/// Parse a number format string.
78pub fn parse_number_format(s: &str) -> Result<NumberFormat, String> {
79    match s {
80        "ln" => Ok(NumberFormat::Ln),
81        "rn" => Ok(NumberFormat::Rn),
82        "rz" => Ok(NumberFormat::Rz),
83        _ => Err(format!("invalid line numbering: '{}'", s)),
84    }
85}
86
/// The kind of logical-page section currently being processed.
///
/// `Debug` and `Eq` are derived so values can be used with `assert_eq!` and
/// printed in diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Section {
    /// Header section (introduced by the delimiter repeated three times).
    Header,
    /// Body section (introduced by the delimiter repeated twice).
    Body,
    /// Footer section (introduced by a single delimiter).
    Footer,
}
94
95/// Check if a line is a section delimiter.
96#[inline]
97fn check_section_delimiter(line: &[u8], delim: &[u8]) -> Option<Section> {
98    if delim.is_empty() {
99        return None;
100    }
101    let dlen = delim.len();
102
103    // Check header (3x)
104    if line.len() == dlen * 3 {
105        let mut is_header = true;
106        for i in 0..3 {
107            if &line[i * dlen..(i + 1) * dlen] != delim {
108                is_header = false;
109                break;
110            }
111        }
112        if is_header {
113            return Some(Section::Header);
114        }
115    }
116
117    // Check body (2x)
118    if line.len() == dlen * 2 && &line[..dlen] == delim && &line[dlen..] == delim {
119        return Some(Section::Body);
120    }
121
122    // Check footer (1x)
123    if line.len() == dlen && line == delim {
124        return Some(Section::Footer);
125    }
126
127    None
128}
129
130/// Format a line number according to the format and width.
131#[inline]
132fn format_number(num: i64, format: NumberFormat, width: usize, buf: &mut Vec<u8>) {
133    let mut num_buf = itoa::Buffer::new();
134    let num_str = num_buf.format(num);
135
136    match format {
137        NumberFormat::Ln => {
138            buf.extend_from_slice(num_str.as_bytes());
139            let pad = width.saturating_sub(num_str.len());
140            buf.resize(buf.len() + pad, b' ');
141        }
142        NumberFormat::Rn => {
143            let pad = width.saturating_sub(num_str.len());
144            buf.resize(buf.len() + pad, b' ');
145            buf.extend_from_slice(num_str.as_bytes());
146        }
147        NumberFormat::Rz => {
148            if num < 0 {
149                buf.push(b'-');
150                let abs_str = &num_str[1..];
151                let pad = width.saturating_sub(abs_str.len() + 1);
152                buf.resize(buf.len() + pad, b'0');
153                buf.extend_from_slice(abs_str.as_bytes());
154            } else {
155                let pad = width.saturating_sub(num_str.len());
156                buf.resize(buf.len() + pad, b'0');
157                buf.extend_from_slice(num_str.as_bytes());
158            }
159        }
160    }
161}
162
163/// Check if a line should be numbered based on the style.
164#[inline]
165fn should_number(line: &[u8], style: &NumberingStyle) -> bool {
166    match style {
167        NumberingStyle::All => true,
168        NumberingStyle::NonEmpty => !line.is_empty(),
169        NumberingStyle::None => false,
170        NumberingStyle::Regex(re) => match std::str::from_utf8(line) {
171            Ok(s) => re.is_match(s),
172            Err(_) => false,
173        },
174    }
175}
176
177/// Build the nl output into a Vec.
178pub fn nl_to_vec(data: &[u8], config: &NlConfig) -> Vec<u8> {
179    let mut line_number = config.starting_line_number;
180    nl_to_vec_with_state(data, config, &mut line_number)
181}
182
183/// Check if config is the simple "number all lines" case suitable for fast path.
184#[inline]
185fn is_simple_number_all(config: &NlConfig) -> bool {
186    matches!(config.body_style, NumberingStyle::All)
187        && matches!(config.header_style, NumberingStyle::None)
188        && matches!(config.footer_style, NumberingStyle::None)
189        && config.join_blank_lines == 1
190        && config.line_increment == 1
191        && !config.no_renumber
192        && config.number_width + config.number_separator.len() <= 30
193}
194
195/// Check if config is the default "number non-empty lines" case suitable for fast path.
196#[inline]
197fn is_simple_number_nonempty(config: &NlConfig) -> bool {
198    matches!(config.body_style, NumberingStyle::NonEmpty)
199        && matches!(config.header_style, NumberingStyle::None)
200        && matches!(config.footer_style, NumberingStyle::None)
201        && config.join_blank_lines == 1
202        && config.line_increment == 1
203        && !config.no_renumber
204        && config.number_width + config.number_separator.len() <= 30
205}
206
207/// Inner write helper: formats number prefix + line content + newline into buffer.
208/// SAFETY: caller ensures output has capacity for total_len bytes at start_pos.
209#[inline(always)]
210unsafe fn write_numbered_line(
211    output: &mut Vec<u8>,
212    fmt: NumberFormat,
213    num_str: &str,
214    pad: usize,
215    sep: &[u8],
216    line_data: *const u8,
217    line_len: usize,
218) {
219    unsafe {
220        let prefix_len = pad + num_str.len() + sep.len();
221        let total_len = prefix_len + line_len + 1;
222        let start_pos = output.len();
223        let dst = output.as_mut_ptr().add(start_pos);
224
225        match fmt {
226            NumberFormat::Rn => {
227                std::ptr::write_bytes(dst, b' ', pad);
228                std::ptr::copy_nonoverlapping(num_str.as_ptr(), dst.add(pad), num_str.len());
229            }
230            NumberFormat::Rz => {
231                std::ptr::write_bytes(dst, b'0', pad);
232                std::ptr::copy_nonoverlapping(num_str.as_ptr(), dst.add(pad), num_str.len());
233            }
234            NumberFormat::Ln => {
235                std::ptr::copy_nonoverlapping(num_str.as_ptr(), dst, num_str.len());
236                std::ptr::write_bytes(dst.add(num_str.len()), b' ', pad);
237            }
238        }
239        std::ptr::copy_nonoverlapping(sep.as_ptr(), dst.add(pad + num_str.len()), sep.len());
240        std::ptr::copy_nonoverlapping(line_data, dst.add(prefix_len), line_len);
241        *dst.add(prefix_len + line_len) = b'\n';
242        output.set_len(start_pos + total_len);
243    }
244}
245
246/// Ultra-fast path for nl -b a: eliminates section delimiter checks and uses raw
247/// buffer writes. Handles all three number formats (Rn, Rz, Ln) in a single
248/// function to avoid code duplication.
249fn nl_number_all_fast(data: &[u8], config: &NlConfig, line_number: &mut i64) -> Vec<u8> {
250    let alloc = (data.len() * 2 + 256).min(128 * 1024 * 1024);
251    let mut output: Vec<u8> = Vec::with_capacity(alloc);
252
253    let width = config.number_width;
254    let sep = &config.number_separator;
255    let fmt = config.number_format;
256    let mut num = *line_number;
257    let mut pos: usize = 0;
258    let mut num_buf = itoa::Buffer::new();
259
260    for nl_pos in memchr::memchr_iter(b'\n', data) {
261        let line_len = nl_pos - pos;
262        let needed = output.len() + line_len + width + sep.len() + 22;
263        if needed > output.capacity() {
264            output.reserve(needed - output.capacity() + 4 * 1024 * 1024);
265        }
266
267        let num_str = num_buf.format(num);
268        let pad = width.saturating_sub(num_str.len());
269
270        unsafe {
271            write_numbered_line(
272                &mut output,
273                fmt,
274                num_str,
275                pad,
276                sep,
277                data.as_ptr().add(pos),
278                line_len,
279            );
280        }
281
282        num += 1;
283        pos = nl_pos + 1;
284    }
285
286    // Handle final line without trailing newline
287    if pos < data.len() {
288        let remaining = data.len() - pos;
289        let needed = output.len() + remaining + width + sep.len() + 22;
290        if needed > output.capacity() {
291            output.reserve(needed - output.capacity() + 1024);
292        }
293        let num_str = num_buf.format(num);
294        let pad = width.saturating_sub(num_str.len());
295
296        unsafe {
297            write_numbered_line(
298                &mut output,
299                fmt,
300                num_str,
301                pad,
302                sep,
303                data.as_ptr().add(pos),
304                remaining,
305            );
306        }
307        num += 1;
308    }
309
310    *line_number = num;
311    output
312}
313
314/// Streaming fast path for nl -b a: writes output in ~1MB batches directly to fd.
315/// Uses pre-formatted prefix in a stack-allocated buffer with in-place digit
316/// increment to avoid reformatting the number string for every single line.
317/// Raw write_pos tracking eliminates per-line Vec metadata overhead.
318#[cfg(unix)]
319fn nl_number_all_stream(
320    data: &[u8],
321    config: &NlConfig,
322    line_number: &mut i64,
323    fd: i32,
324) -> std::io::Result<()> {
325    const BUF_SIZE: usize = 1024 * 1024; // 1MB output buffer
326
327    let width = config.number_width;
328    let sep = &config.number_separator;
329    let fmt = config.number_format;
330    let mut num = *line_number;
331    let mut pos: usize = 0;
332
333    let mut output: Vec<u8> = Vec::with_capacity(BUF_SIZE + 64 * 1024);
334    let mut buf_ptr = output.as_mut_ptr();
335    let mut write_pos: usize = 0;
336    let data_ptr = data.as_ptr();
337
338    // Use fixed-size array for prefix (avoid heap indirection)
339    let mut prefix_buf = [0u8; 32];
340    let mut prefix_len: usize;
341    let mut num_end: usize;
342
343    let mut num_buf = itoa::Buffer::new();
344
345    // Format initial prefix
346    {
347        let num_str = num_buf.format(num);
348        let pad = width.saturating_sub(num_str.len());
349        let mut wp = 0;
350        match fmt {
351            NumberFormat::Rn => {
352                for _ in 0..pad {
353                    prefix_buf[wp] = b' ';
354                    wp += 1;
355                }
356                prefix_buf[wp..wp + num_str.len()].copy_from_slice(num_str.as_bytes());
357                wp += num_str.len();
358            }
359            NumberFormat::Rz => {
360                for _ in 0..pad {
361                    prefix_buf[wp] = b'0';
362                    wp += 1;
363                }
364                prefix_buf[wp..wp + num_str.len()].copy_from_slice(num_str.as_bytes());
365                wp += num_str.len();
366            }
367            NumberFormat::Ln => {
368                prefix_buf[wp..wp + num_str.len()].copy_from_slice(num_str.as_bytes());
369                wp += num_str.len();
370                for _ in 0..pad {
371                    prefix_buf[wp] = b' ';
372                    wp += 1;
373                }
374            }
375        }
376        num_end = wp;
377        prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
378        wp += sep.len();
379        prefix_len = wp;
380    }
381
382    for nl_pos in memchr::memchr_iter(b'\n', data) {
383        let line_len = nl_pos - pos;
384
385        let needed = line_len + prefix_len + 2;
386        if write_pos + needed > BUF_SIZE {
387            unsafe {
388                output.set_len(write_pos);
389            }
390            write_all_fd(fd, &output)?;
391            write_pos = 0;
392            if needed > output.capacity() {
393                output.reserve(needed);
394                buf_ptr = output.as_mut_ptr();
395            }
396        }
397
398        unsafe {
399            let dst = buf_ptr.add(write_pos);
400            std::ptr::copy_nonoverlapping(prefix_buf.as_ptr(), dst, prefix_len);
401            std::ptr::copy_nonoverlapping(data_ptr.add(pos), dst.add(prefix_len), line_len);
402            *dst.add(prefix_len + line_len) = b'\n';
403        }
404        write_pos += prefix_len + line_len + 1;
405
406        num += 1;
407        pos = nl_pos + 1;
408
409        // In-place digit increment
410        match fmt {
411            NumberFormat::Rn | NumberFormat::Rz => {
412                let mut idx = num_end - 1;
413                loop {
414                    if prefix_buf[idx] < b'9' {
415                        prefix_buf[idx] += 1;
416                        break;
417                    }
418                    prefix_buf[idx] = b'0';
419                    if idx == 0 {
420                        let ns = num_buf.format(num);
421                        let p = width.saturating_sub(ns.len());
422                        let pc = if fmt == NumberFormat::Rz { b'0' } else { b' ' };
423                        let mut wp = 0;
424                        for _ in 0..p {
425                            prefix_buf[wp] = pc;
426                            wp += 1;
427                        }
428                        prefix_buf[wp..wp + ns.len()].copy_from_slice(ns.as_bytes());
429                        wp += ns.len();
430                        num_end = wp;
431                        prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
432                        prefix_len = wp + sep.len();
433                        break;
434                    }
435                    idx -= 1;
436                    let c = prefix_buf[idx];
437                    if c == b' ' || c == b'0' {
438                        prefix_buf[idx] = b'1';
439                        break;
440                    }
441                }
442            }
443            NumberFormat::Ln => {
444                let mut last_digit = 0;
445                for j in 0..num_end {
446                    if prefix_buf[j].is_ascii_digit() {
447                        last_digit = j;
448                    } else {
449                        break;
450                    }
451                }
452                let mut idx = last_digit;
453                loop {
454                    if prefix_buf[idx] < b'9' {
455                        prefix_buf[idx] += 1;
456                        break;
457                    }
458                    prefix_buf[idx] = b'0';
459                    if idx == 0 {
460                        let ns = num_buf.format(num);
461                        let p = width.saturating_sub(ns.len());
462                        let mut wp = 0;
463                        prefix_buf[wp..wp + ns.len()].copy_from_slice(ns.as_bytes());
464                        wp += ns.len();
465                        for _ in 0..p {
466                            prefix_buf[wp] = b' ';
467                            wp += 1;
468                        }
469                        num_end = wp;
470                        prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
471                        prefix_len = wp + sep.len();
472                        break;
473                    }
474                    idx -= 1;
475                    if prefix_buf[idx] == b' ' {
476                        let ns = num_buf.format(num);
477                        let p = width.saturating_sub(ns.len());
478                        let mut wp = 0;
479                        prefix_buf[wp..wp + ns.len()].copy_from_slice(ns.as_bytes());
480                        wp += ns.len();
481                        for _ in 0..p {
482                            prefix_buf[wp] = b' ';
483                            wp += 1;
484                        }
485                        num_end = wp;
486                        prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
487                        prefix_len = wp + sep.len();
488                        break;
489                    }
490                }
491            }
492        }
493    }
494
495    // Handle final line without trailing newline
496    if pos < data.len() {
497        let remaining = data.len() - pos;
498        let needed = prefix_len + remaining + 2;
499        if write_pos + needed > BUF_SIZE {
500            unsafe {
501                output.set_len(write_pos);
502            }
503            write_all_fd(fd, &output)?;
504            write_pos = 0;
505            if needed > output.capacity() {
506                output.reserve(needed);
507                buf_ptr = output.as_mut_ptr();
508            }
509        }
510        unsafe {
511            let dst = buf_ptr.add(write_pos);
512            std::ptr::copy_nonoverlapping(prefix_buf.as_ptr(), dst, prefix_len);
513            std::ptr::copy_nonoverlapping(data_ptr.add(pos), dst.add(prefix_len), remaining);
514            *dst.add(prefix_len + remaining) = b'\n';
515        }
516        write_pos += prefix_len + remaining + 1;
517        num += 1;
518    }
519
520    if write_pos > 0 {
521        unsafe {
522            output.set_len(write_pos);
523        }
524        write_all_fd(fd, &output)?;
525    }
526
527    *line_number = num;
528    Ok(())
529}
530
531/// Streaming fast path for default nl (body=NonEmpty): same optimization as
532/// nl_number_all_stream but skips numbering for blank lines.
533#[cfg(unix)]
534fn nl_number_nonempty_stream(
535    data: &[u8],
536    config: &NlConfig,
537    line_number: &mut i64,
538    fd: i32,
539) -> std::io::Result<()> {
540    const BUF_SIZE: usize = 1024 * 1024;
541
542    let width = config.number_width;
543    let sep = &config.number_separator;
544    let fmt = config.number_format;
545    let mut num = *line_number;
546    let mut pos: usize = 0;
547
548    let mut output: Vec<u8> = Vec::with_capacity(BUF_SIZE + 64 * 1024);
549    let mut buf_ptr = output.as_mut_ptr();
550    let mut write_pos: usize = 0;
551    let data_ptr = data.as_ptr();
552
553    let mut prefix_buf = [0u8; 32];
554    let mut prefix_len: usize;
555    let mut num_end: usize;
556    let mut num_buf = itoa::Buffer::new();
557
558    // Pre-compute blank line padding (width + separator filled with spaces)
559    let blank_pad = width + sep.len();
560
561    // Format initial prefix
562    {
563        let num_str = num_buf.format(num);
564        let pad = width.saturating_sub(num_str.len());
565        let mut wp = 0;
566        match fmt {
567            NumberFormat::Rn => {
568                for _ in 0..pad {
569                    prefix_buf[wp] = b' ';
570                    wp += 1;
571                }
572                prefix_buf[wp..wp + num_str.len()].copy_from_slice(num_str.as_bytes());
573                wp += num_str.len();
574            }
575            NumberFormat::Rz => {
576                for _ in 0..pad {
577                    prefix_buf[wp] = b'0';
578                    wp += 1;
579                }
580                prefix_buf[wp..wp + num_str.len()].copy_from_slice(num_str.as_bytes());
581                wp += num_str.len();
582            }
583            NumberFormat::Ln => {
584                prefix_buf[wp..wp + num_str.len()].copy_from_slice(num_str.as_bytes());
585                wp += num_str.len();
586                for _ in 0..pad {
587                    prefix_buf[wp] = b' ';
588                    wp += 1;
589                }
590            }
591        }
592        num_end = wp;
593        prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
594        wp += sep.len();
595        prefix_len = wp;
596    }
597
598    for nl_pos in memchr::memchr_iter(b'\n', data) {
599        let line_len = nl_pos - pos;
600
601        // For blank lines (line_len==0), actual bytes are blank_pad+1, so `needed`
602        // overestimates by ~prefix_len. Harmless: just flushes one line early at boundary.
603        let needed = line_len + prefix_len + 2;
604        if write_pos + needed > BUF_SIZE {
605            unsafe {
606                output.set_len(write_pos);
607            }
608            write_all_fd(fd, &output)?;
609            write_pos = 0;
610            // Grow buffer for oversized lines
611            if needed > output.capacity() {
612                output.reserve(needed);
613                buf_ptr = output.as_mut_ptr();
614            }
615        }
616
617        if line_len == 0 {
618            // Blank line: write spaces(width + sep_len) + newline, no numbering
619            // GNU nl replaces the separator with spaces for unnumbered lines
620            unsafe {
621                let dst = buf_ptr.add(write_pos);
622                std::ptr::write_bytes(dst, b' ', blank_pad);
623                *dst.add(blank_pad) = b'\n';
624            }
625            write_pos += blank_pad + 1;
626        } else {
627            // Non-blank line: write numbered prefix + content + newline
628            unsafe {
629                let dst = buf_ptr.add(write_pos);
630                std::ptr::copy_nonoverlapping(prefix_buf.as_ptr(), dst, prefix_len);
631                std::ptr::copy_nonoverlapping(data_ptr.add(pos), dst.add(prefix_len), line_len);
632                *dst.add(prefix_len + line_len) = b'\n';
633            }
634            write_pos += prefix_len + line_len + 1;
635
636            num += 1;
637
638            // In-place digit increment
639            match fmt {
640                NumberFormat::Rn | NumberFormat::Rz => {
641                    let mut idx = num_end - 1;
642                    loop {
643                        if prefix_buf[idx] < b'9' {
644                            prefix_buf[idx] += 1;
645                            break;
646                        }
647                        prefix_buf[idx] = b'0';
648                        if idx == 0 {
649                            let ns = num_buf.format(num);
650                            let p = width.saturating_sub(ns.len());
651                            let pc = if fmt == NumberFormat::Rz { b'0' } else { b' ' };
652                            let mut wp = 0;
653                            for _ in 0..p {
654                                prefix_buf[wp] = pc;
655                                wp += 1;
656                            }
657                            prefix_buf[wp..wp + ns.len()].copy_from_slice(ns.as_bytes());
658                            wp += ns.len();
659                            num_end = wp;
660                            prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
661                            prefix_len = wp + sep.len();
662                            break;
663                        }
664                        idx -= 1;
665                        let c = prefix_buf[idx];
666                        if c == b' ' || c == b'0' {
667                            prefix_buf[idx] = b'1';
668                            break;
669                        }
670                    }
671                }
672                NumberFormat::Ln => {
673                    let mut last_digit = 0;
674                    for j in 0..num_end {
675                        if prefix_buf[j].is_ascii_digit() {
676                            last_digit = j;
677                        } else {
678                            break;
679                        }
680                    }
681                    let mut idx = last_digit;
682                    loop {
683                        if prefix_buf[idx] < b'9' {
684                            prefix_buf[idx] += 1;
685                            break;
686                        }
687                        prefix_buf[idx] = b'0';
688                        if idx == 0 {
689                            let ns = num_buf.format(num);
690                            let p = width.saturating_sub(ns.len());
691                            let mut wp = 0;
692                            prefix_buf[wp..wp + ns.len()].copy_from_slice(ns.as_bytes());
693                            wp += ns.len();
694                            for _ in 0..p {
695                                prefix_buf[wp] = b' ';
696                                wp += 1;
697                            }
698                            num_end = wp;
699                            prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
700                            prefix_len = wp + sep.len();
701                            break;
702                        }
703                        idx -= 1;
704                        if prefix_buf[idx] == b' ' {
705                            let ns = num_buf.format(num);
706                            let p = width.saturating_sub(ns.len());
707                            let mut wp = 0;
708                            prefix_buf[wp..wp + ns.len()].copy_from_slice(ns.as_bytes());
709                            wp += ns.len();
710                            for _ in 0..p {
711                                prefix_buf[wp] = b' ';
712                                wp += 1;
713                            }
714                            num_end = wp;
715                            prefix_buf[wp..wp + sep.len()].copy_from_slice(sep);
716                            prefix_len = wp + sep.len();
717                            break;
718                        }
719                    }
720                }
721            }
722        }
723
724        pos = nl_pos + 1;
725    }
726
727    // Handle final line without trailing newline
728    if pos < data.len() {
729        let remaining = data.len() - pos;
730        let needed = prefix_len + remaining + 2;
731        if write_pos + needed > BUF_SIZE {
732            unsafe {
733                output.set_len(write_pos);
734            }
735            write_all_fd(fd, &output)?;
736            write_pos = 0;
737            if needed > output.capacity() {
738                output.reserve(needed);
739                buf_ptr = output.as_mut_ptr();
740            }
741        }
742        // Final partial line is always non-blank
743        unsafe {
744            let dst = buf_ptr.add(write_pos);
745            std::ptr::copy_nonoverlapping(prefix_buf.as_ptr(), dst, prefix_len);
746            std::ptr::copy_nonoverlapping(data_ptr.add(pos), dst.add(prefix_len), remaining);
747            *dst.add(prefix_len + remaining) = b'\n';
748        }
749        write_pos += prefix_len + remaining + 1;
750        num += 1;
751    }
752
753    if write_pos > 0 {
754        unsafe {
755            output.set_len(write_pos);
756        }
757        write_all_fd(fd, &output)?;
758    }
759
760    *line_number = num;
761    Ok(())
762}
763
764/// Streaming generic path: writes output in ~1MB batches directly to fd.
765/// Handles all numbering styles, section delimiters, and blank line joining.
766#[cfg(unix)]
767fn nl_generic_stream(
768    data: &[u8],
769    config: &NlConfig,
770    line_number: &mut i64,
771    fd: i32,
772) -> std::io::Result<()> {
773    if data.is_empty() {
774        return Ok(());
775    }
776
777    const BUF_SIZE: usize = 1024 * 1024; // 1MB output buffer
778
779    let mut output: Vec<u8> = Vec::with_capacity(BUF_SIZE + 64 * 1024);
780    let mut current_section = Section::Body;
781    let mut consecutive_blanks: usize = 0;
782    let mut start = 0;
783    let mut line_iter = memchr::memchr_iter(b'\n', data);
784
785    loop {
786        let (line, has_newline) = match line_iter.next() {
787            Some(pos) => (&data[start..pos], true),
788            None => {
789                if start < data.len() {
790                    (&data[start..], false)
791                } else {
792                    break;
793                }
794            }
795        };
796
797        // Flush when buffer is near capacity
798        if output.len() > BUF_SIZE {
799            write_all_fd(fd, &output)?;
800            output.clear();
801        }
802
803        // Check for section delimiter
804        if let Some(section) = check_section_delimiter(line, &config.section_delimiter) {
805            if !config.no_renumber {
806                *line_number = config.starting_line_number;
807            }
808            current_section = section;
809            consecutive_blanks = 0;
810            output.push(b'\n');
811            if has_newline {
812                start += line.len() + 1;
813            } else {
814                break;
815            }
816            continue;
817        }
818
819        let style = match current_section {
820            Section::Header => &config.header_style,
821            Section::Body => &config.body_style,
822            Section::Footer => &config.footer_style,
823        };
824
825        let is_blank = line.is_empty();
826
827        if is_blank {
828            consecutive_blanks += 1;
829        } else {
830            consecutive_blanks = 0;
831        }
832
833        let do_number = if is_blank && config.join_blank_lines > 1 {
834            if should_number(line, style) {
835                consecutive_blanks >= config.join_blank_lines
836            } else {
837                false
838            }
839        } else {
840            should_number(line, style)
841        };
842
843        if do_number {
844            if is_blank && config.join_blank_lines > 1 {
845                consecutive_blanks = 0;
846            }
847            format_number(
848                *line_number,
849                config.number_format,
850                config.number_width,
851                &mut output,
852            );
853            output.extend_from_slice(&config.number_separator);
854            output.extend_from_slice(line);
855            *line_number = line_number.wrapping_add(config.line_increment);
856        } else {
857            let total_pad = config.number_width + config.number_separator.len();
858            output.resize(output.len() + total_pad, b' ');
859            output.extend_from_slice(line);
860        }
861
862        if has_newline {
863            output.push(b'\n');
864            start += line.len() + 1;
865        } else {
866            output.push(b'\n');
867            break;
868        }
869    }
870
871    // Flush remaining
872    if !output.is_empty() {
873        write_all_fd(fd, &output)?;
874    }
875
876    Ok(())
877}
878
879/// Write buffer to a file descriptor, retrying on partial/interrupted writes.
880#[cfg(unix)]
881#[inline]
882fn write_all_fd(fd: i32, data: &[u8]) -> std::io::Result<()> {
883    let mut written = 0;
884    while written < data.len() {
885        let ret = unsafe {
886            libc::write(
887                fd,
888                data[written..].as_ptr() as *const libc::c_void,
889                (data.len() - written) as _,
890            )
891        };
892        if ret > 0 {
893            written += ret as usize;
894        } else if ret == 0 {
895            return Err(std::io::Error::new(
896                std::io::ErrorKind::WriteZero,
897                "write returned 0",
898            ));
899        } else {
900            let err = std::io::Error::last_os_error();
901            if err.kind() == std::io::ErrorKind::Interrupted {
902                continue;
903            }
904            return Err(err);
905        }
906    }
907    Ok(())
908}
909
910/// Stream nl output directly to a file descriptor in batched writes.
911/// This is the preferred entry point for the binary — avoids building the entire
912/// output in memory and instead flushes ~1MB chunks. For large files this
913/// dramatically reduces memory usage and write() syscall overhead.
914#[cfg(unix)]
915pub fn nl_stream_with_state(
916    data: &[u8],
917    config: &NlConfig,
918    line_number: &mut i64,
919    fd: i32,
920) -> std::io::Result<()> {
921    if data.is_empty() {
922        return Ok(());
923    }
924
925    // Fast path: number-all or number-nonempty with simple config
926    let is_all = is_simple_number_all(config);
927    let is_nonempty = !is_all && is_simple_number_nonempty(config);
928
929    if is_all || is_nonempty {
930        // Skip delimiter scan when delimiter is empty
931        let has_delimiters = if config.section_delimiter.is_empty() {
932            false
933        } else {
934            // Use memmem SIMD scan — fast for typical text without backslashes.
935            memchr::memmem::find(data, &config.section_delimiter).is_some()
936        };
937
938        if !has_delimiters {
939            return if is_all {
940                nl_number_all_stream(data, config, line_number, fd)
941            } else {
942                nl_number_nonempty_stream(data, config, line_number, fd)
943            };
944        }
945    }
946
947    nl_generic_stream(data, config, line_number, fd)
948}
949
950/// Build the nl output into a Vec, continuing numbering from `line_number`.
951/// Updates `line_number` in place so callers can continue across multiple files.
952pub fn nl_to_vec_with_state(data: &[u8], config: &NlConfig, line_number: &mut i64) -> Vec<u8> {
953    if data.is_empty() {
954        return Vec::new();
955    }
956
957    // Fast paths for common benchmark cases.
958    // Guard: skip fast path if data contains section delimiters (rare in practice).
959    let has_section_delims = !config.section_delimiter.is_empty()
960        && memchr::memmem::find(data, &config.section_delimiter).is_some();
961    if is_simple_number_all(config) && !has_section_delims {
962        return nl_number_all_fast(data, config, line_number);
963    }
964
965    // Generic path: pre-allocate generously instead of counting newlines
966    let alloc = (data.len() * 2 + 256).min(128 * 1024 * 1024);
967    let mut output: Vec<u8> = Vec::with_capacity(alloc);
968
969    let mut current_section = Section::Body;
970    let mut consecutive_blanks: usize = 0;
971
972    let mut start = 0;
973    let mut line_iter = memchr::memchr_iter(b'\n', data);
974
975    loop {
976        let (line, has_newline) = match line_iter.next() {
977            Some(pos) => (&data[start..pos], true),
978            None => {
979                if start < data.len() {
980                    (&data[start..], false)
981                } else {
982                    break;
983                }
984            }
985        };
986
987        // Check for section delimiter
988        if let Some(section) = check_section_delimiter(line, &config.section_delimiter) {
989            if !config.no_renumber {
990                *line_number = config.starting_line_number;
991            }
992            current_section = section;
993            consecutive_blanks = 0;
994            output.push(b'\n');
995            if has_newline {
996                start += line.len() + 1;
997            } else {
998                break;
999            }
1000            continue;
1001        }
1002
1003        let style = match current_section {
1004            Section::Header => &config.header_style,
1005            Section::Body => &config.body_style,
1006            Section::Footer => &config.footer_style,
1007        };
1008
1009        let is_blank = line.is_empty();
1010
1011        if is_blank {
1012            consecutive_blanks += 1;
1013        } else {
1014            consecutive_blanks = 0;
1015        }
1016
1017        let do_number = if is_blank && config.join_blank_lines > 1 {
1018            if should_number(line, style) {
1019                consecutive_blanks >= config.join_blank_lines
1020            } else {
1021                false
1022            }
1023        } else {
1024            should_number(line, style)
1025        };
1026
1027        if do_number {
1028            if is_blank && config.join_blank_lines > 1 {
1029                consecutive_blanks = 0;
1030            }
1031            format_number(
1032                *line_number,
1033                config.number_format,
1034                config.number_width,
1035                &mut output,
1036            );
1037            output.extend_from_slice(&config.number_separator);
1038            output.extend_from_slice(line);
1039            *line_number = line_number.wrapping_add(config.line_increment);
1040        } else {
1041            // Non-numbered lines: GNU nl outputs width + separator_len total spaces, then content
1042            let total_pad = config.number_width + config.number_separator.len();
1043            output.resize(output.len() + total_pad, b' ');
1044            output.extend_from_slice(line);
1045        }
1046
1047        if has_newline {
1048            output.push(b'\n');
1049            start += line.len() + 1;
1050        } else {
1051            // GNU nl always adds a trailing newline, even when the input lacks one
1052            // (but has content on the last line). Empty input produces empty output.
1053            output.push(b'\n');
1054            break;
1055        }
1056    }
1057
1058    output
1059}
1060
1061/// Number lines and write to the provided writer.
1062pub fn nl(data: &[u8], config: &NlConfig, out: &mut impl Write) -> std::io::Result<()> {
1063    let output = nl_to_vec(data, config);
1064    out.write_all(&output)
1065}
coreutils_rs/nl/core.rs

coreutils_rs/nl/
core.rs