rgx-cli 0.12.1

A terminal regex tester with real-time matching, multi-engine support, and plain-English explanations
Documentation
//! `rgx filter` subcommand — live/non-interactive regex filter over stdin or a file.

use std::io::{self, BufRead, BufReader, IsTerminal, Read, Write};
use std::path::Path;

use crate::config::cli::FilterArgs;
use crate::engine::{self, CompiledRegex, EngineFlags, EngineKind};

pub mod app;
pub mod json_path;
pub mod run;
pub mod ui;
pub use app::{FilterApp, Outcome};

#[derive(Debug, Clone, Copy, Default)]
pub struct FilterOptions {
    pub invert: bool,
    pub case_insensitive: bool,
}

impl FilterOptions {
    fn flags(&self) -> EngineFlags {
        EngineFlags {
            case_insensitive: self.case_insensitive,
            ..EngineFlags::default()
        }
    }
}

/// Match one haystack against a compiled pattern and apply the `invert` flag.
/// Returns `Some(spans)` if the line should be emitted — an empty `Vec` in
/// invert mode (since we don't highlight "did-not-match" lines), or the actual
/// match byte ranges otherwise. Returns `None` if the line should be filtered
/// out. Centralizing this keeps `filter_lines`, `filter_lines_with_extracted`,
/// and the TUI `collect_matches` paths from drifting.
pub fn match_haystack(
    compiled: &dyn CompiledRegex,
    haystack: &str,
    invert: bool,
) -> Option<Vec<std::ops::Range<usize>>> {
    let found = compiled.find_matches(haystack).unwrap_or_default();
    let hit = !found.is_empty();
    if hit == invert {
        return None;
    }
    Some(if invert {
        Vec::new()
    } else {
        found.into_iter().map(|m| m.start..m.end).collect()
    })
}

/// Apply the pattern to each line. Returns the 0-indexed line numbers of every
/// line whose match status (matches vs. invert) satisfies `options.invert`.
///
/// Returns `Err` if the pattern fails to compile. An empty pattern is treated
/// as "match everything" (every line passes) so the TUI has a sensible default
/// before the user types.
pub fn filter_lines(
    lines: &[String],
    pattern: &str,
    options: FilterOptions,
) -> Result<Vec<usize>, String> {
    if pattern.is_empty() {
        // Empty pattern — every line passes iff not inverted.
        return Ok(if options.invert {
            Vec::new()
        } else {
            (0..lines.len()).collect()
        });
    }

    let engine = engine::create_engine(EngineKind::RustRegex);
    let compiled = engine
        .compile(pattern, &options.flags())
        .map_err(|e| e.to_string())?;

    let mut indices = Vec::with_capacity(lines.len());
    for (idx, line) in lines.iter().enumerate() {
        if match_haystack(&*compiled, line, options.invert).is_some() {
            indices.push(idx);
        }
    }
    Ok(indices)
}

/// Apply the pattern to the extracted string for each line. Lines whose
/// `extracted[i]` is `None` are excluded from the match set regardless of
/// whether the pattern is empty or `invert` is set — a missing/non-string
/// field is not a "line" for matching purposes.
///
/// Returns the 0-indexed line numbers of the raw input that should be emitted
/// (i.e. whose extracted value satisfies the pattern + invert flag).
pub fn filter_lines_with_extracted(
    extracted: &[Option<String>],
    pattern: &str,
    options: FilterOptions,
) -> Result<Vec<usize>, String> {
    if pattern.is_empty() {
        // Empty pattern matches every present extracted value. In invert mode
        // that set becomes empty (an always-match pattern inverts to nothing).
        // None entries are excluded either way.
        if options.invert {
            return Ok(Vec::new());
        }
        return Ok(extracted
            .iter()
            .enumerate()
            .filter_map(|(idx, v)| v.as_ref().map(|_| idx))
            .collect());
    }

    let engine = engine::create_engine(EngineKind::RustRegex);
    let compiled = engine
        .compile(pattern, &options.flags())
        .map_err(|e| e.to_string())?;

    let mut indices = Vec::with_capacity(extracted.len());
    for (idx, slot) in extracted.iter().enumerate() {
        let Some(s) = slot else {
            // Missing field or parse failure — never emit.
            continue;
        };
        if match_haystack(&*compiled, s, options.invert).is_some() {
            indices.push(idx);
        }
    }
    Ok(indices)
}

/// Returns per-line extracted strings. `None` means the line should be excluded
/// from matching (JSON parse failure, path miss, or non-string value). The
/// returned vector has the same length as `lines`, so callers can index it
/// directly alongside the raw lines.
pub fn extract_strings(lines: &[String], path_expr: &str) -> Result<Vec<Option<String>>, String> {
    let path = json_path::parse_path(path_expr)?;
    let mut out = Vec::with_capacity(lines.len());
    for line in lines {
        let extracted = serde_json::from_str::<serde_json::Value>(line)
            .ok()
            .and_then(|v| {
                json_path::extract(&v, &path).and_then(|v| v.as_str().map(str::to_string))
            });
        out.push(extracted);
    }
    Ok(out)
}

/// Exit codes, matching grep conventions.
pub const EXIT_MATCH: i32 = 0;
pub const EXIT_NO_MATCH: i32 = 1;
pub const EXIT_ERROR: i32 = 2;

/// Per-line byte cap. A single line above this size is truncated — prevents
/// one unterminated multi-gigabyte line from OOMing before `max_lines` helps.
/// 10 MiB comfortably covers the largest real-world log lines (long stack
/// traces, flattened JSON payloads) without letting a hostile stream run away.
pub const MAX_LINE_BYTES: usize = 10 * 1024 * 1024;

/// Emit matching lines to `writer`. If `line_number` is true, each line is
/// prefixed with its 1-indexed line number and a colon.
pub fn emit_matches(
    writer: &mut dyn Write,
    lines: &[String],
    matched: &[usize],
    line_number: bool,
) -> io::Result<()> {
    for &idx in matched {
        if line_number {
            writeln!(writer, "{}:{}", idx + 1, lines[idx])?;
        } else {
            writeln!(writer, "{}", lines[idx])?;
        }
    }
    Ok(())
}

/// Emit only the count of matched lines.
pub fn emit_count(writer: &mut dyn Write, matched_count: usize) -> io::Result<()> {
    writeln!(writer, "{matched_count}")
}

/// Read all lines from either a file path or the provided reader (typically stdin).
/// Trailing `\n`/`\r\n` is stripped per line. A trailing empty line (from a
/// terminating newline) is dropped.
///
/// Invalid UTF-8 bytes are replaced with `U+FFFD REPLACEMENT CHARACTER` rather
/// than aborting the read — this matches `grep`'s behavior and keeps the filter
/// usable against binary-ish logs (e.g. files with stray latin-1 bytes).
///
/// `max_lines` caps the number of lines read to prevent OOM on unbounded
/// streams. Pass `0` to disable the cap. Individual lines above
/// `MAX_LINE_BYTES` are truncated (the rest of that line is discarded) so a
/// single unterminated multi-gigabyte line cannot OOM the process before the
/// line cap kicks in.
///
/// Returns `(lines, truncated)` where `truncated` is `true` if the line cap
/// was reached before end-of-input OR any individual line was byte-truncated.
pub fn read_input(
    file: Option<&Path>,
    fallback: impl Read,
    max_lines: usize,
) -> io::Result<(Vec<String>, bool)> {
    let mut reader: Box<dyn BufRead> = match file {
        Some(path) => Box::new(BufReader::new(std::fs::File::open(path)?)),
        None => Box::new(BufReader::new(fallback)),
    };
    let mut out = Vec::new();
    let mut buf = Vec::new();
    let mut truncated = false;
    // +1 so `read_until` will still consume the terminating newline when the
    // line is exactly `MAX_LINE_BYTES` bytes of content.
    let line_limit = MAX_LINE_BYTES as u64 + 1;
    loop {
        if max_lines != 0 && out.len() >= max_lines {
            // Peek one byte: is there any more data after the cap? Only then
            // do we flag truncation, so callers don't warn about files that
            // just happen to have exactly `max_lines` lines. A single byte is
            // enough to decide, and caps the peek so a giant post-cap line
            // can't OOM us.
            let mut one = [0u8; 1];
            if reader.read(&mut one)? > 0 {
                truncated = true;
            }
            break;
        }
        buf.clear();
        let n = (&mut reader).take(line_limit).read_until(b'\n', &mut buf)?;
        if n == 0 {
            break;
        }
        // If we filled the limited reader without seeing `\n`, this line
        // exceeds MAX_LINE_BYTES. Drain the remainder on the unlimited
        // reader so the next iteration starts at the true next line, and
        // truncate `buf` down to the cap (the extra byte came from the `+1`
        // we allowed so ordinary MAX_LINE_BYTES-long lines still capture
        // their terminating newline).
        let line_overflowed = buf.last() != Some(&b'\n') && n as u64 == line_limit;
        if line_overflowed {
            truncated = true;
            buf.truncate(MAX_LINE_BYTES);
            let mut discard = Vec::new();
            reader.read_until(b'\n', &mut discard)?;
        }
        // Strip trailing \n and optional \r.
        let end = buf
            .iter()
            .rposition(|b| *b != b'\n' && *b != b'\r')
            .map(|i| i + 1)
            .unwrap_or(0);
        out.push(String::from_utf8_lossy(&buf[..end]).into_owned());
    }
    Ok((out, truncated))
}

/// CLI entry point for `rgx filter`. Reads input, decides between non-interactive
/// and TUI modes, and returns an exit code.
pub fn entry(args: FilterArgs) -> i32 {
    match run_entry(args) {
        Ok(code) => code,
        Err(msg) => {
            eprintln!("rgx filter: {msg}");
            EXIT_ERROR
        }
    }
}

fn run_entry(args: FilterArgs) -> Result<i32, String> {
    let (lines, truncated) = read_input(args.file.as_deref(), io::stdin(), args.max_lines)
        .map_err(|e| format!("reading input: {e}"))?;
    if truncated {
        eprintln!(
            "rgx filter: input truncated at {} lines (use --max-lines to override)",
            args.max_lines
        );
    }

    let options = FilterOptions {
        invert: args.invert,
        case_insensitive: args.case_insensitive,
    };

    // Non-interactive paths: --count, --line-number, or a pattern was given and
    // stdout is not a TTY (so we're being piped).
    let has_pattern = args.pattern.as_deref().is_some_and(|p| !p.is_empty());
    let stdout_is_tty = io::stdout().is_terminal();
    let non_interactive = args.count || args.line_number || (has_pattern && !stdout_is_tty);

    // If --json was given, resolve the per-line extracted strings up front.
    // We do this before splitting non-interactive vs. TUI so both paths
    // see the same view of the input.
    let json_extracted = if let Some(path_expr) = args.json.as_deref() {
        Some(extract_strings(&lines, path_expr).map_err(|e| format!("--json: {e}"))?)
    } else {
        None
    };

    if non_interactive {
        let pattern = args.pattern.unwrap_or_default();
        let matched = match &json_extracted {
            Some(extracted) => filter_lines_with_extracted(extracted, &pattern, options)
                .map_err(|e| format!("pattern: {e}"))?,
            None => filter_lines(&lines, &pattern, options).map_err(|e| format!("pattern: {e}"))?,
        };

        let mut stdout = io::stdout().lock();
        if args.count {
            emit_count(&mut stdout, matched.len()).map_err(|e| format!("writing output: {e}"))?;
        } else {
            // Emit the raw lines regardless of --json — users still get the
            // full JSON records back, not just the extracted fields.
            emit_matches(&mut stdout, &lines, &matched, args.line_number)
                .map_err(|e| format!("writing output: {e}"))?;
        }
        return Ok(if matched.is_empty() {
            EXIT_NO_MATCH
        } else {
            EXIT_MATCH
        });
    }

    // TUI mode.
    let initial_pattern = args.pattern.unwrap_or_default();
    let app = match json_extracted {
        Some(extracted) => {
            FilterApp::with_json_extracted(lines, extracted, &initial_pattern, options)
                .map_err(|e| format!("--json: {e}"))?
        }
        None => FilterApp::new(lines, &initial_pattern, options),
    };
    let (final_app, outcome) = run::run_tui(app).map_err(|e| format!("tui: {e}"))?;

    match outcome {
        Outcome::Emit => {
            let mut stdout = io::stdout().lock();
            emit_matches(&mut stdout, &final_app.lines, &final_app.matched, false)
                .map_err(|e| format!("writing output: {e}"))?;
            Ok(if final_app.matched.is_empty() {
                EXIT_NO_MATCH
            } else {
                EXIT_MATCH
            })
        }
        Outcome::Discard => Ok(EXIT_NO_MATCH),
        Outcome::Pending => Ok(EXIT_ERROR),
    }
}