Skip to main content

rgx/filter/
mod.rs

1//! `rgx filter` subcommand — live/non-interactive regex filter over stdin or a file.
2
3use std::io::{self, BufRead, BufReader, IsTerminal, Read, Write};
4use std::path::Path;
5
6use crate::config::cli::FilterArgs;
7use crate::engine::{self, CompiledRegex, EngineFlags};
8
9pub mod app;
10pub mod json_path;
11pub mod run;
12pub mod ui;
13pub use app::{FilterApp, Outcome};
14
/// Matching options shared by the non-interactive and TUI filter paths.
#[derive(Clone, Copy, Debug, Default)]
pub struct FilterOptions {
    /// Emit the lines that do *not* match the pattern instead of the ones
    /// that do.
    pub invert: bool,
    /// Compile the pattern with case-insensitive matching enabled.
    pub case_insensitive: bool,
}
20
21impl FilterOptions {
22    fn flags(&self) -> EngineFlags {
23        EngineFlags {
24            case_insensitive: self.case_insensitive,
25            ..EngineFlags::default()
26        }
27    }
28}
29
30/// Match one haystack against a compiled pattern and apply the `invert` flag.
31/// Returns `Some(spans)` if the line should be emitted — an empty `Vec` in
32/// invert mode (since we don't highlight "did-not-match" lines), or the actual
33/// match byte ranges otherwise. Returns `None` if the line should be filtered
34/// out. Centralizing this keeps `filter_lines`, `filter_lines_with_extracted`,
35/// and the TUI `collect_matches` paths from drifting.
36pub fn match_haystack(
37    compiled: &dyn CompiledRegex,
38    haystack: &str,
39    invert: bool,
40) -> Option<Vec<std::ops::Range<usize>>> {
41    let found = compiled.find_matches(haystack).unwrap_or_default();
42    let hit = !found.is_empty();
43    if hit == invert {
44        return None;
45    }
46    Some(if invert {
47        Vec::new()
48    } else {
49        found.into_iter().map(|m| m.start..m.end).collect()
50    })
51}
52
53/// Apply the pattern to each line. Returns the 0-indexed line numbers of every
54/// line whose match status (matches vs. invert) satisfies `options.invert`.
55///
56/// Returns `Err` if the pattern fails to compile. An empty pattern is treated
57/// as "match everything" (every line passes) so the TUI has a sensible default
58/// before the user types.
59pub fn filter_lines(
60    lines: &[String],
61    pattern: &str,
62    options: FilterOptions,
63) -> Result<Vec<usize>, String> {
64    if pattern.is_empty() {
65        // Empty pattern — every line passes iff not inverted.
66        return Ok(if options.invert {
67            Vec::new()
68        } else {
69            (0..lines.len()).collect()
70        });
71    }
72
73    let engine = engine::create_engine(engine::detect_minimum_engine(pattern));
74    let compiled = engine
75        .compile(pattern, &options.flags())
76        .map_err(|e| e.to_string())?;
77
78    let mut indices = Vec::with_capacity(lines.len());
79    for (idx, line) in lines.iter().enumerate() {
80        if match_haystack(&*compiled, line, options.invert).is_some() {
81            indices.push(idx);
82        }
83    }
84    Ok(indices)
85}
86
87/// Apply the pattern to the extracted string for each line. Lines whose
88/// `extracted[i]` is `None` are excluded from the match set regardless of
89/// whether the pattern is empty or `invert` is set — a missing/non-string
90/// field is not a "line" for matching purposes.
91///
92/// Returns the 0-indexed line numbers of the raw input that should be emitted
93/// (i.e. whose extracted value satisfies the pattern + invert flag).
94pub fn filter_lines_with_extracted(
95    extracted: &[Option<String>],
96    pattern: &str,
97    options: FilterOptions,
98) -> Result<Vec<usize>, String> {
99    if pattern.is_empty() {
100        // Empty pattern matches every present extracted value. In invert mode
101        // that set becomes empty (an always-match pattern inverts to nothing).
102        // None entries are excluded either way.
103        if options.invert {
104            return Ok(Vec::new());
105        }
106        return Ok(extracted
107            .iter()
108            .enumerate()
109            .filter_map(|(idx, v)| v.as_ref().map(|_| idx))
110            .collect());
111    }
112
113    let engine = engine::create_engine(engine::detect_minimum_engine(pattern));
114    let compiled = engine
115        .compile(pattern, &options.flags())
116        .map_err(|e| e.to_string())?;
117
118    let mut indices = Vec::with_capacity(extracted.len());
119    for (idx, slot) in extracted.iter().enumerate() {
120        let Some(s) = slot else {
121            // Missing field or parse failure — never emit.
122            continue;
123        };
124        if match_haystack(&*compiled, s, options.invert).is_some() {
125            indices.push(idx);
126        }
127    }
128    Ok(indices)
129}
130
131/// Returns per-line extracted strings. `None` means the line should be excluded
132/// from matching (JSON parse failure, path miss, or non-string value). The
133/// returned vector has the same length as `lines`, so callers can index it
134/// directly alongside the raw lines.
135pub fn extract_strings(lines: &[String], path_expr: &str) -> Result<Vec<Option<String>>, String> {
136    let path = json_path::parse_path(path_expr)?;
137    let mut out = Vec::with_capacity(lines.len());
138    for line in lines {
139        let extracted = serde_json::from_str::<serde_json::Value>(line)
140            .ok()
141            .and_then(|v| {
142                json_path::extract(&v, &path).and_then(|v| v.as_str().map(str::to_string))
143            });
144        out.push(extracted);
145    }
146    Ok(out)
147}
148
// Process exit codes, following grep conventions.

/// At least one line was selected.
pub const EXIT_MATCH: i32 = 0;
/// The run completed but no line was selected.
pub const EXIT_NO_MATCH: i32 = 1;
/// The run failed (reported by `entry` after printing the error).
pub const EXIT_ERROR: i32 = 2;
153
/// Upper bound on the bytes kept from a single input line (10 MiB).
///
/// Anything beyond this on one line is truncated, so a single unterminated
/// multi-gigabyte line cannot exhaust memory before the `max_lines` cap has a
/// chance to apply. 10 MiB comfortably covers the largest real-world log
/// lines (long stack traces, flattened JSON payloads) without letting a
/// hostile stream run away.
pub const MAX_LINE_BYTES: usize = 10 << 20;
159
/// Write the selected lines to `writer`, one per output line, in the order
/// given by `matched`.
///
/// `matched` holds 0-indexed positions into `lines`. When `line_number` is
/// true, each line is prefixed with its 1-indexed line number and a colon.
///
/// # Errors
///
/// Propagates any write error from `writer`. An entry of `matched` that is
/// out of range for `lines` produces an [`io::ErrorKind::InvalidInput`] error
/// rather than a panic; lines preceding the bad entry have already been
/// written at that point.
pub fn emit_matches(
    writer: &mut dyn Write,
    lines: &[String],
    matched: &[usize],
    line_number: bool,
) -> io::Result<()> {
    for &idx in matched {
        // Checked lookup: a stale or foreign index must not abort the process.
        let line = lines.get(idx).ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                format!(
                    "matched index {idx} is out of range for {} lines",
                    lines.len()
                ),
            )
        })?;
        if line_number {
            writeln!(writer, "{}:{line}", idx + 1)?;
        } else {
            writeln!(writer, "{line}")?;
        }
    }
    Ok(())
}
177
/// Write just the number of matched lines, followed by a newline.
pub fn emit_count(writer: &mut dyn Write, matched_count: usize) -> io::Result<()> {
    let rendered = format!("{matched_count}\n");
    writer.write_all(rendered.as_bytes())
}
182
183/// Read all lines from either a file path or the provided reader (typically stdin).
184/// Trailing `\n`/`\r\n` is stripped per line. A trailing empty line (from a
185/// terminating newline) is dropped.
186///
187/// Invalid UTF-8 bytes are replaced with `U+FFFD REPLACEMENT CHARACTER` rather
188/// than aborting the read — this matches `grep`'s behavior and keeps the filter
189/// usable against binary-ish logs (e.g. files with stray latin-1 bytes).
190///
191/// `max_lines` caps the number of lines read to prevent OOM on unbounded
192/// streams. Pass `0` to disable the cap. Individual lines above
193/// `MAX_LINE_BYTES` are truncated (the rest of that line is discarded) so a
194/// single unterminated multi-gigabyte line cannot OOM the process before the
195/// line cap kicks in.
196///
197/// Returns `(lines, line_truncated, byte_truncated)`:
198/// * `line_truncated` — the line-count cap was reached before end-of-input.
199/// * `byte_truncated` — at least one line exceeded `MAX_LINE_BYTES` and was truncated.
200pub fn read_input(
201    file: Option<&Path>,
202    fallback: impl Read,
203    max_lines: usize,
204) -> io::Result<(Vec<String>, bool, bool)> {
205    let mut reader: Box<dyn BufRead> = match file {
206        Some(path) => Box::new(BufReader::new(std::fs::File::open(path)?)),
207        None => Box::new(BufReader::new(fallback)),
208    };
209    let mut out = Vec::new();
210    let mut buf = Vec::new();
211    let mut line_truncated = false;
212    let mut byte_truncated = false;
213    // +1 so `read_until` will still consume the terminating newline when the
214    // line is exactly `MAX_LINE_BYTES` bytes of content.
215    let line_limit = MAX_LINE_BYTES as u64 + 1;
216    loop {
217        if max_lines != 0 && out.len() >= max_lines {
218            // Peek one byte: is there any more data after the cap? Only then
219            // do we flag truncation, so callers don't warn about files that
220            // just happen to have exactly `max_lines` lines. A single byte is
221            // enough to decide, and caps the peek so a giant post-cap line
222            // can't OOM us.
223            let mut one = [0u8; 1];
224            if reader.read(&mut one)? > 0 {
225                line_truncated = true;
226            }
227            break;
228        }
229        buf.clear();
230        let n = (&mut reader).take(line_limit).read_until(b'\n', &mut buf)?;
231        if n == 0 {
232            break;
233        }
234        // If we filled the limited reader without seeing `\n`, this line
235        // exceeds MAX_LINE_BYTES. Drain the remainder on the unlimited
236        // reader so the next iteration starts at the true next line, and
237        // truncate `buf` down to the cap (the extra byte came from the `+1`
238        // we allowed so ordinary MAX_LINE_BYTES-long lines still capture
239        // their terminating newline).
240        let line_overflowed = buf.last() != Some(&b'\n') && n as u64 == line_limit;
241        if line_overflowed {
242            byte_truncated = true;
243            buf.truncate(MAX_LINE_BYTES);
244            // Drain the rest of the overflowed line in bounded 64 KiB chunks
245            // to prevent OOM when the tail is itself very large with no newline.
246            let mut discard = Vec::with_capacity(65_536);
247            loop {
248                discard.clear();
249                (&mut reader).take(65_536).read_until(b'\n', &mut discard)?;
250                if discard.is_empty() || discard.last() == Some(&b'\n') {
251                    break;
252                }
253            }
254        }
255        // Strip trailing \n and optional \r.
256        let end = buf
257            .iter()
258            .rposition(|b| *b != b'\n' && *b != b'\r')
259            .map(|i| i + 1)
260            .unwrap_or(0);
261        out.push(String::from_utf8_lossy(&buf[..end]).into_owned());
262    }
263    Ok((out, line_truncated, byte_truncated))
264}
265
266/// CLI entry point for `rgx filter`. Reads input, decides between non-interactive
267/// and TUI modes, and returns an exit code.
268pub fn entry(args: FilterArgs) -> i32 {
269    match run_entry(args) {
270        Ok(code) => code,
271        Err(msg) => {
272            eprintln!("rgx filter: {msg}");
273            EXIT_ERROR
274        }
275    }
276}
277
278fn run_entry(args: FilterArgs) -> Result<i32, String> {
279    let (lines, line_truncated, byte_truncated) =
280        read_input(args.file.as_deref(), io::stdin(), args.max_lines)
281            .map_err(|e| format!("reading input: {e}"))?;
282    if byte_truncated {
283        eprintln!(
284            "rgx filter: one or more lines exceeded {} bytes and were truncated",
285            MAX_LINE_BYTES
286        );
287    }
288    if line_truncated {
289        eprintln!(
290            "rgx filter: input truncated at {} lines (use --max-lines to override)",
291            args.max_lines
292        );
293    }
294
295    let options = FilterOptions {
296        invert: args.invert,
297        case_insensitive: args.case_insensitive,
298    };
299
300    // Non-interactive paths: --count, --line-number, or a pattern was given and
301    // stdout is not a TTY (so we're being piped).
302    let has_pattern = args.pattern.as_deref().is_some_and(|p| !p.is_empty());
303    let stdout_is_tty = io::stdout().is_terminal();
304    let non_interactive = args.count || args.line_number || (has_pattern && !stdout_is_tty);
305
306    // If --json was given, resolve the per-line extracted strings up front.
307    // We do this before splitting non-interactive vs. TUI so both paths
308    // see the same view of the input.
309    let json_extracted = if let Some(path_expr) = args.json.as_deref() {
310        Some(extract_strings(&lines, path_expr).map_err(|e| format!("--json: {e}"))?)
311    } else {
312        None
313    };
314
315    if non_interactive {
316        let pattern = args.pattern.unwrap_or_default();
317        let matched = match &json_extracted {
318            Some(extracted) => filter_lines_with_extracted(extracted, &pattern, options)
319                .map_err(|e| format!("pattern: {e}"))?,
320            None => filter_lines(&lines, &pattern, options).map_err(|e| format!("pattern: {e}"))?,
321        };
322
323        let mut stdout = io::stdout().lock();
324        if args.count {
325            emit_count(&mut stdout, matched.len()).map_err(|e| format!("writing output: {e}"))?;
326        } else {
327            // Emit the raw lines regardless of --json — users still get the
328            // full JSON records back, not just the extracted fields.
329            emit_matches(&mut stdout, &lines, &matched, args.line_number)
330                .map_err(|e| format!("writing output: {e}"))?;
331        }
332        return Ok(if matched.is_empty() {
333            EXIT_NO_MATCH
334        } else {
335            EXIT_MATCH
336        });
337    }
338
339    // TUI mode.
340    let initial_pattern = args.pattern.unwrap_or_default();
341    let app = match json_extracted {
342        Some(extracted) => {
343            FilterApp::with_json_extracted(lines, extracted, &initial_pattern, options)
344                .map_err(|e| format!("--json: {e}"))?
345        }
346        None => FilterApp::new(lines, &initial_pattern, options),
347    };
348    let (final_app, outcome) = run::run_tui(app).map_err(|e| format!("tui: {e}"))?;
349
350    match outcome {
351        Outcome::Emit => {
352            let mut stdout = io::stdout().lock();
353            emit_matches(&mut stdout, &final_app.lines, &final_app.matched, false)
354                .map_err(|e| format!("writing output: {e}"))?;
355            Ok(if final_app.matched.is_empty() {
356                EXIT_NO_MATCH
357            } else {
358                EXIT_MATCH
359            })
360        }
361        Outcome::Discard => Ok(EXIT_NO_MATCH),
362        Outcome::Pending => Ok(EXIT_ERROR),
363    }
364}