Skip to main content

rgx/filter/
mod.rs

1//! `rgx filter` subcommand — live/non-interactive regex filter over stdin or a file.
2
3use std::io::{self, BufRead, BufReader, IsTerminal, Read, Write};
4use std::path::Path;
5
6use crate::config::cli::FilterArgs;
7use crate::engine::{self, CompiledRegex, EngineFlags, EngineKind};
8
9pub mod app;
10pub mod json_path;
11pub mod run;
12pub mod ui;
13pub use app::{FilterApp, Outcome};
14
/// Matching options shared by the non-interactive and TUI filter paths.
///
/// `Default` yields a case-sensitive, non-inverted filter (both flags `false`).
#[derive(Debug, Clone, Copy, Default)]
pub struct FilterOptions {
    /// Emit the lines that do NOT match the pattern instead of those that do.
    pub invert: bool,
    /// Ignore letter case when matching; forwarded to the engine flags.
    pub case_insensitive: bool,
}
20
21impl FilterOptions {
22    fn flags(&self) -> EngineFlags {
23        EngineFlags {
24            case_insensitive: self.case_insensitive,
25            ..EngineFlags::default()
26        }
27    }
28}
29
30/// Match one haystack against a compiled pattern and apply the `invert` flag.
31/// Returns `Some(spans)` if the line should be emitted — an empty `Vec` in
32/// invert mode (since we don't highlight "did-not-match" lines), or the actual
33/// match byte ranges otherwise. Returns `None` if the line should be filtered
34/// out. Centralizing this keeps `filter_lines`, `filter_lines_with_extracted`,
35/// and the TUI `collect_matches` paths from drifting.
36pub fn match_haystack(
37    compiled: &dyn CompiledRegex,
38    haystack: &str,
39    invert: bool,
40) -> Option<Vec<std::ops::Range<usize>>> {
41    let found = compiled.find_matches(haystack).unwrap_or_default();
42    let hit = !found.is_empty();
43    if hit == invert {
44        return None;
45    }
46    Some(if invert {
47        Vec::new()
48    } else {
49        found.into_iter().map(|m| m.start..m.end).collect()
50    })
51}
52
53/// Apply the pattern to each line. Returns the 0-indexed line numbers of every
54/// line whose match status (matches vs. invert) satisfies `options.invert`.
55///
56/// Returns `Err` if the pattern fails to compile. An empty pattern is treated
57/// as "match everything" (every line passes) so the TUI has a sensible default
58/// before the user types.
59pub fn filter_lines(
60    lines: &[String],
61    pattern: &str,
62    options: FilterOptions,
63) -> Result<Vec<usize>, String> {
64    if pattern.is_empty() {
65        // Empty pattern — every line passes iff not inverted.
66        return Ok(if options.invert {
67            Vec::new()
68        } else {
69            (0..lines.len()).collect()
70        });
71    }
72
73    let engine = engine::create_engine(EngineKind::RustRegex);
74    let compiled = engine
75        .compile(pattern, &options.flags())
76        .map_err(|e| e.to_string())?;
77
78    let mut indices = Vec::with_capacity(lines.len());
79    for (idx, line) in lines.iter().enumerate() {
80        if match_haystack(&*compiled, line, options.invert).is_some() {
81            indices.push(idx);
82        }
83    }
84    Ok(indices)
85}
86
87/// Apply the pattern to the extracted string for each line. Lines whose
88/// `extracted[i]` is `None` are excluded from the match set regardless of
89/// whether the pattern is empty or `invert` is set — a missing/non-string
90/// field is not a "line" for matching purposes.
91///
92/// Returns the 0-indexed line numbers of the raw input that should be emitted
93/// (i.e. whose extracted value satisfies the pattern + invert flag).
94pub fn filter_lines_with_extracted(
95    extracted: &[Option<String>],
96    pattern: &str,
97    options: FilterOptions,
98) -> Result<Vec<usize>, String> {
99    if pattern.is_empty() {
100        // Empty pattern matches every present extracted value. In invert mode
101        // that set becomes empty (an always-match pattern inverts to nothing).
102        // None entries are excluded either way.
103        if options.invert {
104            return Ok(Vec::new());
105        }
106        return Ok(extracted
107            .iter()
108            .enumerate()
109            .filter_map(|(idx, v)| v.as_ref().map(|_| idx))
110            .collect());
111    }
112
113    let engine = engine::create_engine(EngineKind::RustRegex);
114    let compiled = engine
115        .compile(pattern, &options.flags())
116        .map_err(|e| e.to_string())?;
117
118    let mut indices = Vec::with_capacity(extracted.len());
119    for (idx, slot) in extracted.iter().enumerate() {
120        let Some(s) = slot else {
121            // Missing field or parse failure — never emit.
122            continue;
123        };
124        if match_haystack(&*compiled, s, options.invert).is_some() {
125            indices.push(idx);
126        }
127    }
128    Ok(indices)
129}
130
131/// Returns per-line extracted strings. `None` means the line should be excluded
132/// from matching (JSON parse failure, path miss, or non-string value). The
133/// returned vector has the same length as `lines`, so callers can index it
134/// directly alongside the raw lines.
135pub fn extract_strings(lines: &[String], path_expr: &str) -> Result<Vec<Option<String>>, String> {
136    let path = json_path::parse_path(path_expr)?;
137    let mut out = Vec::with_capacity(lines.len());
138    for line in lines {
139        let extracted = serde_json::from_str::<serde_json::Value>(line)
140            .ok()
141            .and_then(|v| {
142                json_path::extract(&v, &path).and_then(|v| v.as_str().map(str::to_string))
143            });
144        out.push(extracted);
145    }
146    Ok(out)
147}
148
// Exit codes, matching grep conventions.

/// Exit code returned when at least one line was selected.
pub const EXIT_MATCH: i32 = 0;
/// Exit code returned when no line was selected, or the TUI session was
/// discarded.
pub const EXIT_NO_MATCH: i32 = 1;
/// Exit code returned when something failed (reading input, compiling the
/// pattern, writing output, running the TUI) or the TUI ended in the
/// `Pending` state.
pub const EXIT_ERROR: i32 = 2;
153
/// Per-line byte cap applied by `read_input`. A line longer than this is cut
/// down to this many bytes and the input is reported as truncated. The cap
/// exists so that a single unterminated, arbitrarily long line cannot exhaust
/// memory before the `max_lines` limit gets a chance to help.
///
/// 10 MiB comfortably covers the largest real-world log lines (long stack
/// traces, flattened JSON payloads) without letting a hostile stream run away.
pub const MAX_LINE_BYTES: usize = 10 * 1024 * 1024;
159
/// Write the selected lines to `writer`, one per output line.
///
/// `matched` holds 0-indexed positions into `lines`; they are written in the
/// order given. With `line_number` set, each line is prefixed by its 1-indexed
/// line number and a colon (`N:text`).
///
/// # Errors
///
/// Stops at, and returns, the first write error.
///
/// # Panics
///
/// Panics if an index in `matched` is out of bounds for `lines`.
pub fn emit_matches(
    writer: &mut dyn Write,
    lines: &[String],
    matched: &[usize],
    line_number: bool,
) -> io::Result<()> {
    matched.iter().try_for_each(|&row| {
        let text = &lines[row];
        if line_number {
            writeln!(writer, "{}:{}", row + 1, text)
        } else {
            writeln!(writer, "{text}")
        }
    })
}
177
/// Write just the number of matched lines, followed by a newline.
///
/// # Errors
///
/// Returns any error reported by `writer`.
pub fn emit_count(writer: &mut dyn Write, matched_count: usize) -> io::Result<()> {
    writer.write_fmt(format_args!("{matched_count}\n"))
}
182
183/// Read all lines from either a file path or the provided reader (typically stdin).
184/// Trailing `\n`/`\r\n` is stripped per line. A trailing empty line (from a
185/// terminating newline) is dropped.
186///
187/// Invalid UTF-8 bytes are replaced with `U+FFFD REPLACEMENT CHARACTER` rather
188/// than aborting the read — this matches `grep`'s behavior and keeps the filter
189/// usable against binary-ish logs (e.g. files with stray latin-1 bytes).
190///
191/// `max_lines` caps the number of lines read to prevent OOM on unbounded
192/// streams. Pass `0` to disable the cap. Individual lines above
193/// `MAX_LINE_BYTES` are truncated (the rest of that line is discarded) so a
194/// single unterminated multi-gigabyte line cannot OOM the process before the
195/// line cap kicks in.
196///
197/// Returns `(lines, truncated)` where `truncated` is `true` if the line cap
198/// was reached before end-of-input OR any individual line was byte-truncated.
199pub fn read_input(
200    file: Option<&Path>,
201    fallback: impl Read,
202    max_lines: usize,
203) -> io::Result<(Vec<String>, bool)> {
204    let mut reader: Box<dyn BufRead> = match file {
205        Some(path) => Box::new(BufReader::new(std::fs::File::open(path)?)),
206        None => Box::new(BufReader::new(fallback)),
207    };
208    let mut out = Vec::new();
209    let mut buf = Vec::new();
210    let mut truncated = false;
211    // +1 so `read_until` will still consume the terminating newline when the
212    // line is exactly `MAX_LINE_BYTES` bytes of content.
213    let line_limit = MAX_LINE_BYTES as u64 + 1;
214    loop {
215        if max_lines != 0 && out.len() >= max_lines {
216            // Peek one byte: is there any more data after the cap? Only then
217            // do we flag truncation, so callers don't warn about files that
218            // just happen to have exactly `max_lines` lines. A single byte is
219            // enough to decide, and caps the peek so a giant post-cap line
220            // can't OOM us.
221            let mut one = [0u8; 1];
222            if reader.read(&mut one)? > 0 {
223                truncated = true;
224            }
225            break;
226        }
227        buf.clear();
228        let n = (&mut reader).take(line_limit).read_until(b'\n', &mut buf)?;
229        if n == 0 {
230            break;
231        }
232        // If we filled the limited reader without seeing `\n`, this line
233        // exceeds MAX_LINE_BYTES. Drain the remainder on the unlimited
234        // reader so the next iteration starts at the true next line, and
235        // truncate `buf` down to the cap (the extra byte came from the `+1`
236        // we allowed so ordinary MAX_LINE_BYTES-long lines still capture
237        // their terminating newline).
238        let line_overflowed = buf.last() != Some(&b'\n') && n as u64 == line_limit;
239        if line_overflowed {
240            truncated = true;
241            buf.truncate(MAX_LINE_BYTES);
242            let mut discard = Vec::new();
243            reader.read_until(b'\n', &mut discard)?;
244        }
245        // Strip trailing \n and optional \r.
246        let end = buf
247            .iter()
248            .rposition(|b| *b != b'\n' && *b != b'\r')
249            .map(|i| i + 1)
250            .unwrap_or(0);
251        out.push(String::from_utf8_lossy(&buf[..end]).into_owned());
252    }
253    Ok((out, truncated))
254}
255
256/// CLI entry point for `rgx filter`. Reads input, decides between non-interactive
257/// and TUI modes, and returns an exit code.
258pub fn entry(args: FilterArgs) -> i32 {
259    match run_entry(args) {
260        Ok(code) => code,
261        Err(msg) => {
262            eprintln!("rgx filter: {msg}");
263            EXIT_ERROR
264        }
265    }
266}
267
268fn run_entry(args: FilterArgs) -> Result<i32, String> {
269    let (lines, truncated) = read_input(args.file.as_deref(), io::stdin(), args.max_lines)
270        .map_err(|e| format!("reading input: {e}"))?;
271    if truncated {
272        eprintln!(
273            "rgx filter: input truncated at {} lines (use --max-lines to override)",
274            args.max_lines
275        );
276    }
277
278    let options = FilterOptions {
279        invert: args.invert,
280        case_insensitive: args.case_insensitive,
281    };
282
283    // Non-interactive paths: --count, --line-number, or a pattern was given and
284    // stdout is not a TTY (so we're being piped).
285    let has_pattern = args.pattern.as_deref().is_some_and(|p| !p.is_empty());
286    let stdout_is_tty = io::stdout().is_terminal();
287    let non_interactive = args.count || args.line_number || (has_pattern && !stdout_is_tty);
288
289    // If --json was given, resolve the per-line extracted strings up front.
290    // We do this before splitting non-interactive vs. TUI so both paths
291    // see the same view of the input.
292    let json_extracted = if let Some(path_expr) = args.json.as_deref() {
293        Some(extract_strings(&lines, path_expr).map_err(|e| format!("--json: {e}"))?)
294    } else {
295        None
296    };
297
298    if non_interactive {
299        let pattern = args.pattern.unwrap_or_default();
300        let matched = match &json_extracted {
301            Some(extracted) => filter_lines_with_extracted(extracted, &pattern, options)
302                .map_err(|e| format!("pattern: {e}"))?,
303            None => filter_lines(&lines, &pattern, options).map_err(|e| format!("pattern: {e}"))?,
304        };
305
306        let mut stdout = io::stdout().lock();
307        if args.count {
308            emit_count(&mut stdout, matched.len()).map_err(|e| format!("writing output: {e}"))?;
309        } else {
310            // Emit the raw lines regardless of --json — users still get the
311            // full JSON records back, not just the extracted fields.
312            emit_matches(&mut stdout, &lines, &matched, args.line_number)
313                .map_err(|e| format!("writing output: {e}"))?;
314        }
315        return Ok(if matched.is_empty() {
316            EXIT_NO_MATCH
317        } else {
318            EXIT_MATCH
319        });
320    }
321
322    // TUI mode.
323    let initial_pattern = args.pattern.unwrap_or_default();
324    let app = match json_extracted {
325        Some(extracted) => {
326            FilterApp::with_json_extracted(lines, extracted, &initial_pattern, options)
327                .map_err(|e| format!("--json: {e}"))?
328        }
329        None => FilterApp::new(lines, &initial_pattern, options),
330    };
331    let (final_app, outcome) = run::run_tui(app).map_err(|e| format!("tui: {e}"))?;
332
333    match outcome {
334        Outcome::Emit => {
335            let mut stdout = io::stdout().lock();
336            emit_matches(&mut stdout, &final_app.lines, &final_app.matched, false)
337                .map_err(|e| format!("writing output: {e}"))?;
338            Ok(if final_app.matched.is_empty() {
339                EXIT_NO_MATCH
340            } else {
341                EXIT_MATCH
342            })
343        }
344        Outcome::Discard => Ok(EXIT_NO_MATCH),
345        Outcome::Pending => Ok(EXIT_ERROR),
346    }
347}