Skip to main content

sql_cli/sql/generators/
file_readers.rs

1use crate::data::datatable::{DataColumn, DataRow, DataTable, DataValue};
2use crate::sql::generators::TableGenerator;
3use anyhow::{anyhow, Result};
4use regex::Regex;
5use std::fs::File;
6use std::io::{BufRead, BufReader};
7use std::sync::Arc;
8
/// Upper bound on the number of rows a single file-reader call will return.
///
/// Exists so that pointing a reader at a multi-GB log does not pull the whole
/// thing into memory by accident. Letting users raise the limit through a
/// session setting is future work.
const MAX_LINES_PER_FILE: usize = 1_000 * 1_000;
13
14/// Extract a string argument, erroring if the arg is missing/NULL/non-string.
15fn require_string(args: &[DataValue], idx: usize, name: &str) -> Result<String> {
16    match args.get(idx) {
17        Some(DataValue::String(s)) => Ok(s.clone()),
18        Some(DataValue::InternedString(s)) => Ok(s.as_str().to_string()),
19        Some(DataValue::Null) | None => Err(anyhow!("{} requires argument {}", name, idx + 1)),
20        Some(v) => Err(anyhow!(
21            "{} argument {} must be a string, got {:?}",
22            name,
23            idx + 1,
24            v
25        )),
26    }
27}
28
29/// Extract an optional string argument. Returns None for missing or NULL.
30fn optional_string(args: &[DataValue], idx: usize) -> Option<String> {
31    match args.get(idx) {
32        Some(DataValue::String(s)) => Some(s.clone()),
33        Some(DataValue::InternedString(s)) => Some(s.as_str().to_string()),
34        _ => None,
35    }
36}
37
38/// Open a file and stream its lines, applying an optional include-regex filter
39/// and the global truncation cap. Emits a stderr warning when truncation kicks in.
40///
41/// Returns (line_num, line) pairs where `line_num` is the original 1-based line
42/// number in the source file — so numbers are preserved through filtering.
43fn read_filtered_lines(path: &str, match_regex: Option<&Regex>) -> Result<Vec<(i64, String)>> {
44    let file = File::open(path).map_err(|e| anyhow!("Failed to open '{}': {}", path, e))?;
45    let reader = BufReader::new(file);
46
47    let mut out = Vec::new();
48    let mut truncated = false;
49
50    for (idx, line_result) in reader.lines().enumerate() {
51        let line = line_result.map_err(|e| anyhow!("Error reading '{}': {}", path, e))?;
52        let line_num = (idx + 1) as i64;
53
54        if let Some(re) = match_regex {
55            if !re.is_match(&line) {
56                continue;
57            }
58        }
59
60        if out.len() >= MAX_LINES_PER_FILE {
61            truncated = true;
62            break;
63        }
64        out.push((line_num, line));
65    }
66
67    if truncated {
68        eprintln!(
69            "WARNING: truncated to {} rows (max_lines_per_file cap) when reading '{}'",
70            MAX_LINES_PER_FILE, path
71        );
72    }
73
74    Ok(out)
75}
76
77/// READ_TEXT(path [, match_regex]) - Read a text file line by line.
78///
79/// Emits `(line_num, line)` rows. Optional `match_regex` filters source lines
80/// *before* materializing them, which is the primary fast path for large logs.
81pub struct ReadText;
82
83impl TableGenerator for ReadText {
84    fn name(&self) -> &str {
85        "READ_TEXT"
86    }
87
88    fn columns(&self) -> Vec<DataColumn> {
89        vec![DataColumn::new("line_num"), DataColumn::new("line")]
90    }
91
92    fn generate(&self, args: Vec<DataValue>) -> Result<Arc<DataTable>> {
93        if args.is_empty() || args.len() > 2 {
94            return Err(anyhow!(
95                "READ_TEXT expects 1 or 2 arguments: (path [, match_regex])"
96            ));
97        }
98
99        let path = require_string(&args, 0, "READ_TEXT")?;
100        let match_regex = optional_string(&args, 1)
101            .map(|s| Regex::new(&s).map_err(|e| anyhow!("Invalid match_regex: {}", e)))
102            .transpose()?;
103
104        let lines = read_filtered_lines(&path, match_regex.as_ref())?;
105
106        let mut table = DataTable::new("read_text");
107        table.add_column(DataColumn::new("line_num"));
108        table.add_column(DataColumn::new("line"));
109
110        for (line_num, line) in lines {
111            table
112                .add_row(DataRow::new(vec![
113                    DataValue::Integer(line_num),
114                    DataValue::String(line),
115                ]))
116                .map_err(|e| anyhow!(e))?;
117        }
118
119        Ok(Arc::new(table))
120    }
121
122    fn description(&self) -> &str {
123        "Read a text file line-by-line. Optional second arg is a regex that filters lines at read time."
124    }
125
126    fn arg_count(&self) -> usize {
127        2
128    }
129}
130
131/// GREP(path, pattern [, invert]) - Read only lines matching a regex.
132///
133/// Thin composable wrapper around READ_TEXT's filter path. Third argument
134/// (boolean or integer truthy value) inverts the match, matching `grep -v`.
135pub struct Grep;
136
137impl TableGenerator for Grep {
138    fn name(&self) -> &str {
139        "GREP"
140    }
141
142    fn columns(&self) -> Vec<DataColumn> {
143        vec![DataColumn::new("line_num"), DataColumn::new("line")]
144    }
145
146    fn generate(&self, args: Vec<DataValue>) -> Result<Arc<DataTable>> {
147        if args.len() < 2 || args.len() > 3 {
148            return Err(anyhow!(
149                "GREP expects 2 or 3 arguments: (path, pattern [, invert])"
150            ));
151        }
152
153        let path = require_string(&args, 0, "GREP")?;
154        let pattern_str = require_string(&args, 1, "GREP")?;
155        let pattern =
156            Regex::new(&pattern_str).map_err(|e| anyhow!("Invalid GREP pattern: {}", e))?;
157
158        let invert = match args.get(2) {
159            Some(DataValue::Boolean(b)) => *b,
160            Some(DataValue::Integer(n)) => *n != 0,
161            Some(DataValue::Null) | None => false,
162            Some(v) => return Err(anyhow!("GREP invert flag must be boolean, got {:?}", v)),
163        };
164
165        // When not inverted we can push the filter down into the file reader for
166        // the fast path. When inverted we still iterate every line.
167        let lines = if invert {
168            let all = read_filtered_lines(&path, None)?;
169            all.into_iter()
170                .filter(|(_, line)| !pattern.is_match(line))
171                .collect::<Vec<_>>()
172        } else {
173            read_filtered_lines(&path, Some(&pattern))?
174        };
175
176        let mut table = DataTable::new("grep");
177        table.add_column(DataColumn::new("line_num"));
178        table.add_column(DataColumn::new("line"));
179
180        for (line_num, line) in lines {
181            table
182                .add_row(DataRow::new(vec![
183                    DataValue::Integer(line_num),
184                    DataValue::String(line),
185                ]))
186                .map_err(|e| anyhow!(e))?;
187        }
188
189        Ok(Arc::new(table))
190    }
191
192    fn description(&self) -> &str {
193        "Read only lines matching a regex (third arg inverts the match, like grep -v)"
194    }
195
196    fn arg_count(&self) -> usize {
197        3
198    }
199}
200
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a temp file holding `contents` and hand back its handle.
    fn write_tmp(contents: &str) -> NamedTempFile {
        let mut tmp = NamedTempFile::new().unwrap();
        tmp.write_all(contents.as_bytes()).unwrap();
        tmp
    }

    /// The temp file's path wrapped as a string argument for a generator.
    fn path_arg(tmp: &NamedTempFile) -> DataValue {
        DataValue::String(tmp.path().to_string_lossy().to_string())
    }

    /// Shorthand for a `DataValue::String` built from a literal.
    fn text(s: &str) -> DataValue {
        DataValue::String(s.to_string())
    }

    #[test]
    fn test_read_text_returns_all_lines() {
        let tmp = write_tmp("one\ntwo\nthree\n");
        let table = ReadText.generate(vec![path_arg(&tmp)]).unwrap();

        assert_eq!(table.row_count(), 3);
        assert_eq!(table.get_value(0, 1).unwrap(), &text("one"));
        assert_eq!(table.get_value(2, 0).unwrap(), &DataValue::Integer(3));
    }

    #[test]
    fn test_read_text_with_match_regex_filters_lines() {
        let tmp = write_tmp("INFO boot\nERROR disk full\nINFO shutdown\nERROR oom\n");
        let table = ReadText
            .generate(vec![path_arg(&tmp), text("ERROR")])
            .unwrap();

        assert_eq!(table.row_count(), 2);
        // The surviving rows keep their positions in the file (2 and 4)
        // rather than being renumbered 1 and 2.
        assert_eq!(table.get_value(0, 0).unwrap(), &DataValue::Integer(2));
        assert_eq!(table.get_value(1, 0).unwrap(), &DataValue::Integer(4));
    }

    #[test]
    fn test_read_text_requires_path() {
        let result = ReadText.generate(Vec::new());
        assert!(result.is_err());
    }

    #[test]
    fn test_read_text_invalid_regex_errors_early() {
        let tmp = write_tmp("hello\n");
        let err = ReadText
            .generate(vec![path_arg(&tmp), text("(unclosed")])
            .unwrap_err();

        assert!(err.to_string().contains("match_regex"));
    }

    #[test]
    fn test_grep_matches_like_grep() {
        let tmp = write_tmp("apple\nbanana\ncherry\napricot\n");
        let table = Grep.generate(vec![path_arg(&tmp), text("^ap")]).unwrap();

        assert_eq!(table.row_count(), 2);
        assert_eq!(table.get_value(0, 1).unwrap(), &text("apple"));
        assert_eq!(table.get_value(1, 1).unwrap(), &text("apricot"));
    }

    #[test]
    fn test_grep_invert_like_grep_v() {
        let tmp = write_tmp("apple\nbanana\ncherry\napricot\n");
        let table = Grep
            .generate(vec![path_arg(&tmp), text("^ap"), DataValue::Boolean(true)])
            .unwrap();

        assert_eq!(table.row_count(), 2);
        assert_eq!(table.get_value(0, 1).unwrap(), &text("banana"));
    }
}