rgx/filter/json_path.rs
1//! Minimal dotted/indexed path language for JSONL field extraction.
2//!
3//! Grammar:
4//!
5//! ```text
6//! path := segment+
7//! segment := ('.' ident) | ('[' digits ']') | ('[' quoted ']')
8//! ident := [A-Za-z_][A-Za-z0-9_]*
9//! quoted := '"' ( [^"\\] | '\\' ( '"' | '\\' ) )* '"'
10//! ```
11//!
//! Dotted identifiers cover the common JSONL shape (`.msg`, `.steps[0].text`).
//! For keys that can't be expressed as a bare identifier — hyphens, dots,
//! spaces, unicode — use the bracketed-string form, which (like an index)
//! takes no leading dot: `["user-id"]`, `.meta["weird key"]`, or `["日本語"]`.
//! Only `\"` and `\\` are recognized as escapes inside the quoted form; any
//! other escape is rejected. Wildcards and filters remain out of scope for
//! `--json`.
18
19use serde_json::Value;
20
/// One step of a parsed path expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Segment {
    /// Look up a member of a JSON object by name (from `.ident` or `["..."]`).
    Key(String),
    /// Look up an element of a JSON array by zero-based position (from `[N]`).
    Index(usize),
}
26
27/// Parse a dotted/indexed path expression into a list of segments.
28///
29/// Returns `Err` with a message pointing at the character offset on failure.
30pub fn parse_path(s: &str) -> Result<Vec<Segment>, String> {
31 if s.is_empty() {
32 return Err("empty path".to_string());
33 }
34
35 let bytes = s.as_bytes();
36 let mut segments = Vec::new();
37 let mut i = 0;
38
39 while i < bytes.len() {
40 match bytes[i] {
41 b'.' => {
42 i += 1;
43 let start = i;
44 if i >= bytes.len() {
45 return Err(format!("expected identifier at position {i}"));
46 }
47 // First char of identifier must be [A-Za-z_].
48 if !is_ident_start(bytes[i]) {
49 return Err(format!(
50 "expected identifier start at position {i}, found {:?}",
51 peek_char(s, i)
52 ));
53 }
54 i += 1;
55 while i < bytes.len() && is_ident_continue(bytes[i]) {
56 i += 1;
57 }
58 // Safe to slice — identifier chars are ASCII.
59 let ident = &s[start..i];
60 segments.push(Segment::Key(ident.to_string()));
61 }
62 b'[' => {
63 i += 1;
64 if i >= bytes.len() {
65 return Err(format!("expected digits or quoted key at position {i}"));
66 }
67 if bytes[i] == b'"' {
68 // Quoted string key — supports hyphens, dots, spaces, unicode.
69 let (key, consumed) = parse_quoted_key(&bytes[i..], i)?;
70 i += consumed;
71 if i >= bytes.len() || bytes[i] != b']' {
72 return Err(format!("expected ']' at position {i}"));
73 }
74 i += 1; // consume ']'
75 segments.push(Segment::Key(key));
76 } else {
77 let start = i;
78 while i < bytes.len() && bytes[i].is_ascii_digit() {
79 i += 1;
80 }
81 if start == i {
82 return Err(format!("expected digits or quoted key at position {start}"));
83 }
84 let digits = &s[start..i];
85 if i >= bytes.len() || bytes[i] != b']' {
86 return Err(format!("expected ']' at position {i}"));
87 }
88 let index: usize = digits
89 .parse()
90 .map_err(|e| format!("invalid index {digits:?}: {e}"))?;
91 i += 1; // consume ']'
92 segments.push(Segment::Index(index));
93 }
94 }
95 _ => {
96 return Err(format!(
97 "expected '.' or '[' at position {i}, found {:?}",
98 peek_char(s, i)
99 ));
100 }
101 }
102 }
103
104 if segments.is_empty() {
105 return Err("empty path".to_string());
106 }
107 Ok(segments)
108}
109
110/// Walk a JSON `Value` along the given `path`. Returns `None` if any segment
111/// misses (wrong type, missing key, out-of-bounds index).
112pub fn extract<'a>(value: &'a Value, path: &[Segment]) -> Option<&'a Value> {
113 let mut cur = value;
114 for seg in path {
115 match seg {
116 Segment::Key(k) => cur = cur.as_object()?.get(k)?,
117 Segment::Index(i) => cur = cur.as_array()?.get(*i)?,
118 }
119 }
120 Some(cur)
121}
122
/// True when `b` may open a bare identifier: an ASCII letter or `_`.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
126
/// True when `b` may follow the first byte of a bare identifier: an ASCII
/// letter, an ASCII digit, or `_`.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
130
/// Return the character that begins at `byte_offset` in `s`.
///
/// Falls back to `'?'` when there is no character to report: the offset is at
/// or past the end of `s`, or it lands inside a multi-byte sequence. The
/// checked `str::get` makes this total — plain `s[byte_offset..]` would panic
/// in the latter two cases, turning an error-reporting helper into a crash.
///
/// Decoding through `chars()` rather than casting `bytes[i] as char` avoids
/// the "one byte looks like æ when it's really the first byte of 日"
/// misreporting.
fn peek_char(s: &str, byte_offset: usize) -> char {
    s.get(byte_offset..)
        .and_then(|rest| rest.chars().next())
        .unwrap_or('?')
}
138
/// Parse a double-quoted key whose opening `"` is `bytes[0]`.
///
/// `offset` is the absolute position of that opening quote in the full path
/// source; it is used only to build error messages.
///
/// Returns `(key, consumed_bytes)`, where `consumed_bytes` counts both quotes,
/// so `bytes[consumed_bytes]` is the first byte after the closing `"`.
///
/// # Errors
///
/// * an escape other than `\"` or `\\` (keeps us honest — we don't silently
///   pass through `\n` etc.); the message shows the offending character in
///   full, even when it is multi-byte,
/// * a backslash with nothing after it,
/// * input that ends before the closing quote,
/// * key bytes that are not valid UTF-8 (not expected when `bytes` was taken
///   from a `&str`).
fn parse_quoted_key(bytes: &[u8], offset: usize) -> Result<(String, usize), String> {
    debug_assert!(!bytes.is_empty() && bytes[0] == b'"');
    let mut out: Vec<u8> = Vec::new();
    let mut j = 1; // skip opening "
    while j < bytes.len() {
        match bytes[j] {
            b'"' => {
                // Only whole bytes were copied (plus ASCII escapes), so a
                // UTF-8 source yields a UTF-8 key.
                let key = String::from_utf8(out)
                    .map_err(|e| format!("invalid utf-8 in quoted key at {offset}: {e}"))?;
                return Ok((key, j + 1));
            }
            b'\\' => {
                match bytes.get(j + 1).copied() {
                    None => {
                        return Err(format!("unterminated escape at position {}", offset + j));
                    }
                    Some(b'"') => out.push(b'"'),
                    Some(b'\\') => out.push(b'\\'),
                    Some(other) => {
                        // Decode the whole character after the backslash. A
                        // bare `other as char` would show the lead byte of a
                        // multi-byte character as an unrelated Latin-1 letter
                        // (e.g. `æ` for `日`). A UTF-8 character is at most
                        // four bytes, so try each prefix length in turn.
                        let rest = &bytes[j + 1..];
                        let shown = (1..=rest.len().min(4))
                            .find_map(|n| std::str::from_utf8(&rest[..n]).ok())
                            .and_then(|prefix| prefix.chars().next())
                            .unwrap_or(other as char);
                        return Err(format!(
                            "unknown escape '\\{}' at position {}",
                            shown,
                            offset + j
                        ));
                    }
                }
                j += 2;
            }
            b => {
                out.push(b);
                j += 1;
            }
        }
    }
    Err(format!(
        "unterminated quoted key starting at position {offset}"
    ))
}
182}