Skip to main content

rgx/filter/
json_path.rs

1//! Minimal dotted/indexed path language for JSONL field extraction.
2//!
3//! Grammar:
4//!
5//! ```text
6//! path    := segment+
7//! segment := ('.' ident) | ('[' digits ']') | ('[' quoted ']')
8//! ident   := [A-Za-z_][A-Za-z0-9_]*
9//! quoted  := '"' ( [^"\\] | '\\' ( '"' | '\\' ) )* '"'
10//! ```
11//!
12//! Dotted identifiers cover the common JSONL shape (`.msg`, `.steps[0].text`).
//! For keys that can't be expressed as a bare identifier — hyphens, dots,
//! spaces, unicode — use the bracketed-string form, written without a leading
//! dot: `["user-id"]`, `.a["weird key"]`, or `["日本語"]`. Only `\"` and `\\`
//! are recognized as escapes inside the quoted form; any other escape is a
//! parse error. Wildcards and filters remain out of scope for `--json`.
18
19use serde_json::Value;
20
/// One step of a parsed path expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Segment {
    /// Object member lookup by key, produced by `.ident` or `["quoted"]`.
    Key(String),
    /// Array element lookup by zero-based position, produced by `[digits]`.
    Index(usize),
}
26
27/// Parse a dotted/indexed path expression into a list of segments.
28///
29/// Returns `Err` with a message pointing at the character offset on failure.
30pub fn parse_path(s: &str) -> Result<Vec<Segment>, String> {
31    if s.is_empty() {
32        return Err("empty path".to_string());
33    }
34
35    let bytes = s.as_bytes();
36    let mut segments = Vec::new();
37    let mut i = 0;
38
39    while i < bytes.len() {
40        match bytes[i] {
41            b'.' => {
42                i += 1;
43                let start = i;
44                if i >= bytes.len() {
45                    return Err(format!("expected identifier at position {i}"));
46                }
47                // First char of identifier must be [A-Za-z_].
48                if !is_ident_start(bytes[i]) {
49                    return Err(format!(
50                        "expected identifier start at position {i}, found {:?}",
51                        peek_char(s, i)
52                    ));
53                }
54                i += 1;
55                while i < bytes.len() && is_ident_continue(bytes[i]) {
56                    i += 1;
57                }
58                // Safe to slice — identifier chars are ASCII.
59                let ident = &s[start..i];
60                segments.push(Segment::Key(ident.to_string()));
61            }
62            b'[' => {
63                i += 1;
64                if i >= bytes.len() {
65                    return Err(format!("expected digits or quoted key at position {i}"));
66                }
67                if bytes[i] == b'"' {
68                    // Quoted string key — supports hyphens, dots, spaces, unicode.
69                    let (key, consumed) = parse_quoted_key(&bytes[i..], i)?;
70                    i += consumed;
71                    if i >= bytes.len() || bytes[i] != b']' {
72                        return Err(format!("expected ']' at position {i}"));
73                    }
74                    i += 1; // consume ']'
75                    segments.push(Segment::Key(key));
76                } else {
77                    let start = i;
78                    while i < bytes.len() && bytes[i].is_ascii_digit() {
79                        i += 1;
80                    }
81                    if start == i {
82                        return Err(format!("expected digits or quoted key at position {start}"));
83                    }
84                    let digits = &s[start..i];
85                    if i >= bytes.len() || bytes[i] != b']' {
86                        return Err(format!("expected ']' at position {i}"));
87                    }
88                    let index: usize = digits
89                        .parse()
90                        .map_err(|e| format!("invalid index {digits:?}: {e}"))?;
91                    i += 1; // consume ']'
92                    segments.push(Segment::Index(index));
93                }
94            }
95            _ => {
96                return Err(format!(
97                    "expected '.' or '[' at position {i}, found {:?}",
98                    peek_char(s, i)
99                ));
100            }
101        }
102    }
103
104    if segments.is_empty() {
105        return Err("empty path".to_string());
106    }
107    Ok(segments)
108}
109
110/// Walk a JSON `Value` along the given `path`. Returns `None` if any segment
111/// misses (wrong type, missing key, out-of-bounds index).
112pub fn extract<'a>(value: &'a Value, path: &[Segment]) -> Option<&'a Value> {
113    let mut cur = value;
114    for seg in path {
115        match seg {
116            Segment::Key(k) => cur = cur.as_object()?.get(k)?,
117            Segment::Index(i) => cur = cur.as_array()?.get(*i)?,
118        }
119    }
120    Some(cur)
121}
122
/// True when `b` may begin a bare identifier: an ASCII letter or `_`.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'_')
}
126
/// True when `b` may appear after the first byte of a bare identifier: an
/// ASCII letter, an ASCII digit, or `_`.
fn is_ident_continue(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
}
130
/// Return the character that starts at `byte_offset` in `s`, for use in error
/// messages.
///
/// Yields `'?'` when there is no character to report: the offset is at or past
/// the end of `s`, or it falls inside a multi-byte character. Decoding a whole
/// `char` (rather than casting a single byte) avoids reporting the first byte
/// of `日` as `æ`.
fn peek_char(s: &str, byte_offset: usize) -> char {
    // `str::get` returns `None` for out-of-range or non-boundary offsets,
    // where plain slicing would panic.
    s.get(byte_offset..)
        .and_then(|rest| rest.chars().next())
        .unwrap_or('?')
}
138
/// Parse a double-quoted key whose opening `"` is `bytes[0]`.
///
/// `offset` is the absolute byte position of that opening quote in the full
/// path expression; it is used only to build error messages.
///
/// On success returns `(key, consumed)`, where `consumed` is the number of
/// bytes read from `bytes`, including both quotes.
///
/// # Errors
///
/// Returns a message when the closing quote is missing, when a backslash is
/// the last byte, or when a backslash is followed by anything other than `"`
/// or `\`. Other escapes such as `\n` are rejected rather than silently passed
/// through.
fn parse_quoted_key(bytes: &[u8], offset: usize) -> Result<(String, usize), String> {
    debug_assert!(!bytes.is_empty() && bytes[0] == b'"');
    // The key can never be longer than the input minus its two quotes.
    let mut out: Vec<u8> = Vec::with_capacity(bytes.len().saturating_sub(2));
    let mut j = 1; // skip opening "
    while j < bytes.len() {
        match bytes[j] {
            b'"' => {
                // Removing ASCII escape bytes from valid UTF-8 leaves valid
                // UTF-8, so this only fails if the caller passed bad bytes.
                let key = String::from_utf8(out)
                    .map_err(|e| format!("invalid utf-8 in quoted key at {offset}: {e}"))?;
                return Ok((key, j + 1));
            }
            b'\\' => {
                match bytes.get(j + 1) {
                    None => {
                        return Err(format!("unterminated escape at position {}", offset + j));
                    }
                    Some(&escaped @ (b'"' | b'\\')) => out.push(escaped),
                    Some(_) => {
                        // Report the whole escaped character, not just its
                        // first byte, so `\日` is not shown as `\æ`.
                        return Err(format!(
                            "unknown escape '\\{}' at position {}",
                            leading_char(&bytes[j + 1..]),
                            offset + j
                        ));
                    }
                }
                j += 2;
            }
            b => {
                out.push(b);
                j += 1;
            }
        }
    }
    Err(format!(
        "unterminated quoted key starting at position {offset}"
    ))
}

/// Decode the first UTF-8 character of `bytes` for display in an error
/// message, or U+FFFD if `bytes` is empty or does not start with valid UTF-8.
fn leading_char(bytes: &[u8]) -> char {
    // A UTF-8 scalar is at most 4 bytes, so this window always holds the whole
    // first character when one exists.
    let window = &bytes[..bytes.len().min(4)];
    let text = match std::str::from_utf8(window) {
        Ok(text) => text,
        // The window may cut a later character in half; the prefix up to
        // `valid_up_to` is still well-formed.
        Err(e) => std::str::from_utf8(&window[..e.valid_up_to()]).unwrap_or(""),
    };
    text.chars().next().unwrap_or(char::REPLACEMENT_CHARACTER)
}