#[cfg(not(test))]
use alloc::{
format,
string::{String, ToString},
vec,
vec::Vec,
};
#[cfg(test)]
use std::vec;
use crate::bits::BitVec;
use crate::json::light::{JsonCursor, JsonIndex, StandardJson};
use crate::RankSelect;
#[derive(Debug)]
pub struct NewlineIndex {
line_starts: BitVec,
text_len: usize,
}
impl NewlineIndex {
pub fn build(text: &[u8]) -> Self {
if text.is_empty() {
return Self {
line_starts: BitVec::new(),
text_len: 0,
};
}
let mut bits = vec![0u64; text.len().div_ceil(64)];
let mut i = 0;
while i < text.len() {
match text[i] {
b'\n' => {
let next = i + 1;
if next < text.len() {
bits[next / 64] |= 1 << (next % 64);
}
i += 1;
}
b'\r' => {
let next = if i + 1 < text.len() && text[i + 1] == b'\n' {
i + 2
} else {
i + 1
};
if next < text.len() {
bits[next / 64] |= 1 << (next % 64);
}
i = next;
}
_ => i += 1,
}
}
Self {
line_starts: BitVec::from_words(bits, text.len()),
text_len: text.len(),
}
}
pub fn to_offset(&self, line: usize, column: usize) -> Option<usize> {
if line == 0 || column == 0 {
return None;
}
let line_start = if line == 1 {
0
} else {
self.line_starts.select1(line - 2)?
};
let offset = line_start + column - 1;
if offset < self.text_len {
Some(offset)
} else {
None
}
}
pub fn to_line_column(&self, offset: usize) -> (usize, usize) {
if self.text_len == 0 {
return (1, 1);
}
let markers_before_or_at = self.line_starts.rank1(offset + 1);
let line = 1 + markers_before_or_at;
let line_start = if line == 1 {
0
} else {
self.line_starts.select1(line - 2).unwrap_or(0)
};
let column = offset - line_start + 1;
(line, column)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
enum PathComponent {
Index(usize),
DotKey(String),
BracketKey(String),
}
impl PathComponent {
fn to_jq_string(&self) -> String {
match self {
PathComponent::Index(i) => format!("[{}]", i),
PathComponent::DotKey(k) => format!(".{}", k),
PathComponent::BracketKey(k) => format!("[\"{}\"]", escape_jq_string(k)),
}
}
}
fn can_use_dot_notation(key: &str) -> bool {
if key.is_empty() {
return false;
}
let mut chars = key.chars();
let first = chars.next().unwrap();
if !first.is_alphabetic() && first != '_' {
return false;
}
chars.all(|c| c.is_alphanumeric() || c == '_')
}
fn escape_jq_string(s: &str) -> String {
let mut result = String::with_capacity(s.len());
for c in s.chars() {
match c {
'"' => result.push_str("\\\""),
'\\' => result.push_str("\\\\"),
'\n' => result.push_str("\\n"),
'\r' => result.push_str("\\r"),
'\t' => result.push_str("\\t"),
c => result.push(c),
}
}
result
}
fn ib_index_to_bp_pos<W: AsRef<[u64]>>(index: &JsonIndex<W>, ib_idx: usize) -> Option<usize> {
let bp = index.bp();
let bp_len = bp.len();
if bp_len == 0 {
return None;
}
let mut lo = 0;
let mut hi = bp_len;
while lo < hi {
let mid = lo + (hi - lo) / 2;
let count = bp.rank1(mid + 1);
if count <= ib_idx {
lo = mid + 1;
} else {
hi = mid;
}
}
if lo < bp_len && bp.rank1(lo + 1) == ib_idx + 1 {
Some(lo)
} else {
None
}
}
fn find_node_at_offset<W: AsRef<[u64]>>(
index: &JsonIndex<W>,
text: &[u8],
offset: usize,
) -> Option<usize> {
if offset >= text.len() {
return None;
}
let rank = index.ib_rank1(offset);
let ib_idx = if let Some(struct_pos) = index.ib_select1(rank) {
if struct_pos == offset {
rank
} else {
if rank > 0 {
rank - 1
} else {
return None;
}
}
} else if rank > 0 {
rank - 1
} else {
return None;
};
ib_index_to_bp_pos(index, ib_idx)
}
fn count_siblings_before<W: AsRef<[u64]>>(
index: &JsonIndex<W>,
container_bp: usize,
target_bp: usize,
) -> usize {
let mut count = 0;
let mut child_bp = match index.bp().first_child(container_bp) {
Some(c) => c,
None => return 0,
};
while child_bp < target_bp {
count += 1;
child_bp = match index.bp().next_sibling(child_bp) {
Some(c) => c,
None => break,
};
}
count
}
fn is_ancestor<W: AsRef<[u64]>>(
index: &JsonIndex<W>,
ancestor_bp: usize,
descendant_bp: usize,
) -> bool {
if ancestor_bp >= descendant_bp {
return false;
}
match index.bp().find_close(ancestor_bp) {
Some(close) => descendant_bp < close,
None => false,
}
}
fn find_key_for_value<W: AsRef<[u64]>>(
index: &JsonIndex<W>,
object_bp: usize,
target_bp: usize,
) -> Option<(usize, usize)> {
let mut child_bp = index.bp().first_child(object_bp)?;
loop {
let key_bp = child_bp;
let value_bp = index.bp().next_sibling(key_bp)?;
if key_bp == target_bp {
return Some((key_bp, value_bp));
}
if value_bp == target_bp || is_ancestor(index, value_bp, target_bp) {
return Some((key_bp, value_bp));
}
child_bp = index.bp().next_sibling(value_bp)?;
}
}
fn extract_key_string<W: AsRef<[u64]>>(cursor: JsonCursor<'_, W>, _text: &[u8]) -> Option<String> {
match cursor.value() {
StandardJson::String(s) => s.as_str().ok().map(|cow| cow.into_owned()),
_ => None,
}
}
pub fn path_to_bp<W: AsRef<[u64]>>(
index: &JsonIndex<W>,
text: &[u8],
target_bp: usize,
) -> Option<String> {
let mut components: Vec<PathComponent> = Vec::new();
let mut current_bp = target_bp;
while let Some(parent_bp) = index.bp().parent(current_bp) {
let parent_cursor = JsonCursor::from_bp_position(index, text, parent_bp);
let parent_pos = parent_cursor.text_position()?;
match text.get(parent_pos)? {
b'[' => {
let idx = count_siblings_before(index, parent_bp, current_bp);
components.push(PathComponent::Index(idx));
}
b'{' => {
let (key_bp, _value_bp) = find_key_for_value(index, parent_bp, current_bp)?;
let key_cursor = JsonCursor::from_bp_position(index, text, key_bp);
let key = extract_key_string(key_cursor, text)?;
if can_use_dot_notation(&key) {
components.push(PathComponent::DotKey(key));
} else {
components.push(PathComponent::BracketKey(key));
}
}
_ => {
break;
}
}
current_bp = parent_bp;
}
components.reverse();
if components.is_empty() {
Some(".".to_string())
} else {
let mut result = String::new();
for (i, comp) in components.iter().enumerate() {
match comp {
PathComponent::Index(_) | PathComponent::BracketKey(_) => {
if i == 0 {
result.push('.');
}
result.push_str(&comp.to_jq_string());
}
PathComponent::DotKey(_) => {
result.push_str(&comp.to_jq_string());
}
}
}
Some(result)
}
}
pub fn locate_offset<W: AsRef<[u64]>>(
index: &JsonIndex<W>,
text: &[u8],
offset: usize,
) -> Option<String> {
let bp_pos = find_node_at_offset(index, text, offset)?;
path_to_bp(index, text, bp_pos)
}
#[derive(Debug, Clone)]
pub struct LocateResult {
pub expression: String,
pub byte_range: (usize, usize),
pub value_type: &'static str,
}
pub fn locate_offset_detailed<W: AsRef<[u64]>>(
index: &JsonIndex<W>,
text: &[u8],
offset: usize,
) -> Option<LocateResult> {
let bp_pos = find_node_at_offset(index, text, offset)?;
let expression = path_to_bp(index, text, bp_pos)?;
let cursor = JsonCursor::from_bp_position(index, text, bp_pos);
let byte_range = cursor.text_range().unwrap_or_else(|| {
let start = cursor.text_position().unwrap_or(0);
(start, text.len())
});
let value_type = match cursor.value() {
StandardJson::Object(_) => "object",
StandardJson::Array(_) => "array",
StandardJson::String(_) => "string",
StandardJson::Number(_) => "number",
StandardJson::Bool(_) => "boolean",
StandardJson::Null => "null",
StandardJson::Error(_) => "error",
};
Some(LocateResult {
expression,
byte_range,
value_type,
})
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_newline_index_empty() {
let index = NewlineIndex::build(b"");
assert_eq!(index.to_offset(1, 1), None);
assert_eq!(index.to_line_column(0), (1, 1));
}
#[test]
fn test_newline_index_no_newlines() {
let text = b"hello world";
let index = NewlineIndex::build(text);
assert_eq!(index.to_offset(1, 1), Some(0));
assert_eq!(index.to_offset(1, 6), Some(5)); assert_eq!(index.to_offset(1, 12), None); assert_eq!(index.to_offset(2, 1), None);
assert_eq!(index.to_line_column(0), (1, 1));
assert_eq!(index.to_line_column(5), (1, 6));
}
#[test]
fn test_newline_index_unix_lf() {
let text = b"line1\nline2\nline3";
let index = NewlineIndex::build(text);
assert_eq!(index.to_offset(1, 1), Some(0));
assert_eq!(index.to_offset(1, 5), Some(4));
assert_eq!(index.to_offset(2, 1), Some(6));
assert_eq!(index.to_offset(2, 5), Some(10));
assert_eq!(index.to_offset(3, 1), Some(12));
assert_eq!(index.to_offset(3, 5), Some(16));
assert_eq!(index.to_line_column(0), (1, 1));
assert_eq!(index.to_line_column(5), (1, 6)); assert_eq!(index.to_line_column(6), (2, 1));
assert_eq!(index.to_line_column(12), (3, 1));
}
#[test]
fn test_newline_index_windows_crlf() {
let text = b"line1\r\nline2\r\nline3";
let index = NewlineIndex::build(text);
assert_eq!(index.to_offset(1, 1), Some(0));
assert_eq!(index.to_offset(1, 5), Some(4));
assert_eq!(index.to_offset(2, 1), Some(7));
assert_eq!(index.to_offset(2, 5), Some(11));
assert_eq!(index.to_offset(3, 1), Some(14));
assert_eq!(index.to_line_column(0), (1, 1));
assert_eq!(index.to_line_column(7), (2, 1));
assert_eq!(index.to_line_column(14), (3, 1));
}
#[test]
fn test_newline_index_classic_mac_cr() {
let text = b"line1\rline2\rline3";
let index = NewlineIndex::build(text);
assert_eq!(index.to_offset(1, 1), Some(0));
assert_eq!(index.to_offset(2, 1), Some(6));
assert_eq!(index.to_offset(3, 1), Some(12));
assert_eq!(index.to_line_column(6), (2, 1));
}
#[test]
fn test_newline_index_invalid_inputs() {
let index = NewlineIndex::build(b"hello\nworld");
assert_eq!(index.to_offset(0, 1), None); assert_eq!(index.to_offset(1, 0), None); }
#[test]
fn test_can_use_dot_notation() {
assert!(can_use_dot_notation("foo"));
assert!(can_use_dot_notation("_bar"));
assert!(can_use_dot_notation("foo123"));
assert!(can_use_dot_notation("foo_bar"));
assert!(!can_use_dot_notation("")); assert!(!can_use_dot_notation("123")); assert!(!can_use_dot_notation("foo-bar")); assert!(!can_use_dot_notation("foo.bar")); assert!(!can_use_dot_notation("foo bar")); }
#[test]
fn test_escape_jq_string() {
assert_eq!(escape_jq_string("hello"), "hello");
assert_eq!(escape_jq_string("hello\"world"), "hello\\\"world");
assert_eq!(escape_jq_string("back\\slash"), "back\\\\slash");
assert_eq!(escape_jq_string("new\nline"), "new\\nline");
}
#[test]
fn test_locate_simple_object() {
let json = br#"{"name": "Alice"}"#;
let index = JsonIndex::build(json);
assert_eq!(locate_offset(&index, json, 0), Some(".".to_string()));
assert_eq!(locate_offset(&index, json, 1), Some(".name".to_string()));
assert_eq!(locate_offset(&index, json, 10), Some(".name".to_string()));
}
#[test]
fn test_locate_nested_array() {
let json = br#"{"users": [{"name": "Bob"}]}"#;
let index = JsonIndex::build(json);
assert_eq!(locate_offset(&index, json, 10), Some(".users".to_string()));
assert_eq!(
locate_offset(&index, json, 11),
Some(".users[0]".to_string())
);
assert_eq!(
locate_offset(&index, json, 12),
Some(".users[0].name".to_string())
);
}
#[test]
fn test_locate_array_indices() {
let json = br#"[1, 2, 3]"#;
let index = JsonIndex::build(json);
assert_eq!(locate_offset(&index, json, 0), Some(".".to_string()));
assert_eq!(locate_offset(&index, json, 1), Some(".[0]".to_string()));
assert_eq!(locate_offset(&index, json, 4), Some(".[1]".to_string()));
assert_eq!(locate_offset(&index, json, 7), Some(".[2]".to_string()));
}
#[test]
fn test_locate_special_key() {
let json = br#"{"foo-bar": 1, "with space": 2}"#;
let index = JsonIndex::build(json);
assert_eq!(
locate_offset(&index, json, 1),
Some(r#".["foo-bar"]"#.to_string())
);
assert_eq!(
locate_offset(&index, json, 15),
Some(r#".["with space"]"#.to_string())
);
}
#[test]
fn test_locate_deeply_nested() {
let json = br#"{"a": {"b": {"c": 42}}}"#;
let index = JsonIndex::build(json);
assert_eq!(locate_offset(&index, json, 18), Some(".a.b.c".to_string()));
}
#[test]
fn test_locate_large_array_deep_in_tree() {
let json = br#"{"outer":{"inner":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49]}}"#;
let index = JsonIndex::build(json);
let elem0_pos = json.windows(1).position(|w| w == b"0").unwrap();
assert_eq!(
locate_offset(&index, json, elem0_pos),
Some(".outer.inner[0]".to_string())
);
let elem49_start = json
.windows(4)
.position(|w| w == b",49]")
.map(|p| p + 1)
.unwrap();
assert_eq!(
locate_offset(&index, json, elem49_start),
Some(".outer.inner[49]".to_string())
);
}
#[test]
fn test_locate_offset_at_word_boundary() {
let mut json = String::from(r#"{"data":["#);
for i in 0..100 {
if i > 0 {
json.push(',');
}
json.push_str(&format!("\"item{}\"", i));
}
json.push_str("]}");
let json_bytes = json.as_bytes();
let index = JsonIndex::build(json_bytes);
let item5_pos = json.find("\"item5\"").unwrap();
assert_eq!(
locate_offset(&index, json_bytes, item5_pos),
Some(".data[5]".to_string())
);
let item50_pos = json.find("\"item50\"").unwrap();
assert_eq!(
locate_offset(&index, json_bytes, item50_pos),
Some(".data[50]".to_string())
);
let item99_pos = json.find("\"item99\"").unwrap();
assert_eq!(
locate_offset(&index, json_bytes, item99_pos),
Some(".data[99]".to_string())
);
}
#[test]
fn test_locate_enclose_word_boundary_regression() {
let json = include_str!("../../tests/testdata/locate_regression.json");
let json_bytes = json.as_bytes();
let index = JsonIndex::build(json_bytes);
let result = locate_offset(&index, json_bytes, 956);
assert!(
result.is_some(),
"Failed to locate offset 956 - this indicates the enclose() word boundary bug"
);
let path = result.unwrap();
assert!(
path.starts_with(".strings["),
"Expected path starting with '.strings[', got: {path}"
);
for offset in [1000, 1500, 2000, 2500] {
if offset < json.len() {
let result = locate_offset(&index, json_bytes, offset);
assert!(
result.is_some(),
"Failed to locate offset {offset} - may indicate enclose() word boundary bug"
);
}
}
}
}