use super::types::{
FLAG_HAS_NULLS, FLAG_HAS_ROOT_KEY, FieldDef, FieldType, IntermediateRepresentation,
SchemaError, SchemaHeader, SchemaValue,
};
// --- Wire-format marker characters ---
pub const ROW_START: char = '◉'; // begins every serialized data row
pub const FIELD_SEP: char = '┃'; // separates fields in rows and in the schema line
pub const ARRAY_SEP: char = '◈'; // joins elements inside an array cell
pub const NULL_VALUE: &str = "∅"; // serialized null (also "empty array" for array fields)
pub const SPACE_MARKER: char = '▓'; // escapes literal spaces; also joins lines when minified
pub const NEST_SEP: char = '჻'; // separates path segments in path mode
// Long-form type names accepted in `name:type` field definitions.
pub const TYPE_INT: &str = "int";
pub const TYPE_STR: &str = "str";
pub const TYPE_FLOAT: &str = "float";
pub const TYPE_BOOL: &str = "bool";
// Compact superscript type markers appended to field names.
pub const TYPE_INT_SUPER: char = 'ⁱ';
pub const TYPE_STR_SUPER: char = 'ˢ';
pub const TYPE_FLOAT_SUPER: char = 'ᶠ';
pub const TYPE_BOOL_SUPER: char = 'ᵇ';
pub const TOKEN_MAP_PREFIX: char = '@'; // prefixes dictionary lines (same char starts the schema line)
// Path-mode leaf markers.
pub const TRUE_MARKER: &str = "⊤"; // boolean true
pub const FALSE_MARKER: &str = "⊥"; // boolean false
pub const EMPTY_ARRAY_MARKER: &str = "⟦⟧"; // empty JSON array
pub const EMPTY_OBJECT_MARKER: &str = "⟨⟩"; // empty JSON object
pub const OBJECT_KEY_PREFIX: char = '#'; // marks object keys that look like array indices
pub const STRING_PREFIX: char = '"'; // shields string values that look like numbers
/// Runic alphabet used as single-character tokens for field names.
pub mod tokens {
    /// The ordered token alphabet (89 runic characters).
    pub const RUNIC: &[char] = &[
        'ᚠ', 'ᚡ', 'ᚢ', 'ᚣ', 'ᚤ', 'ᚥ', 'ᚦ', 'ᚧ', 'ᚨ', 'ᚩ', 'ᚪ', 'ᚫ', 'ᚬ', 'ᚭ', 'ᚮ', 'ᚯ', 'ᚰ', 'ᚱ',
        'ᚲ', 'ᚳ', 'ᚴ', 'ᚵ', 'ᚶ', 'ᚷ', 'ᚸ', 'ᚹ', 'ᚺ', 'ᚻ', 'ᚼ', 'ᚽ', 'ᚾ', 'ᚿ', 'ᛀ', 'ᛁ', 'ᛂ', 'ᛃ',
        'ᛄ', 'ᛅ', 'ᛆ', 'ᛇ', 'ᛈ', 'ᛉ', 'ᛊ', 'ᛋ', 'ᛌ', 'ᛍ', 'ᛎ', 'ᛏ', 'ᛐ', 'ᛑ', 'ᛒ', 'ᛓ', 'ᛔ', 'ᛕ',
        'ᛖ', 'ᛗ', 'ᛘ', 'ᛙ', 'ᛚ', 'ᛛ', 'ᛜ', 'ᛝ', 'ᛞ', 'ᛟ', 'ᛠ', 'ᛡ', 'ᛢ', 'ᛣ', 'ᛤ', 'ᛥ', 'ᛦ', 'ᛧ',
        'ᛨ', 'ᛩ', 'ᛪ', '᛫', '᛬', '᛭', 'ᛮ', 'ᛯ', 'ᛰ', 'ᛱ', 'ᛲ', 'ᛳ', 'ᛴ', 'ᛵ', 'ᛶ', 'ᛷ', 'ᛸ',
    ];

    /// Returns the token for `index`, or `None` past the end of the alphabet.
    pub fn get_token(index: usize) -> Option<char> {
        RUNIC.get(index).copied()
    }

    /// Whether `c` is one of the runic field tokens.
    pub fn is_token(c: char) -> bool {
        RUNIC.iter().any(|&t| t == c)
    }

    /// Inverse of `get_token`: the position of `c` in the alphabet, if any.
    #[allow(dead_code)]
    pub fn token_index(c: char) -> Option<usize> {
        RUNIC.iter().position(|&t| t == c)
    }
}
/// Egyptian-hieroglyph code-point range used as single-character tokens for
/// repeated values.
pub mod value_tokens {
    /// First code point of the hieroglyph token range.
    pub const HIEROGLYPH_START: char = '\u{13000}';
    /// Last code point of the range (inclusive).
    pub const HIEROGLYPH_END: char = '\u{1342F}';

    /// Returns the `index`-th token character, or `None` when the index
    /// falls outside the hieroglyph range.
    ///
    /// Fix: the original computed `HIEROGLYPH_START as u32 + index as u32`,
    /// which truncates `index` on 64-bit targets and can overflow the `u32`
    /// addition (panic in debug builds, wraparound — and a wrong token — in
    /// release). Checked conversion/addition make any out-of-range index
    /// return `None` instead.
    pub fn get_token(index: usize) -> Option<char> {
        let offset = u32::try_from(index).ok()?;
        let code_point = (HIEROGLYPH_START as u32).checked_add(offset)?;
        if code_point <= HIEROGLYPH_END as u32 {
            char::from_u32(code_point)
        } else {
            None
        }
    }

    /// Whether `c` lies inside the hieroglyph token range.
    pub fn is_token(c: char) -> bool {
        (HIEROGLYPH_START..=HIEROGLYPH_END).contains(&c)
    }

    /// Inverse of `get_token`: the index of `c` within the range, if any.
    #[allow(dead_code)]
    pub fn token_index(c: char) -> Option<usize> {
        if is_token(c) {
            Some((c as u32 - HIEROGLYPH_START as u32) as usize)
        } else {
            None
        }
    }
}
/// Serializes `ir` in the most compact symbolic form: runic field-name
/// tokens plus a hieroglyph dictionary for repeated string values.
pub fn serialize(ir: &IntermediateRepresentation, minify: bool) -> Result<String, SchemaError> {
    let (tokenize, tokenize_values) = (true, true);
    serialize_full_options(ir, minify, tokenize, tokenize_values)
}
/// Serializes `ir` without any tokenization, keeping field names and
/// values literal (the most human-readable form).
pub fn serialize_readable(
    ir: &IntermediateRepresentation,
    minify: bool,
) -> Result<String, SchemaError> {
    let (tokenize, tokenize_values) = (false, false);
    serialize_full_options(ir, minify, tokenize, tokenize_values)
}
/// Serializes `ir` with runic field-name tokens but no value dictionary —
/// a middle ground between `serialize` and `serialize_readable`.
pub fn serialize_light(
    ir: &IntermediateRepresentation,
    minify: bool,
) -> Result<String, SchemaError> {
    let (tokenize, tokenize_values) = (true, false);
    serialize_full_options(ir, minify, tokenize, tokenize_values)
}
#[allow(dead_code)]
pub fn serialize_minified(ir: &IntermediateRepresentation) -> Result<String, SchemaError> {
serialize_full_options(ir, true, true, true)
}
/// Core serializer behind all the `serialize*` entry points.
///
/// Output layout (lines joined by `▓` when `minify`, else `\n`):
///   1. field-token dictionary line (`@tok=name,...`)
///   2. optional value dictionary line (`@tok=value,...`)
///   3. schema line: `@root[metadata]` then `┃token+type` per field
///   4. one `◉`-prefixed row per record
fn serialize_full_options(
    ir: &IntermediateRepresentation,
    minify: bool,
    tokenize: bool,
    tokenize_values: bool,
) -> Result<String, SchemaError> {
    if !tokenize {
        return serialize_with_options(ir, minify);
    }
    let mut output = String::new();
    let line_sep = if minify { SPACE_MARKER } else { '\n' };
    // Assign one runic token per field; if there are more fields than runic
    // tokens, fall back to the plain (untokenized) serializer.
    let mut token_map: Vec<(char, &str)> = Vec::new();
    for (idx, field) in ir.header.fields.iter().enumerate() {
        if let Some(token) = tokens::get_token(idx) {
            token_map.push((token, &field.name));
        } else {
            return serialize_with_options(ir, minify);
        }
    }
    let value_dict = if tokenize_values {
        build_value_dictionary(ir)
    } else {
        std::collections::HashMap::new()
    };
    // Field-token dictionary line: @tok=name,tok=name,...
    output.push(TOKEN_MAP_PREFIX);
    for (idx, (token, name)) in token_map.iter().enumerate() {
        if idx > 0 {
            output.push(',');
        }
        output.push(*token);
        output.push('=');
        output.push_str(name);
    }
    output.push(line_sep);
    // Value dictionary line, sorted by token for deterministic output.
    if !value_dict.is_empty() {
        output.push(TOKEN_MAP_PREFIX);
        let mut sorted_values: Vec<_> = value_dict.iter().collect();
        sorted_values.sort_by_key(|(_, token)| **token);
        for (idx, (value, token)) in sorted_values.iter().enumerate() {
            if idx > 0 {
                output.push(',');
            }
            output.push(**token);
            output.push('=');
            output.push_str(value);
        }
        output.push(line_sep);
    }
    // Schema line: root key, optional [metadata], then field definitions.
    output.push('@');
    if let Some(ref root_key) = ir.header.root_key {
        output.push_str(root_key);
    }
    if let Some(ref metadata) = ir.header.metadata {
        output.push('[');
        // Sorted keys keep output deterministic; spaces escaped with ▓.
        let mut sorted_keys: Vec<&String> = metadata.keys().collect();
        sorted_keys.sort();
        for (idx, key) in sorted_keys.iter().enumerate() {
            if idx > 0 {
                output.push(',');
            }
            output.push_str(key);
            output.push('=');
            let value = metadata[*key].replace(' ', &SPACE_MARKER.to_string());
            output.push_str(&value);
        }
        output.push(']');
    }
    for (idx, field) in ir.header.fields.iter().enumerate() {
        output.push(FIELD_SEP);
        output.push(token_map[idx].0);
        // Names already ending in ⟦⟧ carry their own array marker, so no
        // type suffix is appended for them.
        if !field.name.ends_with("⟦⟧") {
            output.push_str(&field_type_to_str(&field.field_type));
        }
    }
    output.push(line_sep);
    // Data rows: `ir.values` is laid out row-major.
    let field_count = ir.header.fields.len();
    for row in 0..ir.header.row_count {
        output.push(ROW_START);
        for (field_idx, field) in ir.header.fields.iter().enumerate() {
            if field_idx > 0 {
                output.push(FIELD_SEP);
            }
            if ir.is_null(row, field_idx) {
                output.push_str(NULL_VALUE);
            } else {
                let value_idx = row * field_count + field_idx;
                let value = &ir.values[value_idx];
                let value_str = value_to_str(value, &field.field_type);
                // Substitute the dictionary token when this exact rendered
                // string was assigned one.
                if let Some(&token) = value_dict.get(&value_str) {
                    output.push(token);
                } else {
                    output.push_str(&value_str);
                }
            }
        }
        if row < ir.header.row_count - 1 {
            output.push(line_sep);
        }
    }
    Ok(output)
}
/// Scans all non-null string values and assigns a hieroglyph token to each
/// distinct (space-escaped) string that occurs at least twice.
///
/// Candidates are ranked by descending frequency, ties broken by the string
/// itself for determinism; tokens are handed out until the hieroglyph range
/// runs out.
fn build_value_dictionary(
    ir: &IntermediateRepresentation,
) -> std::collections::HashMap<String, char> {
    use std::collections::HashMap;
    let mut value_counts: HashMap<String, usize> = HashMap::new();
    let field_count = ir.header.fields.len();
    for row in 0..ir.header.row_count {
        for field_idx in 0..ir.header.fields.len() {
            if ir.is_null(row, field_idx) {
                continue;
            }
            let value_idx = row * field_count + field_idx;
            let value = &ir.values[value_idx];
            // Only string values are dictionary candidates; counting the
            // escaped form matches what the serializer emits.
            if let SchemaValue::String(s) = value {
                let value_str = s.replace(' ', &SPACE_MARKER.to_string());
                *value_counts.entry(value_str).or_insert(0) += 1;
            }
        }
    }
    let mut dict: HashMap<String, char> = HashMap::new();
    let mut sorted_values: Vec<_> = value_counts
        .iter()
        .filter(|(_, count)| **count >= 2)
        .collect();
    sorted_values.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
    for (idx, (value, _)) in sorted_values.iter().enumerate() {
        if let Some(token) = value_tokens::get_token(idx) {
            dict.insert((*value).clone(), token);
        } else {
            // Token alphabet exhausted; remaining values stay literal.
            break;
        }
    }
    dict
}
/// Plain serializer: same layout as the tokenized form but with literal
/// field names and no dictionary lines. Also the fallback when the runic
/// token alphabet cannot cover every field.
fn serialize_with_options(
    ir: &IntermediateRepresentation,
    minify: bool,
) -> Result<String, SchemaError> {
    let mut output = String::new();
    let line_sep = if minify { SPACE_MARKER } else { '\n' };
    // Schema line: @root[metadata] followed by ┃name+type per field.
    output.push('@');
    if let Some(ref root_key) = ir.header.root_key {
        output.push_str(root_key);
    }
    if let Some(ref metadata) = ir.header.metadata {
        output.push('[');
        // Sorted keys keep output deterministic; spaces escaped with ▓.
        let mut sorted_keys: Vec<&String> = metadata.keys().collect();
        sorted_keys.sort();
        for (idx, key) in sorted_keys.iter().enumerate() {
            if idx > 0 {
                output.push(',');
            }
            output.push_str(key);
            output.push('=');
            let value = metadata[*key].replace(' ', &SPACE_MARKER.to_string());
            output.push_str(&value);
        }
        output.push(']');
    }
    for field in &ir.header.fields {
        output.push(FIELD_SEP);
        output.push_str(&field.name);
        // Names already ending in ⟦⟧ carry their own array marker; skip the
        // type suffix in that case.
        if !field.name.ends_with("⟦⟧") {
            output.push_str(&field_type_to_str(&field.field_type));
        }
    }
    output.push(line_sep);
    // Data rows, row-major.
    let field_count = ir.header.fields.len();
    for row in 0..ir.header.row_count {
        output.push(ROW_START);
        for (field_idx, field) in ir.header.fields.iter().enumerate() {
            if field_idx > 0 {
                output.push(FIELD_SEP);
            }
            if ir.is_null(row, field_idx) {
                output.push_str(NULL_VALUE);
            } else {
                let value_idx = row * field_count + field_idx;
                let value = &ir.values[value_idx];
                output.push_str(&value_to_str(value, &field.field_type));
            }
        }
        if row < ir.header.row_count - 1 {
            output.push(line_sep);
        }
    }
    Ok(output)
}
/// Parses serialized stele text back into an `IntermediateRepresentation`.
///
/// Accepts both minified (`▓`-joined) and multi-line input. Expected
/// layout: optional `@`-prefixed dictionary lines (runic first char =
/// field-name dictionary, hieroglyph = value dictionary), then the `@`
/// schema line, then `◉`-prefixed data rows.
pub fn parse(input: &str) -> Result<IntermediateRepresentation, SchemaError> {
    let input = input.trim();
    if input.is_empty() {
        return Err(SchemaError::InvalidInput("Empty stele input".to_string()));
    }
    // Everything before the first ◉ is schema/dictionaries; the rest is data.
    let row_marker = ROW_START.to_string();
    let first_row_pos = input.find(&row_marker);
    let (schema_part, data_part) = if let Some(pos) = first_row_pos {
        (&input[..pos], &input[pos..])
    } else {
        return Err(SchemaError::InvalidInput(
            "No data rows found (missing ◉ row marker)".to_string(),
        ));
    };
    let mut token_map: std::collections::HashMap<char, String> = std::collections::HashMap::new();
    let mut value_dict: std::collections::HashMap<char, String> = std::collections::HashMap::new();
    let schema_part = schema_part.trim();
    // Consume leading dictionary lines; whatever remains is the schema line
    // (possibly re-joined if it was split across lines).
    let effective_schema = {
        let lines: Vec<&str> = schema_part
            .split(['\n', SPACE_MARKER])
            .filter(|s| !s.is_empty())
            .collect();
        let mut schema_line_idx = 0;
        for (idx, line) in lines.iter().enumerate() {
            if !line.starts_with('@') || line.len() <= 1 {
                schema_line_idx = idx;
                break;
            }
            let after_at = &line[1..];
            let first_char = after_at.chars().next();
            if let Some(fc) = first_char {
                // Runic first char => field-name dictionary line.
                if tokens::is_token(fc) && after_at.contains('=') {
                    let map_content = after_at;
                    for pair in map_content.split(',') {
                        let parts: Vec<&str> = pair.splitn(2, '=').collect();
                        if parts.len() == 2 {
                            let token = parts[0].chars().next();
                            let name = parts[1].to_string();
                            if let Some(t) = token {
                                token_map.insert(t, name);
                            }
                        }
                    }
                    schema_line_idx = idx + 1;
                }
                // Hieroglyph first char => value dictionary line.
                else if value_tokens::is_token(fc) && after_at.contains('=') {
                    let map_content = after_at;
                    for pair in map_content.split(',') {
                        let parts: Vec<&str> = pair.splitn(2, '=').collect();
                        if parts.len() == 2 {
                            let token = parts[0].chars().next();
                            let value = parts[1].to_string();
                            if let Some(t) = token {
                                value_dict.insert(t, value);
                            }
                        }
                    }
                    schema_line_idx = idx + 1;
                }
                // Anything else is the schema line itself.
                else {
                    schema_line_idx = idx;
                    break;
                }
            } else {
                schema_line_idx = idx;
                break;
            }
        }
        if schema_line_idx < lines.len() {
            lines[schema_line_idx..].join(&SPACE_MARKER.to_string())
        } else {
            return Err(SchemaError::InvalidInput(
                "No schema line found after dictionaries".to_string(),
            ));
        }
    };
    let schema_line = effective_schema.trim().trim_end_matches(SPACE_MARKER);
    if !schema_line.starts_with('@') {
        return Err(SchemaError::InvalidInput(
            "Schema line must start with @".to_string(),
        ));
    }
    let schema_content = &schema_line[1..];
    // Split the schema into `root[metadata]` and the ┃-separated field defs.
    let (root_and_metadata, field_defs) = if let Some(sep_pos) = schema_content.find(FIELD_SEP) {
        (&schema_content[..sep_pos], &schema_content[sep_pos..])
    } else {
        return Err(SchemaError::InvalidInput(
            "Schema line must contain at least one field definition".to_string(),
        ));
    };
    let (root_key, metadata) = if let Some(bracket_start) = root_and_metadata.find('[') {
        let root = &root_and_metadata[..bracket_start];
        let root_key = if root.is_empty() {
            None
        } else {
            Some(root.to_string())
        };
        let meta_content = &root_and_metadata[bracket_start + 1..];
        // Locate the matching closing bracket, tolerating nested [..]
        // inside metadata values.
        let mut depth = 0;
        let mut bracket_end = None;
        for (idx, ch) in meta_content.char_indices() {
            match ch {
                '[' => depth += 1,
                ']' => {
                    if depth == 0 {
                        bracket_end = Some(idx);
                        break;
                    }
                    depth -= 1;
                }
                _ => {}
            }
        }
        if let Some(end_pos) = bracket_end {
            let meta_str = &meta_content[..end_pos];
            // Parse key=value pairs; values may contain bracketed content
            // with commas, hence the json_depth tracking.
            let mut metadata = std::collections::HashMap::new();
            let mut current_key = String::new();
            let mut current_value = String::new();
            let mut in_value = false;
            let mut json_depth = 0;
            for ch in meta_str.chars() {
                match ch {
                    '=' if !in_value && json_depth == 0 => {
                        in_value = true;
                    }
                    '[' if in_value => {
                        json_depth += 1;
                        current_value.push(ch);
                    }
                    ']' if in_value => {
                        json_depth -= 1;
                        current_value.push(ch);
                    }
                    ',' if in_value && json_depth == 0 => {
                        let key = current_key.trim().to_string();
                        let value = current_value.trim().replace(SPACE_MARKER, " ");
                        if !key.is_empty() {
                            metadata.insert(key, value);
                        }
                        current_key.clear();
                        current_value.clear();
                        in_value = false;
                    }
                    _ => {
                        if in_value {
                            current_value.push(ch);
                        } else {
                            current_key.push(ch);
                        }
                    }
                }
            }
            // Flush the trailing pair (no comma after the last entry).
            if !current_key.is_empty() {
                let key = current_key.trim().to_string();
                let value = current_value.trim().replace(SPACE_MARKER, " ");
                metadata.insert(key, value);
            }
            (
                root_key,
                if metadata.is_empty() {
                    None
                } else {
                    Some(metadata)
                },
            )
        } else {
            return Err(SchemaError::InvalidInput(
                "Unclosed metadata bracket in schema".to_string(),
            ));
        }
    } else {
        // No metadata; a root containing ':' is assumed to be a stray field
        // definition rather than a root key.
        let root = root_and_metadata.trim();
        let root_key = if root.is_empty() || root.contains(':') {
            None
        } else {
            Some(root.to_string())
        };
        (root_key, None)
    };
    let schema_parts: Vec<&str> = field_defs.split(FIELD_SEP).collect();
    let mut fields = Vec::new();
    for part in &schema_parts {
        if part.is_empty() {
            continue;
        }
        let (name, field_type) = parse_field_def_with_tokens(part, &token_map)?;
        fields.push(FieldDef::new(name, field_type));
    }
    if fields.is_empty() {
        return Err(SchemaError::InvalidInput(
            "No field definitions in schema".to_string(),
        ));
    }
    // Parse data rows into a flat row-major value vector, tracking nulls.
    let mut values = Vec::new();
    let mut null_positions = Vec::new();
    let mut row_count = 0;
    for row_str in data_part.split(ROW_START) {
        let row_str = row_str.trim().trim_end_matches(SPACE_MARKER);
        if row_str.is_empty() {
            continue;
        }
        let row_values: Vec<&str> = split_row(row_str, &fields);
        if row_values.len() != fields.len() {
            return Err(SchemaError::InvalidInput(format!(
                "Row {} has {} values, expected {} fields",
                row_count,
                row_values.len(),
                fields.len()
            )));
        }
        for (field_idx, (value_str, field)) in row_values.iter().zip(fields.iter()).enumerate() {
            let value_str = value_str.trim();
            if value_str == NULL_VALUE {
                // ∅ in an array field means "empty array", not null (the
                // serializer writes empty arrays as ∅).
                if matches!(field.field_type, FieldType::Array(_)) {
                    values.push(SchemaValue::Array(vec![]));
                } else {
                    null_positions.push(row_count * fields.len() + field_idx);
                    values.push(SchemaValue::Null);
                }
            } else {
                // Single-character cells may be value-dictionary tokens.
                let resolved_value = if value_str.len() == 1 || value_str.chars().count() == 1 {
                    let first_char = value_str.chars().next().unwrap();
                    if let Some(expanded) = value_dict.get(&first_char) {
                        expanded.as_str()
                    } else {
                        value_str
                    }
                } else {
                    value_str
                };
                let value = parse_value(resolved_value, &field.field_type)?;
                values.push(value);
            }
        }
        row_count += 1;
    }
    let mut header = SchemaHeader::new(row_count, fields);
    if root_key.is_some() {
        header.root_key = root_key;
        header.set_flag(FLAG_HAS_ROOT_KEY);
    }
    header.metadata = metadata;
    // Pack null positions into an LSB-first bitmap over row-major cells.
    if !null_positions.is_empty() {
        header.set_flag(FLAG_HAS_NULLS);
        let bitmap_size = (row_count * header.fields.len()).div_ceil(8);
        let mut bitmap = vec![0u8; bitmap_size];
        for pos in null_positions {
            let byte_idx = pos / 8;
            let bit_idx = pos % 8;
            bitmap[byte_idx] |= 1 << bit_idx;
        }
        header.null_bitmap = Some(bitmap);
    }
    IntermediateRepresentation::new(header, values)
}
/// Renders a `FieldType` as its compact schema marker: a superscript type
/// letter, with `⟦⟧` appended per level of array nesting.
fn field_type_to_str(ft: &FieldType) -> String {
    match ft {
        FieldType::Array(inner) => format!("{}⟦⟧", field_type_to_str(inner)),
        FieldType::U64 | FieldType::I64 => TYPE_INT_SUPER.to_string(),
        FieldType::F64 => TYPE_FLOAT_SUPER.to_string(),
        FieldType::Bool => TYPE_BOOL_SUPER.to_string(),
        // Null and Any fall back to the string marker, same as String.
        FieldType::String | FieldType::Null | FieldType::Any => TYPE_STR_SUPER.to_string(),
    }
}
/// Parses a type string: long form (`int`, `str`, `float`, `bool`), a
/// single superscript marker (`ⁱ`/`ˢ`/`ᶠ`/`ᵇ`), `@` (string array), or any
/// of those followed by `⟦⟧`/`[]` for arrays.
///
/// Improvement: the original allocated four `String`s on every call just to
/// compare `s` against one-character markers; comparing the single `char`
/// directly avoids all of those allocations.
fn parse_type_str(s: &str) -> Result<FieldType, SchemaError> {
    // Array suffix (unicode or ASCII form): recurse on the element type.
    if let Some(inner) = s.strip_suffix("⟦⟧").or_else(|| s.strip_suffix("[]")) {
        let inner_type = parse_type_str(inner)?;
        return Ok(FieldType::Array(Box::new(inner_type)));
    }
    // Single-character forms: superscript markers and '@'.
    let mut chars = s.chars();
    if let (Some(c), None) = (chars.next(), chars.next()) {
        match c {
            TYPE_INT_SUPER => return Ok(FieldType::I64),
            TYPE_STR_SUPER => return Ok(FieldType::String),
            TYPE_FLOAT_SUPER => return Ok(FieldType::F64),
            TYPE_BOOL_SUPER => return Ok(FieldType::Bool),
            '@' => return Ok(FieldType::Array(Box::new(FieldType::String))),
            _ => {}
        }
    }
    // Long-form names, else an error listing the accepted spellings.
    match s {
        TYPE_INT => Ok(FieldType::I64),
        TYPE_STR => Ok(FieldType::String),
        TYPE_FLOAT => Ok(FieldType::F64),
        TYPE_BOOL => Ok(FieldType::Bool),
        _ => Err(SchemaError::InvalidInput(format!(
            "Unknown type '{}'. Valid types: int/ⁱ, str/ˢ, float/ᶠ, bool/ᵇ, @",
            s
        ))),
    }
}
/// Parses a field definition, then — when the parsed name is a single
/// character present in `token_map` — substitutes the real field name the
/// token stands for.
fn parse_field_def_with_tokens(
    s: &str,
    token_map: &std::collections::HashMap<char, String>,
) -> Result<(String, FieldType), SchemaError> {
    let (name_or_token, field_type) = parse_field_def(s)?;
    let mut it = name_or_token.chars();
    // Exactly one char and it is a known token: resolve it.
    if let (Some(c), None) = (it.next(), it.next()) {
        if let Some(resolved_name) = token_map.get(&c) {
            return Ok((resolved_name.clone(), field_type));
        }
    }
    Ok((name_or_token, field_type))
}
/// Parses one field definition: either the explicit `name:type` form or the
/// compact `name` + superscript marker form, optionally suffixed with
/// `⟦⟧`/`[]` to denote an array.
fn parse_field_def(s: &str) -> Result<(String, FieldType), SchemaError> {
    // Explicit `name:type` form takes precedence.
    if let Some((name_part, type_part)) = s.split_once(':') {
        let field_type = parse_type_str(type_part.trim())?;
        return Ok((name_part.trim().to_string(), field_type));
    }
    // Peel an array suffix off the compact form, if present.
    let (base, is_array) = match s.strip_suffix("⟦⟧").or_else(|| s.strip_suffix("[]")) {
        Some(stripped) => (stripped, true),
        None => (s, false),
    };
    // Try each superscript marker as a suffix of the remaining name.
    let type_markers = [
        (TYPE_STR_SUPER, FieldType::String),
        (TYPE_INT_SUPER, FieldType::I64),
        (TYPE_FLOAT_SUPER, FieldType::F64),
        (TYPE_BOOL_SUPER, FieldType::Bool),
    ];
    for (marker, field_type) in type_markers {
        let Some(name_part) = base.strip_suffix(marker) else {
            continue;
        };
        let name = name_part.trim().to_string();
        if name.is_empty() {
            return Err(SchemaError::InvalidInput(format!(
                "Empty field name in '{}'",
                s
            )));
        }
        let final_type = if is_array {
            FieldType::Array(Box::new(field_type))
        } else {
            field_type
        };
        return Ok((name, final_type));
    }
    Err(SchemaError::InvalidInput(format!(
        "Invalid field definition '{}'. Expected format: name:type or nameˢ/ⁱ/ᶠ/ᵇ",
        s
    )))
}
/// Renders one value as its row-cell text for the given field type.
///
/// Spaces in strings are escaped with ▓; whole-valued floats below 1e15
/// print with one decimal place (`2.0`) so they re-read as floats; empty
/// arrays collapse to the null marker ∅ (mapped back to `[]` by `parse`
/// for array fields); array elements are joined with ◈.
fn value_to_str(value: &SchemaValue, field_type: &FieldType) -> String {
    match value {
        SchemaValue::U64(n) => n.to_string(),
        SchemaValue::I64(n) => n.to_string(),
        SchemaValue::F64(n) => {
            if n.fract() == 0.0 && n.abs() < 1e15 {
                format!("{:.1}", n)
            } else {
                n.to_string()
            }
        }
        SchemaValue::String(s) => s.replace(' ', &SPACE_MARKER.to_string()),
        SchemaValue::Bool(b) => b.to_string(),
        SchemaValue::Null => NULL_VALUE.to_string(),
        SchemaValue::Array(arr) => {
            if arr.is_empty() {
                return NULL_VALUE.to_string();
            }
            // Elements render with the declared element type; a non-array
            // field type here falls back to String.
            let inner_type = if let FieldType::Array(inner) = field_type {
                inner.as_ref()
            } else {
                &FieldType::String
            };
            let elements: Vec<String> = arr.iter().map(|v| value_to_str(v, inner_type)).collect();
            elements.join(&ARRAY_SEP.to_string())
        }
    }
}
/// Parses one serialized cell string into a `SchemaValue` of `field_type`.
///
/// `∅` always yields `Null`; booleans must be the literals `true`/`false`;
/// strings get ▓ unescaped back to spaces; arrays split on ◈ and parse each
/// element against the inner type.
fn parse_value(s: &str, field_type: &FieldType) -> Result<SchemaValue, SchemaError> {
    if s == NULL_VALUE {
        return Ok(SchemaValue::Null);
    }
    match field_type {
        FieldType::U64 => s
            .parse::<u64>()
            .map(SchemaValue::U64)
            .map_err(|_| SchemaError::InvalidInput(format!("Invalid integer: '{}'", s))),
        FieldType::I64 => s
            .parse::<i64>()
            .map(SchemaValue::I64)
            .map_err(|_| SchemaError::InvalidInput(format!("Invalid integer: '{}'", s))),
        FieldType::F64 => s
            .parse::<f64>()
            .map(SchemaValue::F64)
            .map_err(|_| SchemaError::InvalidInput(format!("Invalid float: '{}'", s))),
        FieldType::String => Ok(SchemaValue::String(s.replace(SPACE_MARKER, " "))),
        FieldType::Bool => match s {
            "true" => Ok(SchemaValue::Bool(true)),
            "false" => Ok(SchemaValue::Bool(false)),
            _ => Err(SchemaError::InvalidInput(format!(
                "Invalid boolean: '{}'. Expected 'true' or 'false'",
                s
            ))),
        },
        FieldType::Null => Ok(SchemaValue::Null),
        FieldType::Array(inner) => {
            if s.is_empty() {
                return Ok(SchemaValue::Array(vec![]));
            }
            let elements: Result<Vec<_>, _> = s
                .split(ARRAY_SEP)
                .map(|elem| parse_value(elem.trim(), inner))
                .collect();
            elements.map(SchemaValue::Array)
        }
        // Any: no escaping is undone here, the raw text is kept.
        FieldType::Any => Ok(SchemaValue::String(s.to_string())),
    }
}
/// Splits one data row on `┃` into at most `fields.len()` pieces, so that a
/// separator character inside the final field's payload is preserved.
///
/// Improvement: `splitn` accepts a `char` pattern directly, so the original
/// per-call `String` allocation for the separator is unnecessary.
fn split_row<'a>(row_str: &'a str, fields: &[FieldDef]) -> Vec<&'a str> {
    row_str.splitn(fields.len(), FIELD_SEP).collect()
}
/// Path-mode serializer: flattens arbitrary JSON into `path┃value` lines.
///
/// Output: optional runic dictionary line for repeated path segments,
/// optional hieroglyph dictionary line for repeated values, then one line
/// per leaf with ჻-separated path segments.
pub fn serialize_path_mode(json: &str) -> Result<String, SchemaError> {
    use serde_json::Value;
    use std::collections::HashMap;
    let value: Value = serde_json::from_str(json)
        .map_err(|e| SchemaError::InvalidInput(format!("Invalid JSON: {}", e)))?;
    let mut path_values: Vec<(String, String)> = Vec::new();
    collect_paths(&value, String::new(), &mut path_values);
    // Count path segments; purely numeric segments (array indices) are not
    // dictionary candidates.
    let mut segment_counts: HashMap<String, usize> = HashMap::new();
    for (path, _) in &path_values {
        for segment in path.split(NEST_SEP) {
            if segment.parse::<usize>().is_err() {
                *segment_counts.entry(segment.to_string()).or_insert(0) += 1;
            }
        }
    }
    // Runic tokens for segments seen at least twice, most frequent first
    // (ties broken by the segment text for determinism).
    let mut path_dict: HashMap<String, char> = HashMap::new();
    let mut sorted_segments: Vec<_> = segment_counts
        .iter()
        .filter(|(_, count)| **count >= 2)
        .collect();
    sorted_segments.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
    for (idx, (segment, _)) in sorted_segments.iter().enumerate() {
        if let Some(token) = tokens::get_token(idx) {
            path_dict.insert((*segment).clone(), token);
        }
    }
    // Count values, skipping the special markers and numeric-looking text.
    let mut value_counts: HashMap<String, usize> = HashMap::new();
    for (_, val) in &path_values {
        if !val.is_empty()
            && val != NULL_VALUE
            && val != TRUE_MARKER
            && val != FALSE_MARKER
            && val != EMPTY_ARRAY_MARKER
            && val != EMPTY_OBJECT_MARKER
            && !val
                .chars()
                .all(|c| c.is_ascii_digit() || c == '.' || c == '-')
        {
            *value_counts.entry(val.clone()).or_insert(0) += 1;
        }
    }
    // Hieroglyph tokens for repeated values, same ranking scheme.
    let mut value_dict: HashMap<String, char> = HashMap::new();
    let mut sorted_values: Vec<_> = value_counts
        .iter()
        .filter(|(_, count)| **count >= 2)
        .collect();
    sorted_values.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
    for (idx, (value, _)) in sorted_values.iter().enumerate() {
        if let Some(token) = value_tokens::get_token(idx) {
            value_dict.insert((*value).clone(), token);
        }
    }
    let mut output = String::new();
    // Dictionary lines are sorted by token for deterministic output.
    if !path_dict.is_empty() {
        output.push(TOKEN_MAP_PREFIX);
        let mut sorted_dict: Vec<_> = path_dict.iter().collect();
        sorted_dict.sort_by_key(|(_, token)| **token);
        for (idx, (segment, token)) in sorted_dict.iter().enumerate() {
            if idx > 0 {
                output.push(',');
            }
            output.push(**token);
            output.push('=');
            output.push_str(segment);
        }
        output.push('\n');
    }
    if !value_dict.is_empty() {
        output.push(TOKEN_MAP_PREFIX);
        let mut sorted_dict: Vec<_> = value_dict.iter().collect();
        sorted_dict.sort_by_key(|(_, token)| **token);
        for (idx, (value, token)) in sorted_dict.iter().enumerate() {
            if idx > 0 {
                output.push(',');
            }
            output.push(**token);
            output.push('=');
            output.push_str(value);
        }
        output.push('\n');
    }
    // One line per leaf: tokenized path ┃ (possibly tokenized) value.
    for (path, value) in path_values {
        let tokenized_path = tokenize_path(&path, &path_dict);
        output.push_str(&tokenized_path);
        output.push(FIELD_SEP);
        if let Some(&token) = value_dict.get(&value) {
            output.push(token);
        } else {
            output.push_str(&value);
        }
        output.push('\n');
    }
    // Drop the trailing newline.
    if output.ends_with('\n') {
        output.pop();
    }
    Ok(output)
}
fn tokenize_path(path: &str, dict: &std::collections::HashMap<String, char>) -> String {
let parts: Vec<&str> = path.split(NEST_SEP).collect();
let tokenized_parts: Vec<String> = parts
.iter()
.map(|segment| {
if let Some(&token) = dict.get(*segment) {
token.to_string()
} else {
segment.to_string()
}
})
.collect();
tokenized_parts.join(&NEST_SEP.to_string())
}
/// Recursively flattens a JSON value into `(path, serialized-leaf)` pairs.
///
/// Paths use ჻ between segments; array indices appear as bare numbers while
/// numeric-looking object keys get a `#` prefix so the two can be told
/// apart when rebuilding. Scalars use the path-mode markers (∅/⊤/⊥/⟦⟧/⟨⟩),
/// ▓ for spaces, and a `"` prefix shielding number-like strings.
fn collect_paths(value: &serde_json::Value, path: String, output: &mut Vec<(String, String)>) {
    use serde_json::Value;
    match value {
        Value::Null => {
            output.push((path, NULL_VALUE.to_string()));
        }
        Value::Bool(b) => {
            let val = if *b { TRUE_MARKER } else { FALSE_MARKER };
            output.push((path, val.to_string()));
        }
        Value::Number(n) => {
            output.push((path, n.to_string()));
        }
        Value::String(s) => {
            let val = s.replace(' ', &SPACE_MARKER.to_string());
            // Prefix number-like strings so they round-trip as strings.
            let val = if looks_like_number(&val) {
                format!("{}{}", STRING_PREFIX, val)
            } else {
                val
            };
            output.push((path, val));
        }
        Value::Array(arr) => {
            if arr.is_empty() {
                output.push((path, EMPTY_ARRAY_MARKER.to_string()));
            } else {
                for (idx, item) in arr.iter().enumerate() {
                    let item_path = if path.is_empty() {
                        idx.to_string()
                    } else {
                        format!("{}{}{}", path, NEST_SEP, idx)
                    };
                    collect_paths(item, item_path, output);
                }
            }
        }
        Value::Object(obj) => {
            if obj.is_empty() {
                output.push((path, EMPTY_OBJECT_MARKER.to_string()));
            } else {
                for (key, val) in obj {
                    // Mark keys that would otherwise parse as array indices.
                    let marked_key = if key.parse::<usize>().is_ok() {
                        format!("{}{}", OBJECT_KEY_PREFIX, key)
                    } else {
                        key.clone()
                    };
                    let key_path = if path.is_empty() {
                        marked_key
                    } else {
                        format!("{}{}{}", path, NEST_SEP, marked_key)
                    };
                    collect_paths(val, key_path, output);
                }
            }
        }
    }
}
/// Returns true when `s` parses as a number (used to decide whether a JSON
/// string needs a `"` prefix so it is not re-read as a numeric literal).
///
/// Improvement: the original tried `i64` first and then `f64`, but Rust's
/// `f64` grammar accepts every string the `i64` grammar does (plus floats,
/// `inf`, `NaN`, ...), so the extra `i64` parse was redundant work.
fn looks_like_number(s: &str) -> bool {
    if s.is_empty() {
        return false;
    }
    s.parse::<f64>().is_ok()
}
/// Parses path-mode text back into a JSON string.
///
/// Reads leading `@` dictionary lines (runic first char = path-segment
/// dictionary, hieroglyph = value dictionary), expands every `path┃value`
/// line, and rebuilds the nested JSON structure from the flat paths.
pub fn parse_path_mode(input: &str) -> Result<String, SchemaError> {
    use serde_json::Value;
    use std::collections::HashMap;
    let input = input.trim();
    if input.is_empty() {
        return Err(SchemaError::InvalidInput(
            "Empty path mode input".to_string(),
        ));
    }
    let mut path_dict: HashMap<char, String> = HashMap::new();
    let mut value_dict: HashMap<char, String> = HashMap::new();
    let lines: Vec<&str> = input.lines().collect();
    let mut data_start_idx = 0;
    // Consume dictionary lines; the first char after '@' decides which
    // dictionary a line belongs to.
    for (idx, line) in lines.iter().enumerate() {
        if !line.starts_with(TOKEN_MAP_PREFIX) {
            data_start_idx = idx;
            break;
        }
        let dict_content = &line[1..];
        if dict_content.is_empty() {
            data_start_idx = idx + 1;
            continue;
        }
        let first_token = dict_content.chars().next();
        if first_token.is_none() {
            data_start_idx = idx + 1;
            continue;
        }
        let ft = first_token.unwrap();
        let is_runic = tokens::is_token(ft);
        let is_hieroglyph = value_tokens::is_token(ft);
        if is_runic {
            for pair in dict_content.split(',') {
                let parts: Vec<&str> = pair.splitn(2, '=').collect();
                if parts.len() == 2
                    && let Some(token) = parts[0].chars().next()
                {
                    path_dict.insert(token, parts[1].to_string());
                }
            }
        } else if is_hieroglyph {
            for pair in dict_content.split(',') {
                let parts: Vec<&str> = pair.splitn(2, '=').collect();
                if parts.len() == 2
                    && let Some(token) = parts[0].chars().next()
                {
                    value_dict.insert(token, parts[1].to_string());
                }
            }
        }
        data_start_idx = idx + 1;
    }
    // Expand each data line into (full path, JSON value).
    let mut paths: HashMap<String, Value> = HashMap::new();
    for line in lines.iter().skip(data_start_idx) {
        if line.is_empty() {
            continue;
        }
        let parts: Vec<&str> = line.splitn(2, FIELD_SEP).collect();
        if parts.len() != 2 {
            return Err(SchemaError::InvalidInput(format!(
                "Invalid path line: '{}'",
                line
            )));
        }
        let tokenized_path = parts[0];
        let detokenized_path = detokenize_path(tokenized_path, &path_dict);
        let mut value_str = parts[1];
        // A single-character value may be a value-dictionary token.
        let resolved: String;
        if (value_str.len() == 1 || value_str.chars().count() == 1)
            && let Some(c) = value_str.chars().next()
            && let Some(expanded) = value_dict.get(&c)
        {
            resolved = expanded.clone();
            value_str = &resolved;
        }
        let value = parse_path_value(value_str)?;
        paths.insert(detokenized_path, value);
    }
    let root = build_json_from_paths(&paths)?;
    serde_json::to_string(&root)
        .map_err(|e| SchemaError::InvalidInput(format!("JSON serialization failed: {}", e)))
}
fn detokenize_path(path: &str, dict: &std::collections::HashMap<char, String>) -> String {
let parts: Vec<&str> = path.split(NEST_SEP).collect();
let detokenized_parts: Vec<String> = parts
.iter()
.map(|segment| {
if (segment.len() == 1 || segment.chars().count() == 1)
&& let Some(c) = segment.chars().next()
&& let Some(expanded) = dict.get(&c)
{
return expanded.clone();
}
segment.to_string()
})
.collect();
detokenized_parts.join(&NEST_SEP.to_string())
}
/// Converts one path-mode leaf string into a `serde_json::Value`.
///
/// Special markers (∅/⊤/⊥/⟦⟧/⟨⟩) map to their JSON equivalents; a leading
/// `"` forces string interpretation; otherwise integer, then float, then
/// plain-string parsing is tried, with ▓ unescaped back to spaces.
fn parse_path_value(s: &str) -> Result<serde_json::Value, SchemaError> {
    use serde_json::{Value, json};
    match s {
        NULL_VALUE => Ok(Value::Null),
        TRUE_MARKER => Ok(json!(true)),
        FALSE_MARKER => Ok(json!(false)),
        EMPTY_ARRAY_MARKER => Ok(json!([])),
        EMPTY_OBJECT_MARKER => Ok(json!({})),
        _ => {
            // `"`-prefixed values are always strings (shielded numbers).
            if s.starts_with(STRING_PREFIX) {
                let inner = &s[STRING_PREFIX.len_utf8()..];
                return Ok(json!(inner.replace(SPACE_MARKER, " ")));
            }
            if let Ok(n) = s.parse::<i64>() {
                return Ok(json!(n));
            }
            if let Ok(f) = s.parse::<f64>() {
                return Ok(json!(f));
            }
            Ok(json!(s.replace(SPACE_MARKER, " ")))
        }
    }
}
/// Rebuilds a nested JSON value from flat `path -> value` entries.
///
/// NOTE(review): `HashMap` iteration order is unspecified, so if one path
/// is a prefix of another the final shape could depend on insertion order —
/// verify callers never produce such overlapping paths.
fn build_json_from_paths(
    paths: &std::collections::HashMap<String, serde_json::Value>,
) -> Result<serde_json::Value, SchemaError> {
    use serde_json::json;
    if paths.is_empty() {
        return Ok(json!({}));
    }
    let mut root = json!({});
    for (path, value) in paths {
        insert_at_path(&mut root, path, value.clone())?;
    }
    Ok(root)
}
/// Inserts `value` into `root` at the ჻-separated `path`, creating
/// intermediate arrays/objects as needed.
///
/// Segment rules: a purely numeric segment addresses an array index
/// (padding with nulls); a `#`-prefixed segment is an object key that
/// merely looks numeric; anything else is a plain object key. A container
/// whose kind does not match the segment is replaced wholesale.
fn insert_at_path(
    root: &mut serde_json::Value,
    path: &str,
    value: serde_json::Value,
) -> Result<(), SchemaError> {
    use serde_json::Value;
    // An empty path means the value IS the root.
    if path.is_empty() {
        *root = value;
        return Ok(());
    }
    let parts: Vec<&str> = path.split(NEST_SEP).collect();
    // Walks one segment at a time, seeding the child container with the
    // kind the *next* segment requires before recursing.
    fn insert_recursive(
        current: &mut Value,
        parts: &[&str],
        value: &Value,
    ) -> Result<(), SchemaError> {
        use serde_json::{Value, json};
        if parts.is_empty() {
            return Ok(());
        }
        let part = parts[0];
        let is_last = parts.len() == 1;
        let is_object_key = part.starts_with(OBJECT_KEY_PREFIX);
        let actual_key = if is_object_key {
            &part[OBJECT_KEY_PREFIX.len_utf8()..]
        } else {
            part
        };
        if !is_object_key && let Ok(index) = part.parse::<usize>() {
            // Array segment: ensure an array and grow it to cover `index`.
            if !current.is_array() {
                *current = json!([]);
            }
            let arr = current.as_array_mut().unwrap();
            while arr.len() <= index {
                arr.push(Value::Null);
            }
            if is_last {
                arr[index] = value.clone();
            } else {
                // Seed the element with the container the next segment needs.
                let next_part = parts[1];
                if next_part.parse::<usize>().is_ok() {
                    if !arr[index].is_array() {
                        arr[index] = json!([]);
                    }
                } else if !arr[index].is_object() {
                    arr[index] = json!({});
                }
                insert_recursive(&mut arr[index], &parts[1..], value)?;
            }
        } else {
            // Object segment.
            if !current.is_object() {
                *current = json!({});
            }
            if is_last {
                current
                    .as_object_mut()
                    .unwrap()
                    .insert(actual_key.to_string(), value.clone());
            } else {
                // Seed the child entry with the container the next segment
                // needs, then descend into it.
                let next_part = parts[1];
                let next_is_object_key = next_part.starts_with(OBJECT_KEY_PREFIX);
                let obj = current.as_object_mut().unwrap();
                if !next_is_object_key && next_part.parse::<usize>().is_ok() {
                    obj.entry(actual_key.to_string())
                        .or_insert_with(|| json!([]));
                } else {
                    obj.entry(actual_key.to_string())
                        .or_insert_with(|| json!({}));
                }
                let next = obj.get_mut(actual_key).unwrap();
                insert_recursive(next, &parts[1..], value)?;
            }
        }
        Ok(())
    }
    insert_recursive(root, &parts, &value)
}
/// ASCII-only serializer: comma/semicolon delimited, with `V1`, `V2`, ...
/// tokens standing in for string/bool values that repeat.
///
/// Layout: `name:type,...` header, then an optional `;V1=val,...`
/// dictionary section, then one `;v,v,...` section per row. Nulls are
/// emitted as empty cells; array elements are joined with `|`.
pub fn serialize_ascii(ir: &IntermediateRepresentation) -> Result<String, SchemaError> {
    use std::collections::HashMap;
    let mut output = String::new();
    // Count string/bool values to decide which get V-tokens.
    let mut value_counts: HashMap<String, usize> = HashMap::new();
    let field_count = ir.header.fields.len();
    for row in 0..ir.header.row_count {
        for field_idx in 0..ir.header.fields.len() {
            if ir.is_null(row, field_idx) {
                continue;
            }
            let value_idx = row * field_count + field_idx;
            let value = &ir.values[value_idx];
            if let SchemaValue::String(s) = value {
                *value_counts.entry(s.clone()).or_insert(0) += 1;
            } else if let SchemaValue::Bool(b) = value {
                let b_str = b.to_string();
                *value_counts.entry(b_str).or_insert(0) += 1;
            }
        }
    }
    // Values occurring at least twice get tokens, most frequent first
    // (ties broken by the value text for determinism).
    let mut value_dict: HashMap<String, String> = HashMap::new();
    let mut reverse_dict: HashMap<String, String> = HashMap::new();
    let mut sorted_values: Vec<_> = value_counts
        .iter()
        .filter(|(_, count)| **count >= 2)
        .collect();
    sorted_values.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
    for (idx, (value, _)) in sorted_values.iter().enumerate() {
        let token = format!("V{}", idx + 1);
        value_dict.insert((*value).clone(), token.clone());
        reverse_dict.insert(token, (*value).clone());
    }
    // Header: name:type per field.
    for (idx, field) in ir.header.fields.iter().enumerate() {
        if idx > 0 {
            output.push(',');
        }
        output.push_str(&field.name);
        let type_suffix = match field.field_type {
            FieldType::I64 | FieldType::U64 => ":i",
            FieldType::String => ":s",
            FieldType::F64 => ":f",
            FieldType::Bool => ":b",
            FieldType::Array(_) => ":a",
            _ => "",
        };
        output.push_str(type_suffix);
    }
    // Dictionary section. NOTE(review): tokens sort lexicographically here,
    // so "V10" orders before "V2" — cosmetic, but confirm it is intended.
    if !value_dict.is_empty() {
        output.push(';');
        let mut sorted_dict: Vec<_> = reverse_dict.iter().collect();
        sorted_dict.sort_by_key(|(token, _)| *token);
        for (idx, (token, value)) in sorted_dict.iter().enumerate() {
            if idx > 0 {
                output.push(',');
            }
            output.push_str(token);
            output.push('=');
            output.push_str(value);
        }
    }
    // Row sections.
    for row in 0..ir.header.row_count {
        output.push(';');
        for (field_idx, _field) in ir.header.fields.iter().enumerate() {
            if field_idx > 0 {
                output.push(',');
            }
            // Nulls serialize as an empty cell.
            if ir.is_null(row, field_idx) {
                continue;
            }
            let value_idx = row * field_count + field_idx;
            let value = &ir.values[value_idx];
            match value {
                SchemaValue::U64(n) => output.push_str(&n.to_string()),
                SchemaValue::I64(n) => output.push_str(&n.to_string()),
                SchemaValue::F64(n) => {
                    // Whole floats keep one decimal so they re-read as floats.
                    if n.fract() == 0.0 && n.abs() < 1e15 {
                        output.push_str(&format!("{:.1}", n));
                    } else {
                        output.push_str(&n.to_string());
                    }
                }
                SchemaValue::String(s) => {
                    if let Some(token) = value_dict.get(s) {
                        output.push_str(token);
                    } else {
                        output.push_str(s);
                    }
                }
                SchemaValue::Bool(b) => {
                    let b_str = b.to_string();
                    if let Some(token) = value_dict.get(&b_str) {
                        output.push_str(token);
                    } else {
                        output.push_str(&b_str);
                    }
                }
                SchemaValue::Null => {
                    // Explicit Null value: nothing emitted (empty cell).
                }
                SchemaValue::Array(arr) => {
                    for (i, elem) in arr.iter().enumerate() {
                        if i > 0 {
                            output.push('|');
                        }
                        match elem {
                            SchemaValue::String(s) => output.push_str(s),
                            SchemaValue::I64(n) => output.push_str(&n.to_string()),
                            SchemaValue::U64(n) => output.push_str(&n.to_string()),
                            SchemaValue::F64(n) => output.push_str(&n.to_string()),
                            SchemaValue::Bool(b) => output.push_str(&b.to_string()),
                            // Nested arrays/nulls inside arrays are skipped.
                            _ => {}
                        }
                    }
                }
            }
        }
    }
    Ok(output)
}
pub fn serialize_markdown(ir: &IntermediateRepresentation) -> Result<String, SchemaError> {
let mut output = String::new();
let field_count = ir.header.fields.len();
if field_count < 2 {
return Err(SchemaError::InvalidInput(
"Markdown IR requires at least type and content fields".to_string(),
));
}
for row in 0..ir.header.row_count {
if row > 0 {
output.push(';');
}
let type_idx = row * field_count;
let block_type = match &ir.values[type_idx] {
SchemaValue::String(s) => s.as_str(),
_ => continue,
};
let content_idx = row * field_count + 1;
let content = if ir.is_null(row, 1) {
""
} else {
match &ir.values[content_idx] {
SchemaValue::String(s) => s.as_str(),
_ => "",
}
};
let meta = if field_count > 2 && !ir.is_null(row, 2) {
let meta_idx = row * field_count + 2;
match &ir.values[meta_idx] {
SchemaValue::String(s) => Some(s.as_str()),
_ => None,
}
} else {
None
};
match block_type {
"h1" => {
output.push_str("#1 ");
output.push_str(content);
}
"h2" => {
output.push_str("#2 ");
output.push_str(content);
}
"h3" => {
output.push_str("#3 ");
output.push_str(content);
}
"h4" => {
output.push_str("#4 ");
output.push_str(content);
}
"h5" => {
output.push_str("#5 ");
output.push_str(content);
}
"h6" => {
output.push_str("#6 ");
output.push_str(content);
}
"p" => {
output.push_str("p ");
output.push_str(&content.replace('\n', " "));
}
"ul" => {
let mut first = true;
for line in content.split('\n') {
if line.is_empty() {
continue;
}
let trimmed = line.trim_start();
let indent = line.len() - trimmed.len();
let level = (indent / 2) + 1;
for item in trimmed.split(';') {
if item.is_empty() {
continue;
}
if !first {
output.push(';');
}
first = false;
output.push('-');
output.push_str(&level.to_string());
output.push(' ');
output.push_str(item.trim());
}
}
}
"ol" => {
let mut first = true;
for line in content.split('\n') {
if line.is_empty() {
continue;
}
let trimmed = line.trim_start();
let indent = line.len() - trimmed.len();
let level = (indent / 2) + 1;
for item in trimmed.split(';') {
if item.is_empty() {
continue;
}
if !first {
output.push(';');
}
first = false;
output.push('+');
output.push_str(&level.to_string());
output.push(' ');
output.push_str(item.trim());
}
}
}
"code" => {
output.push_str("```");
if let Some(lang) = meta {
output.push_str(lang);
}
output.push(' ');
output.push_str(&content.replace('\n', "↵"));
output.push_str("```");
}
"quote" => {
output.push_str(">1 ");
output.push_str(&content.replace('\n', "↵"));
}
"hr" => {
output.push_str("---");
}
"link" => {
output.push('[');
output.push_str(content);
output.push_str("](");
if let Some(url) = meta {
output.push_str(url);
}
output.push(')');
}
"image" => {
output.push_str(";
if let Some(url) = meta {
output.push_str(url);
}
output.push(')');
}
"table" => {
output.push_str("T ");
output.push_str(content);
if let Some(dims) = meta {
output.push(' ');
output.push_str(dims);
}
}
_ => {
output.push_str(block_type);
output.push(' ');
output.push_str(content);
}
}
}
Ok(output)
}
#[cfg(test)]
mod tests {
use super::*;
// Test conventions used throughout this module:
// - '┃' separates fields, '◉' starts a data row, '◈' separates array
//   elements, '∅' marks nulls, '▓' encodes spaces inside values.
// - Superscripts (ⁱ ˢ ᶠ ᵇ) are the compact type markers; "id:int" etc.
//   is the legacy spelling of the same types.
// - Tokenized output replaces field names with runic tokens (ᚠ ᚡ …)
//   and repeated values with hieroglyph tokens (𓀀 …).
// Header types round-trip through parse -> serialize_readable unchanged.
#[test]
fn test_simple_roundtrip() {
let fiche = "@users┃idⁱ┃nameˢ┃activeᵇ
◉1┃alice┃true
◉2┃bob┃false";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 2);
assert_eq!(ir.header.fields.len(), 3);
assert_eq!(ir.header.root_key, Some("users".to_string()));
let output = serialize_readable(&ir, false).unwrap();
assert_eq!(output, fiche);
}
// Tokenized form emits a runic field-name map, and parsing it restores
// the original field names.
#[test]
fn test_tokenized_roundtrip() {
let fiche = "@users┃idⁱ┃nameˢ┃activeᵇ
◉1┃alice┃true
◉2┃bob┃false";
let ir = parse(fiche).unwrap();
let tokenized = serialize(&ir, false).unwrap();
assert!(tokenized.starts_with("@ᚠ=id,ᚡ=name,ᚢ=active\n"));
assert!(tokenized.contains("┃ᚠⁱ┃ᚡˢ┃ᚢᵇ"));
let ir2 = parse(&tokenized).unwrap();
assert_eq!(ir2.header.row_count, 2);
assert_eq!(ir2.header.fields.len(), 3);
assert_eq!(ir2.header.fields[0].name, "id");
assert_eq!(ir2.header.fields[1].name, "name");
assert_eq!(ir2.header.fields[2].name, "active");
}
// Legacy ":int"/":str"/":bool" suffixes parse and re-serialize as the
// compact superscript markers.
#[test]
fn test_legacy_type_format_parsing() {
let fiche = "@users┃id:int┃name:str┃active:bool
◉1┃alice┃true
◉2┃bob┃false";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 2);
assert_eq!(ir.header.fields.len(), 3);
let output = serialize_readable(&ir, false).unwrap();
assert!(output.contains("idⁱ"));
assert!(output.contains("nameˢ"));
assert!(output.contains("activeᵇ"));
}
// Legacy "str[]" array syntax parses; readable output uses ⟦⟧ brackets.
#[test]
fn test_arrays_legacy_syntax() {
let fiche = "@users┃id:int┃tags:str[]
◉1┃admin◈editor
◉2┃viewer";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 2);
if let Some(SchemaValue::Array(tags)) = ir.get_value(0, 1) {
assert_eq!(tags.len(), 2);
} else {
panic!("Expected array");
}
let output = serialize_readable(&ir, false).unwrap();
assert!(output.contains("tagsˢ⟦⟧"));
}
// The ⟦⟧ bracket syntax round-trips byte-for-byte.
#[test]
fn test_arrays_new_bracket_syntax() {
let fiche = "@users┃idⁱ┃tagsˢ⟦⟧
◉1┃admin◈editor
◉2┃viewer";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 2);
if let Some(SchemaValue::Array(tags)) = ir.get_value(0, 1) {
assert_eq!(tags.len(), 2);
} else {
panic!("Expected array");
}
let output = serialize_readable(&ir, false).unwrap();
assert_eq!(output, fiche);
}
// ∅ cells parse as nulls and re-serialize as ∅.
#[test]
fn test_nulls() {
let fiche = "@records┃idⁱ┃scoreᶠ┃notesˢ
◉1┃95.5┃∅
◉2┃∅┃pending";
let ir = parse(fiche).unwrap();
assert!(ir.is_null(0, 2)); assert!(ir.is_null(1, 1));
let output = serialize_readable(&ir, false).unwrap();
assert_eq!(output, fiche);
}
// JSON-looking text inside a string value survives parsing; ▓ decodes
// back to spaces in the IR but is preserved in the readable output.
#[test]
fn test_embedded_json() {
let fiche = r#"@logs┃levelˢ┃msgˢ
◉error┃Failed▓to▓parse▓{"key":▓"value"}"#;
let ir = parse(fiche).unwrap();
if let Some(SchemaValue::String(msg)) = ir.get_value(0, 1) {
assert_eq!(msg, r#"Failed to parse {"key": "value"}"#);
} else {
panic!("Expected string");
}
let output = serialize_readable(&ir, false).unwrap();
assert_eq!(output, fiche);
}
// A bare '@' header means no root key.
#[test]
fn test_no_root_key() {
let fiche = "@┃idⁱ┃nameˢ
◉1┃alice";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.root_key, None);
}
// parse_type_str accepts legacy names, superscripts, and both array
// spellings ("[]" and "⟦⟧").
#[test]
fn test_type_parsing() {
assert!(matches!(parse_type_str("int"), Ok(FieldType::I64)));
assert!(matches!(parse_type_str("str"), Ok(FieldType::String)));
assert!(matches!(parse_type_str("float"), Ok(FieldType::F64)));
assert!(matches!(parse_type_str("bool"), Ok(FieldType::Bool)));
assert!(matches!(parse_type_str("ⁱ"), Ok(FieldType::I64)));
assert!(matches!(parse_type_str("ˢ"), Ok(FieldType::String)));
assert!(matches!(parse_type_str("ᶠ"), Ok(FieldType::F64)));
assert!(matches!(parse_type_str("ᵇ"), Ok(FieldType::Bool)));
assert!(matches!(
parse_type_str("str[]"),
Ok(FieldType::Array(box_inner)) if *box_inner == FieldType::String
));
assert!(matches!(
parse_type_str("str⟦⟧"),
Ok(FieldType::Array(box_inner)) if *box_inner == FieldType::String
));
assert!(matches!(
parse_type_str("ˢ⟦⟧"),
Ok(FieldType::Array(box_inner)) if *box_inner == FieldType::String
));
}
// Multiple array fields per row, including a null array cell (∅) which
// parses as an empty array, all round-trip.
#[test]
fn test_nested_arrays() {
let fiche = "@people┃nameˢ┃heightˢ┃filmsˢ⟦⟧┃vehiclesˢ⟦⟧
◉Luke┃172┃film/1◈film/2┃∅
◉Leia┃150┃film/1┃vehicle/30";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 2);
assert_eq!(ir.header.fields.len(), 4);
if let Some(SchemaValue::String(name)) = ir.get_value(0, 0) {
assert_eq!(name, "Luke");
} else {
panic!("Expected string");
}
if let Some(SchemaValue::String(height)) = ir.get_value(0, 1) {
assert_eq!(height, "172");
} else {
panic!("Expected string");
}
if let Some(SchemaValue::Array(films)) = ir.get_value(0, 2) {
assert_eq!(films.len(), 2);
if let SchemaValue::String(film) = &films[0] {
assert_eq!(film, "film/1");
} else {
panic!("Expected string");
}
} else {
panic!("Expected array");
}
if let Some(SchemaValue::Array(vehicles)) = ir.get_value(0, 3) {
assert_eq!(vehicles.len(), 0);
} else {
panic!("Expected array");
}
if let Some(SchemaValue::Array(vehicles)) = ir.get_value(1, 3) {
assert_eq!(vehicles.len(), 1);
if let SchemaValue::String(vehicle) = &vehicles[0] {
assert_eq!(vehicle, "vehicle/30");
} else {
panic!("Expected string");
}
} else {
panic!("Expected array");
}
let output = serialize_readable(&ir, false).unwrap();
assert_eq!(output, fiche);
}
// ▓ markers decode to spaces in the IR and are re-emitted on output.
#[test]
fn test_space_preservation() {
let fiche = "@people┃nameˢ┃homeˢ
◉Luke▓Skywalker┃Tatooine▓Desert▓Planet
◉Leia▓Organa┃Alderaan";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 2);
if let Some(SchemaValue::String(name)) = ir.get_value(0, 0) {
assert_eq!(name, "Luke Skywalker");
} else {
panic!("Expected string");
}
if let Some(SchemaValue::String(home)) = ir.get_value(0, 1) {
assert_eq!(home, "Tatooine Desert Planet");
} else {
panic!("Expected string");
}
let output = serialize_readable(&ir, false).unwrap();
assert!(output.contains("Luke▓Skywalker"));
assert!(output.contains("Tatooine▓Desert▓Planet"));
assert_eq!(output, fiche);
}
// Minified output is a single line using ▓ as the row separator, and it
// parses back to the same data.
#[test]
fn test_minified_output() {
let fiche_normal = "@users┃idⁱ┃nameˢ
◉1┃alice
◉2┃bob";
let ir = parse(fiche_normal).unwrap();
let minified = serialize_minified(&ir).unwrap();
assert!(!minified.contains('\n'), "Minified should have no newlines");
assert!(
minified.contains('▓'),
"Minified should use ▓ as line separator"
);
let ir2 = parse(&minified).unwrap();
assert_eq!(ir2.header.row_count, 2);
assert_eq!(ir2.header.fields[0].name, "id");
assert_eq!(ir2.header.fields[1].name, "name");
if let Some(SchemaValue::String(name)) = ir2.get_value(0, 1) {
assert_eq!(name, "alice");
} else {
panic!("Expected string");
}
if let Some(SchemaValue::String(name)) = ir2.get_value(1, 1) {
assert_eq!(name, "bob");
} else {
panic!("Expected string");
}
}
// [key=value,...] annotations after the root key populate header
// metadata, with ▓ decoded to spaces in the stored values.
#[test]
fn test_metadata_annotation() {
let fiche = "@students[class=Year▓1,school_name=Springfield▓High]┃idˢ
◉A1
◉B2";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.root_key, Some("students".to_string()));
assert_eq!(ir.header.row_count, 2);
assert!(ir.header.metadata.is_some());
let metadata = ir.header.metadata.as_ref().unwrap();
assert_eq!(
metadata.get("school_name"),
Some(&"Springfield High".to_string())
);
assert_eq!(metadata.get("class"), Some(&"Year 1".to_string()));
let output = serialize_readable(&ir, false).unwrap();
assert!(output.contains("[class=Year▓1,school_name=Springfield▓High]"));
assert_eq!(output, fiche);
}
// Metadata survives a minified round trip.
#[test]
fn test_metadata_minified() {
let fiche = "@students[class=Year▓1,school_name=Springfield▓High]┃idˢ
◉A1
◉B2";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 2);
assert!(ir.header.metadata.is_some());
let metadata = ir.header.metadata.as_ref().unwrap();
assert_eq!(
metadata.get("school_name"),
Some(&"Springfield High".to_string())
);
assert_eq!(metadata.get("class"), Some(&"Year 1".to_string()));
let tokenized = serialize_minified(&ir).unwrap();
let ir2 = parse(&tokenized).unwrap();
assert_eq!(ir2.header.fields[0].name, "id");
assert_eq!(ir2.header.row_count, 2);
}
// Values repeated across rows ("info", "api", "error") get hieroglyph
// dictionary tokens (𓀀 is the first), and the tokenized form decodes
// back to the original values.
#[test]
fn test_value_dictionary() {
let fiche = "@logs┃levelˢ┃messageˢ┃serviceˢ
◉info┃Request▓received┃api
◉debug┃Parsing▓payload┃api
◉info┃Auth▓validated┃api
◉error┃Connection▓timeout┃db
◉info┃Response▓sent┃api
◉error┃Query▓failed┃db";
let ir = parse(fiche).unwrap();
assert_eq!(ir.header.row_count, 6);
let tokenized = serialize(&ir, false).unwrap();
assert!(tokenized.starts_with("@ᚠ=level,ᚡ=message,ᚢ=service\n"));
assert!(tokenized.contains("@𓀀="));
let ir2 = parse(&tokenized).unwrap();
assert_eq!(ir2.header.row_count, 6);
assert_eq!(ir2.header.fields.len(), 3);
assert_eq!(ir2.header.fields[0].name, "level");
assert_eq!(ir2.header.fields[1].name, "message");
assert_eq!(ir2.header.fields[2].name, "service");
if let Some(SchemaValue::String(level)) = ir2.get_value(0, 0) {
assert_eq!(level, "info");
} else {
panic!("Expected string value");
}
if let Some(SchemaValue::String(service)) = ir2.get_value(0, 2) {
assert_eq!(service, "api");
} else {
panic!("Expected string value");
}
if let Some(SchemaValue::String(level)) = ir2.get_value(3, 0) {
assert_eq!(level, "error");
} else {
panic!("Expected string value");
}
}
// With all values unique, no value dictionary is emitted: just the
// field map, header, and 3 rows (5 lines, no hieroglyphs).
#[test]
fn test_value_dictionary_no_duplicates() {
let fiche = "@data┃idˢ┃nameˢ
◉1┃alice
◉2┃bob
◉3┃carol";
let ir = parse(fiche).unwrap();
let tokenized = serialize(&ir, false).unwrap();
assert!(tokenized.starts_with("@ᚠ=id,ᚡ=name\n"));
let lines: Vec<&str> = tokenized.lines().collect();
assert_eq!(lines.len(), 5); assert!(!tokenized.contains("𓀀")); }
// Path-mode round trips are verified by comparing parsed JSON values,
// so key order and whitespace differences don't matter.
#[test]
fn test_path_mode_roundtrip_simple() {
let json = r#"{"a":1,"b":{"c":"hello","d":true}}"#;
let fiche = serialize_path_mode(json).unwrap();
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(original, parsed);
}
#[test]
fn test_path_mode_roundtrip_arrays() {
let json = r#"{"users":[{"name":"alice"},{"name":"bob"}]}"#;
let fiche = serialize_path_mode(json).unwrap();
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(original, parsed);
}
#[test]
fn test_path_mode_roundtrip_nulls_bools() {
let json = r#"{"active":true,"deleted":false,"data":null}"#;
let fiche = serialize_path_mode(json).unwrap();
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(original, parsed);
}
// Empty arrays and objects must be distinguishable from nulls/missing.
#[test]
fn test_path_mode_roundtrip_empty_containers() {
let json = r#"{"items":[],"meta":{}}"#;
let fiche = serialize_path_mode(json).unwrap();
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(original, parsed);
}
#[test]
fn test_path_mode_roundtrip_deep_nesting() {
let json = r#"{"a":{"b":{"c":{"d":{"e":1}}}}}"#;
let fiche = serialize_path_mode(json).unwrap();
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(original, parsed);
}
// Path mode tokenizes repeated path segments with runic tokens and
// repeated values ("admin") with hieroglyph tokens; both must decode.
#[test]
fn test_path_mode_roundtrip_with_tokenization() {
let json = r#"{"users":[{"name":"alice","role":"admin"},{"name":"bob","role":"admin"}]}"#;
let fiche = serialize_path_mode(json).unwrap();
assert!(
fiche.chars().any(tokens::is_token),
"Expected runic tokens for path segments"
);
assert!(
fiche.chars().any(value_tokens::is_token),
"Expected hieroglyph token for repeated value"
);
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(original, parsed);
}
// ASCII mode uses ';' between rows and ',' between fields, and must
// contain none of the Unicode markers.
#[test]
fn test_ascii_mode_simple() {
use crate::encoders::algorithms::schema::parsers::{InputParser, JsonParser};
let json = r#"{"users":[{"id":1,"name":"Alice","score":95,"active":true},{"id":2,"name":"Bob","score":87,"active":false}]}"#;
let ir = JsonParser::parse(json).unwrap();
let ascii = serialize_ascii(&ir).unwrap();
assert!(ascii.contains(';'));
assert!(ascii.contains(','));
assert!(ascii.is_ascii());
assert!(!ascii.contains('◉'));
assert!(!ascii.contains('┃'));
assert!(!ascii.contains('▓'));
}
// ASCII mode's value dictionary uses "V1", "V2", ... tokens for values
// that occur at least twice.
#[test]
fn test_ascii_mode_with_dictionary() {
use crate::encoders::algorithms::schema::parsers::{InputParser, JsonParser};
let json = r#"{"logs":[{"level":"info","msg":"start"},{"level":"error","msg":"fail"},{"level":"info","msg":"retry"}]}"#;
let ir = JsonParser::parse(json).unwrap();
let ascii = serialize_ascii(&ir).unwrap();
assert!(ascii.contains("V1="));
assert!(ascii.contains("info") || ascii.contains("V1"));
}
// Null cells in ASCII mode serialize as empty fields between commas.
#[test]
fn test_ascii_mode_null_handling() {
use crate::encoders::algorithms::schema::parsers::{InputParser, JsonParser};
let json = r#"{"data":[{"a":1,"b":null},{"a":2,"b":"value"}]}"#;
let ir = JsonParser::parse(json).unwrap();
let ascii = serialize_ascii(&ir).unwrap();
let rows: Vec<&str> = ascii.split(';').collect();
assert!(rows.len() >= 2);
assert!(rows[1].contains("1,"));
}
// ASCII mode keeps literal spaces (no ▓ substitution).
#[test]
fn test_ascii_mode_space_preservation() {
use crate::encoders::algorithms::schema::parsers::{InputParser, JsonParser};
let json = r#"{"people":[{"name":"Alice Smith","title":"Senior Engineer"}]}"#;
let ir = JsonParser::parse(json).unwrap();
let ascii = serialize_ascii(&ir).unwrap();
assert!(ascii.contains("Alice Smith"));
assert!(ascii.contains("Senior Engineer"));
}
// Numeric-looking object keys must keep the '#' key marker so sparse
// maps like {"0":..,"5":..,"10":..} don't get misread as arrays.
#[test]
fn test_path_mode_numeric_object_keys_roundtrip() {
let json = r#"{"values":{"0":"a","5":"b","10":"c"}}"#;
let fiche = serialize_path_mode(json).unwrap();
assert!(fiche.contains("#0"), "Expected #0 marker for object key");
assert!(fiche.contains("#5"), "Expected #5 marker for object key");
assert!(fiche.contains("#10"), "Expected #10 marker for object key");
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(
original, parsed,
"Sparse object keys should roundtrip correctly"
);
}
// Numeric strings must keep the '"' prefix so they decode back as JSON
// strings rather than numbers.
#[test]
fn test_path_mode_string_numbers_roundtrip() {
let json = r#"{"id":"1579231263","phone":"5551234567"}"#;
let fiche = serialize_path_mode(json).unwrap();
assert!(fiche.contains('"'), "Expected \" marker for string numbers");
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(
original, parsed,
"String numbers should roundtrip correctly"
);
}
// Arrays of objects with numeric keys and top-level numeric-keyed maps
// must not be confused with one another.
#[test]
fn test_path_mode_mixed_array_and_object() {
let json = r#"{"items":[{"0":"first","1":"second"}],"map":{"0":"zero","1":"one"}}"#;
let fiche = serialize_path_mode(json).unwrap();
let result = parse_path_mode(&fiche).unwrap();
let original: serde_json::Value = serde_json::from_str(json).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(
original, parsed,
"Mixed arrays and numeric-keyed objects should roundtrip"
);
}
}