// schema_index_yaml/lib.rs

//! Parse a `*.schema.yml` index definition into the core
//! [`IndexSchema`](schema_core::IndexSchema).
//!
//! A schema file describes one search document: its root table, its fields, and
//! how related tables fold in through joins and aggregates. Each field is
//! written **type-first** — `- <type>: <name>` (`keyword: email`,
//! `has_many: orders`, `count: orderCount`, `geo: location`) — and carries
//! only the siblings that type allows. Parsing is two stages:
//!
//! 1. [`SchemaYaml`] deserializes the file. Each field's type tag selects the
//!    body shape it parses into (see [`Field`]).
//!    [`ParseFrom`](schema_core::ParseFrom) also checks the declared `version`
//!    against [`SUPPORTED_VERSIONS`].
//! 2. `TryFrom<SchemaYaml>` converts it into the core model, validating
//!    identifiers and the arity rules YAML alone can't express: a join takes
//!    exactly the key its verb implies (`column` for `belongs_to`,
//!    `foreign_key` for `has_one`/`has_many`, `through` for `many_to_many`),
//!    `sum`/`min`/`max` aggregates need a `column` and a `value_type`, a
//!    `between` filter takes exactly two values, and a `geo` field needs either
//!    `lat`+`lon` or a single `column`.

22mod conversion;
23mod entities;
24mod parser;
25
26pub use entities::*;
27pub use parser::ParseError;
28
29use serde::Deserialize;
30
31pub const SUPPORTED_VERSIONS: &[u8] = &[1];
32
33/// The JSON Schema (authored as YAML) describing a `*.schema.yml` index file,
34/// embedded from this crate's `schemas/` directory for editor assist and
35/// programmatic access (both re-exported from `schema` and emitted by `flusso
36/// schema index`). Kept in lockstep with this parser by `schema`'s `schema_drift`
37/// test.
38pub const INDEX_SCHEMA: &str = include_str!("../index.schema.yml");
39
40#[derive(thiserror::Error, Debug)]
41pub enum ConversionError {
42    #[error("invalid table name: {0}")]
43    TableName(#[from] schema_core::TableNameError),
44    #[error("invalid column name: {0}")]
45    ColumnName(#[from] schema_core::ColumnNameError),
46    #[error("invalid database schema name: {0}")]
47    DatabaseSchema(#[from] schema_core::DatabaseSchemaError),
48    #[error("`{verb}` join is missing its key: it takes {expected}")]
49    MissingJoinKey {
50        verb: &'static str,
51        expected: &'static str,
52    },
53    #[error("`{verb}` join does not take `{sibling}`; it takes {expected}")]
54    UnexpectedJoinKey {
55        verb: &'static str,
56        sibling: &'static str,
57        expected: &'static str,
58    },
59    #[error("`{verb}` join does not take `{sibling}` (a to-one join picks a single row)")]
60    UnexpectedJoinSibling {
61        verb: &'static str,
62        sibling: &'static str,
63    },
64    #[error("aggregate must specify either `foreign_key` or `through`, not both or neither")]
65    InvalidAggregateKey,
66    #[error("aggregate op '{op}' requires a `column`")]
67    MissingAggregateColumn { op: &'static str },
68    #[error("filter op '{op}' requires a value")]
69    MissingFilterValue { op: &'static str },
70    #[error("filter op 'between' requires exactly 2 values, got {got}")]
71    InvalidBetweenArity { got: usize },
72    #[error("filter op '{op}' requires a sequence value")]
73    ExpectedListValue { op: &'static str },
74    #[error("aggregate op '{op}' requires a `value_type` (its result mirrors the column)")]
75    MissingAggregateType { op: &'static str },
76    #[error(
77        "aggregate op '{op}' `value_type` must be a scalar type — `geo_point` and `custom` \
78         are not valid aggregate result types"
79    )]
80    InvalidAggregateType { op: &'static str },
81    #[error(
82        "aggregate op 'ids' requires an `element_type` (`long` or `keyword`) — it states the \
83         element type of the collected primary keys"
84    )]
85    MissingElementType,
86    #[error(
87        "aggregate op 'ids' `element_type` must be a scalar type — `geo_point` and `custom` \
88         are not valid element types"
89    )]
90    InvalidElementType,
91    #[error(
92        "aggregate op 'ids' does not take `{sibling}` (it always collects the related table's primary key)"
93    )]
94    UnexpectedIdsSibling { sibling: &'static str },
95    #[error("aggregate does not take `{sibling}` (only `ids` does)")]
96    UnexpectedAggregateSibling { sibling: &'static str },
97    #[error(
98        "a `geo` field needs either both `lat` and `lon` (two columns) or a single `column` \
99         holding a combined value — not a mix"
100    )]
101    InvalidGeoSource,
102    #[error(
103        "a `map` field's `values` must be a leaf type — `text`/`keyword` or a number/date kind \
104         (`{got}` is not one); `boolean`, `binary`, `json`, `geo`, and `custom` are not valid \
105         map value types"
106    )]
107    InvalidMapValueType { got: &'static str },
108    #[error(
109        "`doc_id` is not supported yet — the document `_id` is always derived from `primary_key`. \
110         Remove `doc_id` from the schema."
111    )]
112    DocIdUnsupported,
113    #[error(
114        "a `default` must be a scalar value (string, number, bool, or date) — a `{got}` default \
115         is not supported"
116    )]
117    NonScalarDefault { got: &'static str },
118}
119
120#[derive(Debug, Clone, Deserialize)]
121#[serde(deny_unknown_fields)]
122pub struct SchemaYaml {
123    pub version: u8,
124    pub table: String,
125    #[serde(skip_serializing_if = "Option::is_none")]
126    pub schema: Option<String>,
127    #[serde(skip_serializing_if = "Option::is_none")]
128    pub primary_key: Option<String>,
129    #[serde(skip_serializing_if = "Option::is_none")]
130    pub doc_id: Option<String>,
131    #[serde(skip_serializing_if = "Option::is_none")]
132    pub soft_delete: Option<SoftDelete>,
133    /// Root filters: only matching root rows become documents.
134    #[serde(default, skip_serializing_if = "Option::is_none")]
135    pub filters: Option<Vec<Filter>>,
136    pub fields: Vec<Field>,
137}
138
139impl TryFrom<SchemaYaml> for schema_core::IndexSchema {
140    type Error = ConversionError;
141
142    fn try_from(yaml: SchemaYaml) -> Result<Self, Self::Error> {
143        use schema_core::common::{ColumnName, TableName};
144
145        let table = TableName::try_new(yaml.table)?;
146        let db_schema = match yaml.schema {
147            Some(s) => schema_core::DatabaseSchema::try_new(s)?,
148            None => schema_core::DatabaseSchema::default(),
149        };
150        let primary_key = yaml.primary_key.map(ColumnName::try_new).transpose()?;
151        // `doc_id` parses (so existing schemas still deserialize) but is rejected
152        // here: honoring a non-pk `_id` needs the value at delete time, which the
153        // pk-keyed tombstone path can't supply. Tracked as a follow-up feature.
154        if yaml.doc_id.is_some() {
155            return Err(ConversionError::DocIdUnsupported);
156        }
157        let doc_id = yaml.doc_id.map(ColumnName::try_new).transpose()?;
158        let soft_delete = yaml
159            .soft_delete
160            .map(conversion::convert_soft_delete)
161            .transpose()?;
162        let filters = conversion::convert_filters_opt(yaml.filters)?;
163        let fields = yaml
164            .fields
165            .into_iter()
166            .map(conversion::convert_field)
167            .collect::<Result<_, _>>()?;
168
169        Ok(schema_core::IndexSchema {
170            version: yaml.version,
171            table,
172            db_schema,
173            primary_key,
174            doc_id,
175            soft_delete,
176            filters,
177            fields,
178        })
179    }
180}