json_schema_dsl/
token.rs

1use indexmap::IndexMap;
2use logos::{Lexer, Logos};
3use serde::{Deserialize, Serialize};
4use serde_json::{json, Value};
5use std::str::FromStr;
6
/// Type names that are emitted as a JSON `string` carrying a `format` keyword.
///
/// Matching is case-sensitive, which is why the common upper-case spellings
/// (`UUID`, `JSON`, ...) are listed next to their capitalised forms.
const FORMAT_NAMES: &[&str] = &[
    "Date",
    "Time",
    "DateTime",
    "Timestamp",
    "Interval",
    "Duration",
    "Email",
    "Ipv4",
    "Ipv6",
    "Uri",
    "Hostname",
    "Domainname",
    "Uuid",
    "UUID",
    "Ulid",
    "ULID",
    "Json",
    "JSON",
    "Xml",
    "XML",
    "Color",
    "Isbn",
    "ISBN",
    "Path",
    "S3Path",
    "SemVer",
    "PhoneNumber",
    "CreditCard",
    "Currency",
    "MimeType",
    "Language",
    "Locale",
    "Base64",
];

/// Field-name fragments that suggest a JSON `number` when a field has no explicit type.
const NUMBER_NAMES: &[&str] = &[
    "price", "rate", "height", "width", "weight", "amount", "total", "percent", "ratio",
];

/// Field-name fragments that suggest a JSON `integer` when a field has no explicit type.
const INTEGER_NAMES: &[&str] = &[
    "age", "year", "count", "size", "length", "delay", "time", "duration", "level", "index",
    "position", "order", "limit", "offset", "page", "quantity", "capacity", "interval",
    "retries", "max", "min",
];

/// Field-name prefixes that suggest a JSON `boolean` when a field has no explicit type.
const BOOLEAN_NAMES: &[&str] = &[
    "has", "is", "does", "allow", "should", "if", "can", "may", "will", "must",
];
53
54fn array_type_callback(lex: &mut Lexer<Token>) -> (String, String, String) {
55    let complex_type = lex.slice().to_owned();
56    let offset = complex_type.find('<').unwrap();
57    let container_type = complex_type[..offset].to_owned();
58    let end_offset = complex_type.rfind('>').unwrap();
59    let item_type = complex_type[offset + 1..end_offset].to_owned();
60    let range_type = complex_type[end_offset + 1..].trim();
61    (container_type, item_type, range_type.to_string())
62}
63
64#[derive(Debug, Logos)]
65#[logos(skip r"[ \t\n\f]+")] // Ignore this regex pattern between tokens
66pub enum ArrayToken {
67    #[token("(")]
68    ParenOpen,
69
70    #[token(")")]
71    ParenClose,
72
73    #[token("[")]
74    BracketOpen,
75
76    #[token("]")]
77    BracketClose,
78
79    #[token(",")]
80    Comma,
81
82    #[regex(r"-?(?:0|[1-9]\d*)?",
83    |lex| lex.slice().parse::<i64>().unwrap())]
84    Integer(i64),
85
86    #[regex(r"-?(?:0|[1-9]\d*)(?:\.\d+)(?:[eE][+-]?\d+)?",
87    |lex| lex.slice().parse::<f64>().unwrap())]
88    Number(f64),
89
90    #[regex(r#"'([^']*)'"#, |lex| lex.slice().to_owned())]
91    Text1(String),
92
93    #[regex(r#""([^"\\]|\\["\\bnfrt]|u[a-fA-F0-9]{4})*""#, |lex| lex.slice().to_owned())]
94    Text2(String),
95}
96
97#[derive(Debug, Logos)]
98#[logos(skip r"[ \t\n\f]+")] // Ignore this regex pattern between tokens
99pub enum Token {
100    #[token("{")]
101    BraceOpen,
102
103    #[token("}")]
104    BraceClose,
105
106    #[token(":")]
107    Colon,
108
109    #[token(",")]
110    Comma,
111
112    #[token("...")]
113    Ellipsis,
114
115    #[regex(r#"(integer|int|long|bigint|number|float|double|real|decimal)\([^)]+\)"#,
116    |lex| lex.slice().to_owned())]
117    RangeType(String),
118
119    #[regex(r#"(string|bytes|varchar|String|Text)\([^)]+\)"#,
120    |lex| lex.slice().to_owned())]
121    StringLengthType(String),
122
123    #[regex(r#"\[[^]]+\]"#, |lex| lex.slice().to_owned())]
124    TupleType(String),
125
126    #[regex("integer|Integer|int|int32|int64|int96|int128|long|bigint|serial|bigserial|number|Number|float|double|real|decimal|boolean|Boolean|bool|string|bytes|bytea|varchar|String|Text",
127    |lex| lex.slice().to_owned())]
128    PrimitiveType(String),
129
130    #[regex("Ulid|ULID|Color|Isbn|ISBN|Path|S3Path|SemVer|PhoneNumber|CreditCard|Currency|MimeType|Language|Locale|Base64",
131    |lex| lex.slice().to_owned())]
132    ExtraType(String),
133
134    #[regex("Date|Time|DateTime|Duration|Email|Ipv4|Ipv6|Uri|Hostname|Domainname|Uuid|UUID",
135    |lex| lex.slice().to_owned())]
136    FormatType(String),
137
138    #[regex(r#"(List|list|Set|set|Array|array)<(integer|Integer|int|long|bigint|number|Number|float|double|real|decimal|boolean|Boolean|bool|string|bytes|bytea|varchar|String|Text|Date|Time|DateTime|Timestamp|Interval|Duration|Email|Ipv4|Ipv6|Uri|Hostname|Domainname|Uuid|UUID|Ulid|ULID|Color|Isbn|ISBN|Path|S3Path|SemVer|PhoneNumber|CreditCard|Currency|MimeType|Language|Locale|Base64)>(\([^)]+\))?"#,
139        array_type_callback
140    )]
141    ArrayType((String, String, String)),
142
143    #[regex("(integer|Integer|int|int32|int64|int96|int128|long|bigint|number|Number|float|double|real|decimal|boolean|Boolean|bool|string|bytes|bytea|varchar|String|Text|Date|Time|DateTime|Timestamp|Interval|Duration|Email|Ipv4|Ipv6|Uri|Hostname|Uuid|UUID)([|](integer|Integer|int|int32|int64|int96|int128|long|bigint|number|Number|float|double|real|decimal|boolean|Boolean|bool|string|bytes|bytea|varchar|String|Text|Date|Time|DateTime|Timestamp|Interval|Duration|Email|Ipv4|Ipv6|Uri|Hostname|Domainname|Uuid|UUID|Ulid|ULID|Color|Isbn|ISBN|Path|S3Path|SemVer|PhoneNumber|CreditCard|Currency|MimeType|Language|Locale|Base64))+",
144    |lex| lex.slice().to_owned())]
145    AnyOf(String),
146
147    #[regex(r#"enum\([^)]+\)"#, |lex| lex.slice().to_owned())]
148    EnumType(String),
149
150    #[regex(r#"regex\(((['][^']+['])|([\"][^\"]+[\"]))\)"#, |lex| lex.slice().to_owned())]
151    RegexType(String),
152
153    #[regex("[A-Z][a-zA-Z0-9_]+", |lex| lex.slice().to_owned())]
154    ObjectName(String),
155
156    #[regex(r#"[a-z0-9][a-zA-Z0-9_]+[\?]?"#, |lex| lex.slice().to_owned())]
157    FieldName(String),
158}
159
160#[derive(Serialize, Deserialize, Debug)]
161pub struct JsonSchema {
162    #[serde(skip_serializing_if = "Option::is_none")]
163    #[serde(rename = "$schema")]
164    pub version: Option<String>,
165    #[serde(skip_serializing_if = "Option::is_none")]
166    #[serde(rename = "$id")]
167    pub id: Option<String>,
168    pub title: String,
169    #[serde(skip_serializing_if = "Option::is_none")]
170    pub description: Option<String>,
171    #[serde(rename = "type")]
172    pub type_name: String,
173    #[serde(skip_serializing_if = "Option::is_none")]
174    pub properties: Option<IndexMap<String, JsonSchemaEntry>>,
175    #[serde(skip_serializing_if = "Option::is_none")]
176    pub required: Option<Vec<String>>,
177    #[serde(skip_serializing_if = "Option::is_none")]
178    #[serde(rename = "additionalProperties")]
179    pub additional_properties: Option<bool>,
180}
181
182#[derive(Serialize, Deserialize, Debug, Default, Clone)]
183pub struct JsonSchemaEntry {
184    #[serde(skip)]
185    pub name: String,
186    #[serde(skip)]
187    pub optional: bool,
188    #[serde(rename = "type")]
189    #[serde(skip_serializing_if = "String::is_empty")]
190    pub type_name: String,
191    #[serde(skip_serializing_if = "Option::is_none")]
192    pub description: Option<String>,
193    #[serde(skip_serializing_if = "Option::is_none")]
194    pub format: Option<String>,
195    #[serde(skip_serializing_if = "Option::is_none")]
196    pub pattern: Option<String>,
197    #[serde(skip_serializing_if = "Option::is_none")]
198    #[serde(rename = "anyOf")]
199    pub any_of: Option<Vec<JsonSchemaEntry>>,
200    #[serde(skip_serializing_if = "Option::is_none")]
201    pub items: Option<Value>,
202    #[serde(skip_serializing_if = "Option::is_none")]
203    #[serde(rename = "minItems")]
204    pub min_items: Option<u32>,
205    #[serde(skip_serializing_if = "Option::is_none")]
206    #[serde(rename = "maxItems")]
207    pub max_items: Option<u32>,
208    #[serde(skip_serializing_if = "Option::is_none")]
209    #[serde(rename = "enum")]
210    pub enums: Option<Vec<Value>>,
211    #[serde(skip_serializing_if = "Option::is_none")]
212    pub minimum: Option<Value>,
213    #[serde(skip_serializing_if = "Option::is_none")]
214    pub maximum: Option<Value>,
215    #[serde(skip_serializing_if = "Option::is_none")]
216    #[serde(rename = "minLength")]
217    pub min_length: Option<u32>,
218    #[serde(skip_serializing_if = "Option::is_none")]
219    #[serde(rename = "maxLength")]
220    pub max_length: Option<u32>,
221    #[serde(skip_serializing_if = "Option::is_none")]
222    #[serde(rename = "uniqueItems")]
223    pub unique_items: Option<bool>,
224    #[serde(skip_serializing_if = "Option::is_none")]
225    pub properties: Option<IndexMap<String, JsonSchemaEntry>>,
226    #[serde(skip_serializing_if = "Option::is_none")]
227    pub required: Option<Vec<String>>,
228    #[serde(skip_serializing_if = "Option::is_none")]
229    pub additional_properties: Option<bool>,
230}
231
232impl JsonSchema {
233    pub fn new(title: &str) -> Self {
234        JsonSchema {
235            version: None,
236            id: None,
237            title: title.to_string(),
238            description: None,
239            type_name: "object".to_owned(),
240            properties: Some(IndexMap::new()),
241            required: None,
242            additional_properties: None,
243        }
244    }
245    pub fn version_2020(title: &str) -> Self {
246        JsonSchema {
247            version: Some("https://json-schema.org/draft/2020-12/schema".to_owned()),
248            id: None,
249            title: title.to_string(),
250            description: None,
251            type_name: "object".to_owned(),
252            properties: Some(IndexMap::new()),
253            required: None,
254            additional_properties: None,
255        }
256    }
257}
258
259impl JsonSchemaEntry {
260    pub fn new(type_name: &str) -> Self {
261        JsonSchemaEntry {
262            type_name: convert_to_json_type(type_name),
263            ..Default::default()
264        }
265    }
266
267    pub fn format(format_name: &str) -> Self {
268        JsonSchemaEntry {
269            type_name: "string".to_string(),
270            format: Some(convert_to_json_format(format_name)),
271            ..Default::default()
272        }
273    }
274
275    pub fn revise(&mut self) {
276        if self.type_name.is_empty() {
277            let field_name = &self.name;
278            if field_name.contains("time") || field_name.contains("_at") {
279                self.type_name = "string".to_owned();
280                self.format = Some("date-time".to_owned());
281            } else if field_name.contains("date") {
282                self.type_name = "string".to_owned();
283                self.format = Some("date".to_owned());
284            } else if BOOLEAN_NAMES
285                .iter()
286                .any(|&item| field_name.starts_with(item))
287            {
288                self.type_name = "boolean".to_owned();
289            } else if NUMBER_NAMES.iter().any(|&item| field_name.contains(item)) {
290                self.type_name = "number".to_owned();
291            } else if INTEGER_NAMES.iter().any(|&item| field_name.contains(item)) {
292                self.type_name = "integer".to_owned();
293            } else {
294                self.type_name = "string".to_owned();
295            }
296        }
297    }
298
299    pub fn add_entry(&mut self, name: &str, type_name: &str) {
300        if self.properties.is_none() {
301            self.properties = Some(IndexMap::new());
302        }
303        if let Some(ref mut properties) = self.properties {
304            let entry = JsonSchemaEntry {
305                type_name: type_name.to_string(),
306                ..Default::default()
307            };
308            properties.insert(name.to_string(), entry);
309        }
310    }
311}
312
313pub fn to_json_schema(struct_text: &str) -> Result<JsonSchema, String> {
314    let offset = struct_text.find('{').unwrap();
315    let title = struct_text[0..offset].trim();
316    let lexer_text = &struct_text[offset..].trim().trim_matches(&['{', '}']);
317    let mut json_schema = JsonSchema::version_2020(title);
318    let mut entries: IndexMap<String, JsonSchemaEntry> = IndexMap::new();
319    let mut entry: JsonSchemaEntry = Default::default();
320    let mut parent_entries: Option<IndexMap<String, JsonSchemaEntry>> = None;
321    let mut parent_entry: Option<JsonSchemaEntry> = None;
322    let mut lexer = Token::lexer(lexer_text);
323    while let Some(result) = lexer.next() {
324        if let Ok(token) = result {
325            match token {
326                Token::BraceOpen => {
327                    //backup entry and entries
328                    parent_entry = Some(entry.clone());
329                    parent_entries = Some(entries.clone());
330                    // reset
331                    entry = Default::default();
332                    entries = IndexMap::new();
333                }
334                Token::BraceClose => {
335                    if !entry.name.is_empty() {
336                        entry.revise();
337                        entries.insert(entry.name.clone(), entry.clone());
338                    }
339                    let additional_properties = entry.additional_properties.clone();
340                    // bring back parent
341                    entry = parent_entry.clone().unwrap();
342                    entry.required = find_required_fields(&entries);
343                    entry.properties = Some(entries.clone());
344                    entry.additional_properties = additional_properties;
345                    entries = parent_entries.clone().unwrap();
346                    entries.insert(entry.name.clone(), entry.clone());
347                    // reset
348                    entry = Default::default();
349                    parent_entries = None;
350                    parent_entry = None;
351                }
352                Token::Colon => {}
353                Token::Comma => {
354                    if !entry.name.is_empty() {
355                        entry.revise();
356                        entries.insert(entry.name.clone(), entry.clone());
357                    }
358                    entry = Default::default();
359                }
360                Token::PrimitiveType(type_name) => {
361                    entry.type_name = convert_to_json_type(&type_name);
362                }
363                Token::ExtraType(type_name) => {
364                    entry.type_name = convert_to_json_type(&type_name);
365                }
366                Token::FormatType(format_name) => {
367                    entry.type_name = "string".to_owned();
368                    entry.format = Some(convert_to_json_format(&format_name));
369                }
370                Token::ArrayType(array) => {
371                    entry.type_name = "array".to_owned();
372                    let container_type = array.0;
373                    if container_type.to_lowercase().starts_with("set") {
374                        entry.unique_items = Some(true);
375                    }
376                    let item_type = array.1;
377                    let item_entry = if FORMAT_NAMES.contains(&item_type.as_str()) {
378                        let format = convert_to_json_format(&item_type);
379                        json!({
380                            "type": "string",
381                            "format": format
382                        })
383                    } else {
384                        json!({
385                            "type": "string"
386                        })
387                    };
388                    let range_type = array.2;
389                    if range_type.starts_with('(') {
390                        let items_text = range_type.trim_matches(&['(', ')']).trim();
391                        if !items_text.contains(",") {
392                            entry.min_items = Some(items_text.parse().unwrap());
393                            entry.max_items = Some(items_text.parse().unwrap());
394                        } else if items_text.starts_with(",") {
395                            //maxItems
396                            entry.max_items = Some(items_text[1..].parse().unwrap());
397                        } else if items_text.ends_with(",") {
398                            //minItems
399                            entry.min_items =
400                                Some(items_text[..items_text.len() - 1].parse().unwrap());
401                        } else {
402                            let items = items_text.split(',').collect::<Vec<&str>>();
403                            if items.len() == 2 {
404                                entry.min_items = Some(items[0].parse().unwrap());
405                                entry.max_items = Some(items[1].parse().unwrap());
406                            }
407                        }
408                    }
409                    entry.items = Some(item_entry);
410                }
411                Token::AnyOf(any_of) => {
412                    let mut types: Vec<JsonSchemaEntry> = vec![];
413                    for type_name in any_of.split('|') {
414                        types.push(JsonSchemaEntry::new(type_name));
415                    }
416                    entry.any_of = Some(types);
417                }
418                Token::EnumType(enum_type) => {
419                    let items_text = enum_type[4..].trim();
420                    let items = parse_array(items_text);
421                    if !items.is_empty() {
422                        entry.enums = Some(items);
423                    }
424                }
425                Token::TupleType(tuple_type) => {
426                    entry.type_name = "array".to_owned();
427                    let items_text = tuple_type.trim_matches(&['[', ']']).trim();
428                    let values = items_text
429                        .split(',')
430                        .map(|item| {
431                            json!({
432                                "type": convert_to_json_type(item.trim())
433                            })
434                        })
435                        .collect::<Vec<Value>>();
436                    entry.items = Some(Value::from(values));
437                }
438                Token::RangeType(range_type) => {
439                    let offset = range_type.find('(').unwrap();
440                    let type_name = convert_to_json_type(range_type[..offset].trim());
441                    entry.type_name = type_name;
442                    let items_text = range_type[offset..].trim_matches(&['(', ')']).trim();
443                    if items_text.starts_with(",") {
444                        //maximum
445                        entry.maximum = Some(Value::from_str(items_text[1..].trim()).unwrap());
446                    } else if items_text.ends_with(",") {
447                        //minimum
448                        entry.minimum = Some(
449                            Value::from_str(items_text[..items_text.len() - 1].trim()).unwrap(),
450                        );
451                    } else {
452                        let items = items_text.split(',').collect::<Vec<&str>>();
453                        if items.len() == 2 {
454                            entry.minimum = Some(Value::from_str(items[0].trim()).unwrap());
455                            entry.maximum = Some(Value::from_str(items[1].trim()).unwrap());
456                        }
457                    }
458                }
459                Token::StringLengthType(length_type) => {
460                    let offset = length_type.find('(').unwrap();
461                    let type_name = length_type[0..offset].trim().to_lowercase();
462                    entry.type_name = "string".to_owned();
463                    let items_text = length_type[offset..].trim_matches(&['(', ')']).trim();
464                    if !items_text.contains(',') {
465                        let length = items_text.parse().unwrap();
466                        if type_name == "varchar" {
467                            entry.max_length = Some(length);
468                        } else {
469                            entry.min_length = Some(length);
470                            entry.max_length = Some(length);
471                        }
472                    } else if items_text.starts_with(",") {
473                        //maxLength
474                        entry.max_length = Some(items_text[1..].trim().parse().unwrap());
475                    } else if items_text.ends_with(",") {
476                        //minLength
477                        entry.min_length = Some(items_text[1..].trim().parse().unwrap());
478                    } else {
479                        let items = items_text.split(',').collect::<Vec<&str>>();
480                        if items.len() == 2 {
481                            entry.min_length = Some(items[0].trim().parse().unwrap());
482                            entry.max_length = Some(items[1].trim().parse().unwrap());
483                        }
484                    }
485                }
486                Token::RegexType(regex_type) => {
487                    let pattern = regex_type[5..]
488                        .trim()
489                        .trim_matches(&['(', ')'])
490                        .trim()
491                        .trim_matches(&['"', '\'']);
492                    entry.pattern = Some(pattern.to_string());
493                }
494                Token::ObjectName(_object_name) => {
495                    entry.type_name = "object".to_string();
496                }
497                Token::FieldName(field_name) => {
498                    if field_name.ends_with('?') {
499                        entry.optional = true;
500                        entry.name = field_name[..field_name.len() - 1].to_string();
501                    } else {
502                        entry.name = field_name;
503                    }
504                }
505                Token::Ellipsis => {
506                    entry.additional_properties = Some(true);
507                }
508            }
509        } else {
510            return Err("Failed to parse struct".to_string());
511        }
512    }
513    if !entry.name.is_empty() {
514        entry.revise();
515        entries.insert(entry.name.clone(), entry.clone());
516    } else if entry.additional_properties.is_some() {
517        json_schema.additional_properties = entry.additional_properties.clone();
518    }
519    json_schema.required = find_required_fields(&entries);
520    json_schema.properties = Some(entries);
521
522    Ok(json_schema)
523}
524
525fn parse_array(text: &str) -> Vec<Value> {
526    let mut lexer = ArrayToken::lexer(text);
527    let mut items: Vec<Value> = vec![];
528    while let Some(result) = lexer.next() {
529        if let Ok(token) = result {
530            match token {
531                ArrayToken::Integer(value) => {
532                    items.push(Value::from(value));
533                }
534                ArrayToken::Number(value) => {
535                    items.push(Value::from(value));
536                }
537                ArrayToken::Text1(value) => {
538                    let temp = value.trim_matches('\'').replace("\"", "\\\"");
539                    let text2 = format!("\"{}\"", temp);
540                    items.push(Value::from_str(&text2).unwrap());
541                }
542                ArrayToken::Text2(value) => {
543                    items.push(Value::from_str(&value).unwrap());
544                }
545                _ => {}
546            }
547        }
548    }
549    items
550}
551
/// Maps a DSL type name (case-insensitively) to a JSON Schema `type` value.
///
/// Every alias the lexer accepts — sized integers, string-like extra types and
/// named string formats — is reduced to one of `string`, `integer`, `number`
/// or `boolean`. Any other name is returned lower-cased and otherwise
/// unchanged (e.g. `Integer` -> `integer`, `object` -> `object`).
fn convert_to_json_type(type_name: &str) -> String {
    let name = type_name.to_lowercase();
    let json_type = match name.as_str() {
        "varchar" | "text" | "bytes" | "bytea" => "string",
        "isbn" | "ulid" | "color" | "path" | "s3path" | "semver" | "phonenumber"
        | "creditcard" | "currency" | "mimetype" | "language" | "locale" | "base64" => "string",
        // Named formats are strings too; the `format` keyword is added by the caller.
        "date" | "time" | "datetime" | "timestamp" | "interval" | "duration" | "email"
        | "ipv4" | "ipv6" | "uri" | "hostname" | "domainname" | "uuid" | "json" | "xml" => {
            "string"
        }
        "int" | "int32" | "int64" | "int96" | "int128" | "long" | "bigint" | "serial"
        | "bigserial" => "integer",
        "float" | "double" | "real" | "decimal" => "number",
        "bool" => "boolean",
        _ => return name,
    };
    json_type.to_string()
}
564
/// Maps a DSL format name (case-insensitively) to the value used for the
/// JSON Schema `format` keyword.
///
/// A handful of aliases are rewritten; every other name is returned
/// lower-cased and otherwise unchanged.
fn convert_to_json_format(format_name: &str) -> String {
    let lowered = format_name.to_lowercase();
    let mapped = match lowered.as_str() {
        "datetime" | "timestamp" => "date-time",
        "interval" => "duration",
        "domainname" => "hostname",
        "json" | "xml" => "string",
        _ => return lowered,
    };
    mapped.to_owned()
}
575
576fn find_required_fields(entries: &IndexMap<String, JsonSchemaEntry>) -> Option<Vec<String>> {
577    let mut required: Vec<String> = vec![];
578    for entry in entries {
579        if !entry.1.optional {
580            required.push(entry.0.clone());
581        }
582    }
583    if !required.is_empty() {
584        Some(required)
585    } else {
586        None
587    }
588}
589
#[cfg(test)]
mod tests {
    use super::*;

    /// Every token of a simple definition must be recognised by the lexer.
    #[test]
    fn test_lexer() {
        let text = r#"User { id: int, nick: string(6,64) }"#;
        let tokens: Vec<_> = Token::lexer(text).collect();
        for token in &tokens {
            println!("{:?}", token);
        }
        assert!(!tokens.is_empty());
        assert!(tokens.iter().all(|token| token.is_ok()));
    }

    /// Scalar and bounded-array fields end up with the expected keywords.
    #[test]
    fn test_parse() {
        let text = r#"User { id: int, tags: list<string>(2,4) }"#;
        let json_schema = to_json_schema(text).unwrap();
        println!("{}", serde_json::to_string_pretty(&json_schema).unwrap());

        assert_eq!(json_schema.title, "User");
        assert_eq!(
            json_schema.required,
            Some(vec!["id".to_string(), "tags".to_string()])
        );
        let properties = json_schema.properties.as_ref().unwrap();
        assert_eq!(properties["id"].type_name, "integer");
        let tags = &properties["tags"];
        assert_eq!(tags.type_name, "array");
        assert_eq!(tags.min_items, Some(2));
        assert_eq!(tags.max_items, Some(4));
        assert_eq!(tags.unique_items, None);
        assert_eq!(tags.items, Some(json!({ "type": "string" })));
    }

    /// A nested object keeps its own properties, required list and `...` flag.
    #[test]
    fn test_parse_nested() {
        let text = r#"User { id: int, contact: Contact { phone: string, email: string, ... }, status: enum('First',"Second", 1, 2) }"#;
        let json_schema = to_json_schema(text).unwrap();
        println!("{}", serde_json::to_string_pretty(&json_schema).unwrap());

        assert_eq!(json_schema.additional_properties, None);
        let properties = json_schema.properties.as_ref().unwrap();
        assert_eq!(properties.len(), 3);

        let contact = &properties["contact"];
        assert_eq!(contact.type_name, "object");
        assert_eq!(contact.additional_properties, Some(true));
        assert_eq!(
            contact.required,
            Some(vec!["phone".to_string(), "email".to_string()])
        );
        assert_eq!(contact.properties.as_ref().unwrap().len(), 2);

        let status = &properties["status"];
        assert_eq!(
            status.enums,
            Some(vec![json!("First"), json!("Second"), json!(1), json!(2)])
        );
    }

    /// Hand-built schemas serialize with the JSON Schema keyword names.
    #[test]
    fn test_to_json() {
        let mut json_schema = JsonSchema::version_2020("User");
        let mut entries: IndexMap<String, JsonSchemaEntry> = IndexMap::new();
        entries.insert("nick".to_owned(), JsonSchemaEntry::new("string"));
        entries.insert("email".to_owned(), JsonSchemaEntry::format("Email"));
        json_schema.properties = Some(entries);
        println!("{}", serde_json::to_string_pretty(&json_schema).unwrap());

        let value = serde_json::to_value(&json_schema).unwrap();
        assert_eq!(
            value["$schema"],
            "https://json-schema.org/draft/2020-12/schema"
        );
        assert_eq!(value["title"], "User");
        assert_eq!(value["type"], "object");
        assert_eq!(value["properties"]["nick"]["type"], "string");
        assert_eq!(value["properties"]["email"]["type"], "string");
        assert_eq!(value["properties"]["email"]["format"], "email");
        assert!(value.get("required").is_none());
    }

    /// Literal lists keep value order and distinguish integers from decimals.
    #[test]
    fn test_array() {
        let text = r#"['First',"Second", -1, 2, 3.0]"#;
        let array = parse_array(text);
        println!("{}", serde_json::to_string_pretty(&array).unwrap());

        assert_eq!(
            array,
            vec![
                json!("First"),
                json!("Second"),
                json!(-1),
                json!(2),
                json!(3.0)
            ]
        );
    }
}