1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
use flate2::write::GzEncoder;
use flate2::Compression;
use regex::Regex;
// use std::fs::File;
use std::io::prelude::*;
// use std::io::BufReader;

// fn main() -> std::io::Result<()> {
//     let test_files = 4;

//     for i in 1..=test_files {
//         let target_num: String;
//         match i {
//             _ if i < 10 => target_num = "0".to_owned() + &i.to_owned().to_string(),
//             _ => target_num = i.to_owned().to_string(),
//         }
//         println!(
//             "================== Test: {}.mws ==================",
//             target_num
//         );

//         let file = format!("./test/{}.mws", target_num);
//         let f = File::open(file)?;
//         let reader = BufReader::new(f);

//         for line in reader.lines() {
//             let line = line?;
//             parse_line(&line);
//         }
//     }

//     Ok(())
// }


/// Splits `text` on every match of `r`, keeping the matched separators.
///
/// The returned vector contains, in source order, the non-empty slices found
/// between matches interleaved with the matched slices themselves. All
/// slices borrow from `text`.
fn split_keep<'a>(r: &Regex, text: &'a str) -> Vec<&'a str> {
    let mut result = Vec::new();
    // Byte offset just past the previous match (0 before the first match).
    let mut last = 0;
    // `&Regex` only implements `str`'s `Pattern` trait behind the regex
    // crate's nightly-only `pattern` feature, so `text.match_indices(r)` does
    // not build on stable. `find_iter` yields the same non-overlapping matches.
    for found in r.find_iter(text) {
        if last != found.start() {
            // Text between the previous match and this one.
            result.push(&text[last..found.start()]);
        }
        result.push(found.as_str());
        last = found.end();
    }
    // Trailing text after the final match, if any.
    if last < text.len() {
        result.push(&text[last..]);
    }
    result
}

pub fn parse_line(line: &str) {
    let split_re = Regex::new(r"([ ,:;]+)").expect("Invalid regex");
    let tokens = split_keep(&split_re, line)
        .into_iter()
        .map(|n| n.trim())
        .filter(|n| !n.is_empty())
        .collect::<Vec<_>>();
    let mut tokenized_line: Vec<String> = vec![];
    let mut minimized_line: Vec<String> = vec![];
    for token in tokens {
        let token = tokenizer(token);
        println!("{} = \"{}\"", token.token_type, token.value);
        if token.token_type == "unknown" {
            println!("================== FAILED! ==================");
            println!("Unknown token: {}", token.value);
            println!("================== FAILED! ==================");
        } else {
            tokenized_line.push(token.token_type);
            minimized_line.push(token.value);
        }
    }
    println!("{:?}", tokenized_line);
    println!("{:?}", minimized_line);
    println!("{:?}", compress_string(&minimized_line.join(" ")));
}

/// A classified lexical token.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Token {
    /// Name of the token class, e.g. `"keyword"`, `"identifier"` or `"unknown"`.
    token_type: String,
    /// The token text exactly as it was passed to the classifier.
    value: String,
}

/// Classifies one already-split token and returns it as a [`Token`].
///
/// Rules are tried in priority order and the first hit wins: exact reserved
/// words (keywords, `true`/`false`, `null`, type names), then the operator,
/// punctuator, number, float, string, whitespace, comment and identifier
/// patterns. A token matching none of them gets the type `"unknown"`.
///
/// Reserved words are compared against the whole token. The previous
/// unanchored regexes matched substrings, so identifiers such as `letter`,
/// `print` or `nullable` were reported as keyword / type / null.
///
/// # Panics
///
/// Panics if one of the hard-coded patterns fails to compile.
fn tokenizer(token: &str) -> Token {
    // Pattern-based classes in priority order. The operator, punctuator,
    // string and comment patterns are left unanchored, as before: a token
    // only has to contain a match to fall into that class.
    // NOTE(review): the operator pattern matches a literal backslash but
    // neither `*` nor `/` — confirm this is intended.
    const PATTERN_RULES: [(&str, &str); 9] = [
        (r"=|\+|-|\\", "operator"),
        (r"\(|\)|\{|\}|;|:", "punctuator"),
        (r"^\d+$", "number"),
        (r"^\d+\.\d+$", "float"),
        (r#""[^"]*""#, "string"),
        (r"^\s+$", "whitespace"),
        (r"//.*", "comment"),
        (r"/\*.*\*/", "comment"),
        (r"^[a-zA-Z_][a-zA-Z0-9_]*$", "identifier"),
    ];

    let token_type = match token {
        "let" | "pub" => "keyword",
        "true" | "false" => "bool",
        // `null` is always reported as the literal; the old duplicate "type"
        // rule for it sat behind this one and could never fire.
        "null" => "null",
        "float" | "int" | "string" | "bool" | "object" | "array" => "type",
        // Patterns are compiled lazily, only until the first one matches.
        _ => PATTERN_RULES
            .iter()
            .find(|(pattern, _)| {
                Regex::new(pattern)
                    .expect("hard-coded token pattern must be valid")
                    .is_match(token)
            })
            .map_or("unknown", |&(_, kind)| kind),
    };

    Token {
        token_type: token_type.to_string(),
        value: token.to_string(),
    }
}

/// Gzip-compresses `input` at the default compression level and returns the
/// compressed bytes.
///
/// # Panics
///
/// Panics if writing to, or finishing, the in-memory encoder returns an error.
fn compress_string(input: &str) -> Vec<u8> {
    let sink: Vec<u8> = Vec::new();
    let mut gz = GzEncoder::new(sink, Compression::default());

    let payload = input.as_bytes();
    Write::write_all(&mut gz, payload).unwrap();

    let compressed = gz.finish();
    compressed.unwrap()
}