fbxscii 0.1.0

ASCII FBX Parser
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
use std::{collections::VecDeque, io::BufRead};

/// A lexical token produced by [`Tokenizer`].
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    /// A `{` character.
    OpenBrace,
    /// A `}` character.
    CloseBrace,
    /// A value: either the contents of a double-quoted string (without the
    /// surrounding quotes) or a bare run of characters that was not followed
    /// by a colon.
    Data(String),
    /// A `,` character.
    Comma,
    /// A bare run of characters that was followed by a `:`. The colon itself
    /// is not part of the stored text.
    Key(String),
}

/// Renders a token in a compact, human-readable form: punctuation as itself,
/// data wrapped in double quotes, and keys followed by a colon.
impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Single-character punctuation is emitted verbatim.
            Token::OpenBrace => f.write_str("{"),
            Token::CloseBrace => f.write_str("}"),
            Token::Comma => f.write_str(","),
            // Data is always shown quoted, whether or not it was quoted in the input.
            Token::Data(text) => {
                f.write_str("\"")?;
                f.write_str(text)?;
                f.write_str("\"")
            }
            // Keys get their trailing colon back.
            Token::Key(name) => {
                f.write_str(name)?;
                f.write_str(":")
            }
        }
    }
}

/// A [`Token`] together with the position in the input where it starts.
#[derive(Debug, PartialEq)]
pub struct TokenData {
    /// The token itself.
    pub data: Token,
    /// Zero-based index of the line on which the token starts.
    pub starting_line_number: usize,
    /// Zero-based index, counted in `char`s, of the token's first character
    /// within its starting line. For quoted data this is the opening quote.
    pub starting_char_index: usize,
}

impl std::fmt::Display for TokenData {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "pos:({}:{}) {}",
            self.starting_line_number, self.starting_char_index, self.data
        )
    }
}

/// Errors that [`Tokenizer`] can yield while scanning its input.
///
/// The type implements [`std::fmt::Display`] and [`std::error::Error`] so it
/// can be propagated with `?` into `Box<dyn std::error::Error>` and similar
/// error containers.
#[derive(Debug, PartialEq, Clone)]
pub enum TokenizerError {
    /// Reading a line from the underlying reader failed. The payload is the
    /// text of the originating I/O error.
    ReadError(String),
}

impl std::fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenizerError::ReadError(reason) => {
                write!(f, "failed to read FBX input: {}", reason)
            }
        }
    }
}

impl std::error::Error for TokenizerError {}

/// Streaming tokenizer over a line-oriented reader.
///
/// Tokens are produced through the `Iterator` implementation; input is pulled
/// from `reader` one line at a time as needed.
pub struct Tokenizer<R: BufRead> {
    /// Source of input lines.
    reader: R,
    /// Lines that have been read from `reader` and decoded into `char`s but
    /// not yet fully scanned. The front entry is the line being scanned.
    char_buffer_queue: VecDeque<Vec<char>>,
    /// Number of lines read from `reader` so far; emitted tokens report
    /// `line_number - 1` as their zero-based line.
    line_number: usize,
    /// Index into the front line of the next character to examine.
    char_index: usize,
}

impl<R: BufRead> Tokenizer<R> {
    pub fn new(reader: R) -> Self {
        Self {
            reader,
            char_buffer_queue: VecDeque::new(),
            line_number: 0,
            char_index: 0,
        }
    }
}

impl<R: BufRead> Iterator for Tokenizer<R> {
    type Item = Result<TokenData, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        // Algorithm Description:
        // We need to read each token from the input stream one by one.
        // Since what we are reading is a stream of characters, it is more efficient to read line by line to save up on utf-8 validation time.
        // Technically, the FBX format is not utf-8, but rather ASCII, but operating on utf-8 gives us less headaches.
        // Each step, our goal is to start reading at char_index, assuming that all the previous characters were correctly read.

        // Load Line, if our buffer queue is empty
        if self.char_buffer_queue.is_empty() {
            let mut line = String::new();
            let result = self.reader.read_line(&mut line);
            // We are unable to read the line, so we return an error.
            if let Err(e) = result {
                return Some(Err(TokenizerError::ReadError(e.to_string())));
            }
            // If the line is empty, we are at the end of the file.
            if result.unwrap() == 0 {
                return None;
            }
            self.char_buffer_queue.push_back(line.chars().collect());
            self.line_number += 1;
            self.char_index = 0;
        }

        // We know that the line is not empty, so we can safely unwrap.
        // If our char_index is out of bounds or we have no data to read, we pop and try again.
        let line = self.char_buffer_queue.front().unwrap();
        if line.is_empty() || self.char_index >= line.len() {
            self.char_buffer_queue.pop_front();
            return self.next();
        }

        // Handle one-shot cases first.
        // These are cases where we can determine the token type and value without having to read further than the current character.
        let char = line[self.char_index];
        match char {
            '{' => {
                // We have found an open brace.
                self.char_index += 1;
                return Some(Ok(TokenData {
                    data: Token::OpenBrace,
                    starting_line_number: self.line_number - 1,
                    starting_char_index: self.char_index - 1,
                }));
            }
            '}' => {
                // We have found a close brace.
                self.char_index += 1;
                return Some(Ok(TokenData {
                    data: Token::CloseBrace,
                    starting_line_number: self.line_number - 1,
                    starting_char_index: self.char_index - 1,
                }));
            }
            ',' => {
                // We have found a comma.
                self.char_index += 1;
                return Some(Ok(TokenData {
                    data: Token::Comma,
                    starting_line_number: self.line_number - 1,
                    starting_char_index: self.char_index - 1,
                }));
            }
            ';' => {
                // The rest of this line is a comment. Pop the line from the queue
                self.char_buffer_queue.pop_front();
                return self.next();
            }
            '"' => {
                // If we are in double quotes, we keep reading until we find the next double quote.
                // We need to handle the case where the double quote is escaped and skip it.
                // The quotes also can go across lines, so we need to keep reading until we find the next double quote.

                // Remember where we discovered the double quote.
                let discover_index = self.char_index;
                let discover_line_number = self.line_number - 1;

                // This is a buffer to build up the data string.
                let mut char_buffer: Vec<char> = Vec::new();
                let mut buffer_ref: &Vec<char> = line;
                let mut line_char_buffer: Option<Vec<char>> = None;
                let mut lines_read = 0;
                loop {
                    self.char_index += 1;
                    if self.char_index >= buffer_ref.len() {
                        let mut new_line = String::new();
                        let result = self.reader.read_line(&mut new_line);
                        // We are unable to read the line, so we return an error.
                        if let Err(e) = result {
                            return Some(Err(TokenizerError::ReadError(e.to_string())));
                        }
                        // If the line is empty, we are at the end of the file.
                        if result.unwrap() == 0 {
                            break;
                        }
                        line_char_buffer = Some(new_line.chars().collect());
                        buffer_ref = line_char_buffer.as_ref().unwrap();
                        lines_read += 1;
                        self.char_index = 0;
                    }
                    if buffer_ref[self.char_index] == '"' {
                        break;
                    }
                    char_buffer.push(buffer_ref[self.char_index]);
                }

                if let Some(line_char_buffer) = line_char_buffer {
                    for _ in 0..lines_read {
                        self.char_buffer_queue.pop_front();
                    }
                    self.line_number += lines_read;
                    self.char_buffer_queue.push_back(line_char_buffer);
                }

                // We now have the data string, but we need to check if it is a key or data.
                // We do this by checking if the next non-whitespace, non-endline character is a colon.
                self.char_index += 1;
                return Some(Ok(TokenData {
                    data: Token::Data(char_buffer.iter().collect()),
                    starting_line_number: discover_line_number,
                    starting_char_index: discover_index,
                }));
            }
            ' ' | '\t' | '\n' | '\r' | '\0' | '\u{000C}' => {
                // If we find a whitespace, there is no data to start with.
                // @todo: we can save some checks by peeking ahead for more whitespace and adjusting the char_index accordingly.
                self.char_index += 1;
                return self.next();
            }
            _ => {}
        }

        // If we are here, what remains is either a key or data
        // If it is data, we need to read until we find a comma, whitespace/newline, or special character.
        // In the case that we find a whitespace/newline, we must read further to confirm there is no colon.
        // If there is a colon, it is a key, rather than data.

        let token_begin = self.char_index;
        let mut token_end = line.len();
        for i in token_begin..line.len() {
            let char = line[i];
            match char {
                '"' | '{' | '}' | ',' | ';' => {
                    // These characters denote the end of token as data.
                    // We set char_index to i to ensure the character is read as the next token.
                    self.char_index = i;
                    token_end = i;
                    return Some(Ok(TokenData {
                        data: Token::Data(line[token_begin..token_end].iter().collect()),
                        starting_line_number: self.line_number - 1,
                        starting_char_index: token_begin,
                    }));
                }
                ':' => {
                    // These characters denote the end of token as key.
                    // We set char_index to i + 1 to ensure the colon is not read as part of the next token.
                    self.char_index = i + 1;
                    if token_begin == i {
                        // If there is no characters before the colon, we ignore it.
                        return self.next();
                    }
                    // Return the key token up to the colon.
                    return Some(Ok(TokenData {
                        data: Token::Key(line[token_begin..i].iter().collect()),
                        starting_line_number: self.line_number - 1,
                        starting_char_index: token_begin,
                    }));
                }
                '\n' | '\r' | '\0' | '\u{000C}' => {
                    // These characters denote the end of the token data.
                    // We still need to check that there is no colon beyond.
                    token_end = i;
                    break;
                }
                c if c.is_whitespace() => {
                    // Whitespace denotes the end of the token data.
                    // We still need to check that there is no colon beyond.
                    token_end = i;
                    break;
                }
                _ => {}
            }
        }

        // whilst we know when the token ends, we do not know if it is a key or data.
        // we need to check if the next non-whitespace character is a colon.
        // keep loading lines until we find a non-whitespace character.
        let starting_line_number = self.line_number - 1;
        let key_or_data: String = line[token_begin..token_end].iter().collect();

        // we have already read the token at token_end, so we start at the next character.
        let mut read_start = token_end + 1;
        if read_start >= line.len() {
            // pop the line if we have read everything.
            self.char_buffer_queue.pop_front();
        }

        loop {
            if self.char_buffer_queue.is_empty() {
                let mut line = String::new();
                let result = self.reader.read_line(&mut line);
                if let Err(e) = result {
                    return Some(Err(TokenizerError::ReadError(e.to_string())));
                }
                // If the line is empty, we are at the end of the file.
                // In this case, we return what we have so far as a data token.
                if result.unwrap() == 0 {
                    return Some(Ok(TokenData {
                        data: Token::Data(key_or_data),
                        starting_line_number,
                        starting_char_index: token_begin,
                    }));
                }
                self.char_buffer_queue.push_back(line.chars().collect());
                self.line_number += 1;
                self.char_index = 0;
                read_start = 0;
            }
            let line = self.char_buffer_queue.front().unwrap();
            // Read Start is guarenteed to be in bounds.
            for (index, char) in line[read_start..].iter().enumerate() {
                match char {
                    c if c.is_whitespace() => {}
                    '\n' | '\r' | '\0' | '\u{000C}' => {}
                    ':' => {
                        // If we find a colon, we have a key.
                        // We set char_index to index + 1 to ensure the colon is not read as part of the next token.
                        self.char_index = read_start + index + 1;
                        return Some(Ok(TokenData {
                            data: Token::Key(key_or_data),
                            starting_line_number,
                            starting_char_index: token_begin,
                        }));
                    }
                    _ => {
                        // If we find a non-whitespace, non-endline character, we have a data token.
                        // We set char_index to index to ensure the character is read as the next token.
                        self.char_index = read_start + index;
                        return Some(Ok(TokenData {
                            data: Token::Data(key_or_data),
                            starting_line_number,
                            starting_char_index: token_begin,
                        }));
                    }
                }
            }
            // pop the line if we have read everything.
            self.char_buffer_queue.pop_front();
        }
    }
}

// Unit tests for the tokenizer. Each test feeds a small in-memory input and
// checks the exact tokens and zero-based (line, char) positions produced.
#[cfg(test)]
mod tests {
    use std::io::BufReader;

    use super::*;

    // Completely empty input yields no tokens.
    #[test]
    fn test_read_line_empty() {
        let input = "";
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(tokenizer.next(), None);
    }
    // A line containing only a newline yields no tokens.
    #[test]
    fn test_read_line_empty_line() {
        let input = "\n";
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(tokenizer.next(), None);
    }

    // A `;` comment line yields no tokens.
    #[test]
    fn test_read_line_comment() {
        let input = "; This is a comment\n";
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(tokenizer.next(), None);
    }

    // Text directly followed by a colon is a key; the text after it is data.
    #[test]
    fn test_read_line_key() {
        let input = "Key: Value\n";
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Key("Key".to_string()),
                starting_line_number: 0,
                starting_char_index: 0,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Value".to_string()),
                starting_line_number: 0,
                starting_char_index: 5,
            }))
        );
        assert_eq!(tokenizer.next(), None);
    }

    // Whitespace between the text and the colon still produces a key.
    #[test]
    fn test_read_line_key_with_whitespace() {
        let input = "Key : Value\n";
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Key("Key".to_string()),
                starting_line_number: 0,
                starting_char_index: 0,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Value".to_string()),
                starting_line_number: 0,
                starting_char_index: 6,
            }))
        );
        assert_eq!(tokenizer.next(), None);
    }

    // Braces are their own tokens and also terminate adjacent bare data.
    #[test]
    fn test_read_line_braces() {
        let input = "{Hello World}";
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::OpenBrace,
                starting_line_number: 0,
                starting_char_index: 0,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Hello".to_string()),
                starting_line_number: 0,
                starting_char_index: 1,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("World".to_string()),
                starting_line_number: 0,
                starting_char_index: 7,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::CloseBrace,
                starting_line_number: 0,
                starting_char_index: 12,
            }))
        );
        assert_eq!(tokenizer.next(), None);
    }

    // A multi-line snippet; the input starts with a newline, so the first
    // key is reported on line index 1.
    #[test]
    fn test_read_line() {
        let input = r#"
FBXHeaderExtension:  {
    FBXHeaderVersion: 1003
}"#;
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Key("FBXHeaderExtension".to_string()),
                starting_line_number: 1,
                starting_char_index: 0,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::OpenBrace,
                starting_line_number: 1,
                starting_char_index: 21,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Key("FBXHeaderVersion".to_string()),
                starting_line_number: 2,
                starting_char_index: 4,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("1003".to_string()),
                starting_line_number: 2,
                starting_char_index: 22,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::CloseBrace,
                starting_line_number: 3,
                starting_char_index: 0,
            }))
        );
        assert_eq!(tokenizer.next(), None);
    }

    // A quoted string is a single data token, spaces included, quotes removed.
    #[test]
    fn test_read_line_data_with_quotes() {
        let input = r#""Hello World""#;
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Hello World".to_string()),
                starting_line_number: 0,
                starting_char_index: 0,
            }))
        );
    }

    // Despite the name, this input has no quotes: it checks a tab-indented
    // key followed by bare data and an opening brace on the same line.
    #[test]
    fn test_read_line_data_with_quotes_and_whitespace() {
        let input = "\t\tVertices: *6324 {";
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        let mut token = tokenizer.next();
        assert_eq!(
            token,
            Some(Ok(TokenData {
                data: Token::Key("Vertices".to_string()),
                starting_line_number: 0,
                starting_char_index: 2,
            }))
        );
        token = tokenizer.next();
        assert_eq!(
            token,
            Some(Ok(TokenData {
                data: Token::Data("*6324".to_string()),
                starting_line_number: 0,
                starting_char_index: 12,
            }))
        );
        token = tokenizer.next();
        assert_eq!(
            token,
            Some(Ok(TokenData {
                data: Token::OpenBrace,
                starting_line_number: 0,
                starting_char_index: 18,
            }))
        );
    }

    // Two quoted strings on one line become two separate data tokens.
    #[test]
    fn test_read_line_data_with_multiple_quotes() {
        let input = r#""Hello World" "Goodbye World""#;
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Hello World".to_string()),
                starting_line_number: 0,
                starting_char_index: 0,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Goodbye World".to_string()),
                starting_line_number: 0,
                starting_char_index: 14,
            }))
        );
    }

    // A key followed by a quoted value; the value's position is its opening quote.
    #[test]
    fn test_read_line_data_key_value() {
        let input = r#"Key: "Value""#;
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Key("Key".to_string()),
                starting_line_number: 0,
                starting_char_index: 0,
            }))
        );
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Value".to_string()),
                starting_line_number: 0,
                starting_char_index: 5,
            }))
        );
        assert_eq!(tokenizer.next(), None);
    }

    // A quoted string with no closing quote is returned as data up to the
    // end of the input.
    #[test]
    fn test_read_line_unescape_quote() {
        let input = r#" "Hello World"#;
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Hello World".to_string()),
                starting_line_number: 0,
                starting_char_index: 1,
            }))
        );
    }

    // A quoted string may span several lines; the line breaks are kept in
    // the data and the position is that of the opening quote.
    #[test]
    fn test_read_multiline_data_with_quotes() {
        let input = r#""Hello World
Hello World
Hello World""#;
        let mut tokenizer = Tokenizer::new(BufReader::new(input.as_bytes()));
        assert_eq!(
            tokenizer.next(),
            Some(Ok(TokenData {
                data: Token::Data("Hello World\nHello World\nHello World".to_string()),
                starting_line_number: 0,
                starting_char_index: 0,
            }))
        );
    }
}