brack_tokenizer/
tokenize.rs

1use anyhow::Result;
2use std::{fs::File, io::Read, path::Path};
3
4use crate::{dispatch::dispatch, tokenizer::Tokenizer, tokens::Token};
5
6pub fn tokenize<P: AsRef<Path>>(path: P) -> Result<Vec<Token>> {
7    let mut file = File::open(&path)?;
8    let mut text = String::new();
9    file.read_to_string(&mut text)?;
10    let t = Tokenizer {
11        line: Some(0),
12        column: Some(0),
13        token_start_line: Some(0),
14        token_start_column: Some(0),
15        untreated: Some(text),
16        ..Default::default()
17    };
18    Ok(dispatch(&t))
19}
20
21#[cfg(test)]
22mod tests {
23    use super::tokenize;
24    use crate::tokens::{Location, LocationData, Token};
25    use anyhow::Result;
26
27    #[test]
28    fn test_split_no_commands() -> Result<()> {
29        let pwd = std::env::current_dir()?;
30        let uri = pwd
31            .join("text/split_no_commands.[]")
32            .to_string_lossy()
33            .to_string();
34        let tokens = tokenize(uri.clone())?;
35        assert_eq!(
36            tokens,
37            vec![
38                Token::Text(
39                    "Hello, World!".to_string(),
40                    Location {
41                        start: LocationData {
42                            line: 0,
43                            character: 0,
44                        },
45                        end: LocationData {
46                            line: 0,
47                            character: 13,
48                        }
49                    },
50                ),
51                Token::EOF(Location {
52                    start: LocationData {
53                        line: 0,
54                        character: 13,
55                    },
56                    end: LocationData {
57                        line: 0,
58                        character: 13,
59                    }
60                }),
61            ]
62        );
63        Ok(())
64    }
65
66    #[test]
67    fn test_split_commands_with_an_argument_includes_square_brackets() -> Result<()> {
68        let pwd = std::env::current_dir()?;
69        let uri = pwd
70            .join("text/split_commands_with_an_argument_includes_square_brackets.[]")
71            .to_string_lossy()
72            .to_string();
73        let tokens = tokenize(uri.clone())?;
74        assert_eq!(
75            tokens,
76            vec![
77                Token::Text(
78                    "Hello, ".to_string(),
79                    Location {
80                        start: LocationData {
81                            line: 0,
82                            character: 0,
83                        },
84                        end: LocationData {
85                            line: 0,
86                            character: 7,
87                        },
88                    },
89                ),
90                Token::SquareBracketOpen(Location {
91                    start: LocationData {
92                        line: 0,
93                        character: 7,
94                    },
95                    end: LocationData {
96                        line: 0,
97                        character: 8,
98                    },
99                }),
100                Token::Module(
101                    "std".to_string(),
102                    Location {
103                        start: LocationData {
104                            line: 0,
105                            character: 8,
106                        },
107                        end: LocationData {
108                            line: 0,
109                            character: 11,
110                        },
111                    }
112                ),
113                Token::Dot(Location {
114                    start: LocationData {
115                        line: 0,
116                        character: 11,
117                    },
118                    end: LocationData {
119                        line: 0,
120                        character: 12,
121                    },
122                }),
123                Token::Ident(
124                    "*".to_string(),
125                    Location {
126                        start: LocationData {
127                            line: 0,
128                            character: 12,
129                        },
130                        end: LocationData {
131                            line: 0,
132                            character: 13,
133                        },
134                    }
135                ),
136                Token::Text(
137                    "World!".to_string(),
138                    Location {
139                        start: LocationData {
140                            line: 0,
141                            character: 14,
142                        },
143                        end: LocationData {
144                            line: 0,
145                            character: 20,
146                        },
147                    }
148                ),
149                Token::SquareBracketClose(Location {
150                    start: LocationData {
151                        line: 0,
152                        character: 20,
153                    },
154                    end: LocationData {
155                        line: 0,
156                        character: 21,
157                    },
158                }),
159                Token::EOF(Location {
160                    start: LocationData {
161                        line: 0,
162                        character: 21,
163                    },
164                    end: LocationData {
165                        line: 0,
166                        character: 21,
167                    },
168                }),
169            ]
170        );
171        Ok(())
172    }
173
174    #[test]
175    fn test_split_commands_with_an_argument_includes_curly_brackets() -> Result<()> {
176        let pwd = std::env::current_dir()?;
177        let uri = pwd
178            .join("text/split_commands_with_an_argument_includes_curly_brackets.[]")
179            .to_string_lossy()
180            .to_string();
181        let tokens = tokenize(uri.clone())?;
182        assert_eq!(
183            tokens,
184            vec![
185                Token::Text(
186                    "Hello, ".to_string(),
187                    Location {
188                        start: LocationData {
189                            line: 0,
190                            character: 0,
191                        },
192                        end: LocationData {
193                            line: 0,
194                            character: 7,
195                        },
196                    }
197                ),
198                Token::CurlyBracketOpen(Location {
199                    start: LocationData {
200                        line: 0,
201                        character: 7,
202                    },
203                    end: LocationData {
204                        line: 0,
205                        character: 8,
206                    },
207                }),
208                Token::Module(
209                    "std".to_string(),
210                    Location {
211                        start: LocationData {
212                            line: 0,
213                            character: 8,
214                        },
215                        end: LocationData {
216                            line: 0,
217                            character: 11,
218                        },
219                    }
220                ),
221                Token::Dot(Location {
222                    start: LocationData {
223                        line: 0,
224                        character: 11,
225                    },
226                    end: LocationData {
227                        line: 0,
228                        character: 12,
229                    },
230                }),
231                Token::Ident(
232                    "*".to_string(),
233                    Location {
234                        start: LocationData {
235                            line: 0,
236                            character: 12,
237                        },
238                        end: LocationData {
239                            line: 0,
240                            character: 13,
241                        },
242                    }
243                ),
244                Token::Text(
245                    "World!".to_string(),
246                    Location {
247                        start: LocationData {
248                            line: 0,
249                            character: 14,
250                        },
251                        end: LocationData {
252                            line: 0,
253                            character: 20,
254                        },
255                    }
256                ),
257                Token::CurlyBracketClose(Location {
258                    start: LocationData {
259                        line: 0,
260                        character: 20,
261                    },
262                    end: LocationData {
263                        line: 0,
264                        character: 21,
265                    },
266                }),
267                Token::EOF(Location {
268                    start: LocationData {
269                        line: 0,
270                        character: 21,
271                    },
272                    end: LocationData {
273                        line: 0,
274                        character: 21,
275                    },
276                }),
277            ]
278        );
279        Ok(())
280    }
281
282    #[test]
283    fn test_split_commands_with_an_argument_includes_angle_brackets() -> Result<()> {
284        let pwd = std::env::current_dir()?;
285        let uri = pwd
286            .join("text/split_commands_with_an_argument_includes_angle_brackets.[]")
287            .to_string_lossy()
288            .to_string();
289        let tokens = tokenize(uri.clone())?;
290        assert_eq!(
291            tokens,
292            vec![
293                Token::Text(
294                    "Hello, ".to_string(),
295                    Location {
296                        start: LocationData {
297                            line: 0,
298                            character: 0,
299                        },
300                        end: LocationData {
301                            line: 0,
302                            character: 7,
303                        },
304                    }
305                ),
306                Token::AngleBracketOpen(Location {
307                    start: LocationData {
308                        line: 0,
309                        character: 7,
310                    },
311                    end: LocationData {
312                        line: 0,
313                        character: 8,
314                    },
315                }),
316                Token::Ident(
317                    "*".to_string(),
318                    Location {
319                        start: LocationData {
320                            line: 0,
321                            character: 8,
322                        },
323                        end: LocationData {
324                            line: 0,
325                            character: 9,
326                        },
327                    }
328                ),
329                Token::Text(
330                    "World!".to_string(),
331                    Location {
332                        start: LocationData {
333                            line: 0,
334                            character: 10,
335                        },
336                        end: LocationData {
337                            line: 0,
338                            character: 16,
339                        },
340                    }
341                ),
342                Token::AngleBracketClose(Location {
343                    start: LocationData {
344                        line: 0,
345                        character: 16,
346                    },
347                    end: LocationData {
348                        line: 0,
349                        character: 17,
350                    },
351                }),
352                Token::EOF(Location {
353                    start: LocationData {
354                        line: 0,
355                        character: 17,
356                    },
357                    end: LocationData {
358                        line: 0,
359                        character: 17,
360                    },
361                }),
362            ]
363        );
364        Ok(())
365    }
366
367    #[test]
368    fn test_split_commands_with_two_arguments_includes_square_brackets() -> Result<()> {
369        let pwd = std::env::current_dir()?;
370        let uri = pwd
371            .join("text/split_commands_with_two_arguments_includes_square_brackets.[]")
372            .to_string_lossy()
373            .to_string();
374        let tokens = tokenize(uri.clone())?;
375        assert_eq!(
376            tokens,
377            vec![
378                Token::Text(
379                    "Hello, ".to_string(),
380                    Location {
381                        start: LocationData {
382                            line: 0,
383                            character: 0,
384                        },
385                        end: LocationData {
386                            line: 0,
387                            character: 7,
388                        },
389                    }
390                ),
391                Token::SquareBracketOpen(Location {
392                    start: LocationData {
393                        line: 0,
394                        character: 7,
395                    },
396                    end: LocationData {
397                        line: 0,
398                        character: 8,
399                    },
400                }),
401                Token::Module(
402                    "std".to_string(),
403                    Location {
404                        start: LocationData {
405                            line: 0,
406                            character: 8,
407                        },
408                        end: LocationData {
409                            line: 0,
410                            character: 11,
411                        },
412                    }
413                ),
414                Token::Dot(Location {
415                    start: LocationData {
416                        line: 0,
417                        character: 11,
418                    },
419                    end: LocationData {
420                        line: 0,
421                        character: 12,
422                    },
423                }),
424                Token::Ident(
425                    "@".to_string(),
426                    Location {
427                        start: LocationData {
428                            line: 0,
429                            character: 12,
430                        },
431                        end: LocationData {
432                            line: 0,
433                            character: 13,
434                        },
435                    }
436                ),
437                Token::Text(
438                    "World!".to_string(),
439                    Location {
440                        start: LocationData {
441                            line: 0,
442                            character: 14,
443                        },
444                        end: LocationData {
445                            line: 0,
446                            character: 20,
447                        },
448                    }
449                ),
450                Token::Comma(Location {
451                    start: LocationData {
452                        line: 0,
453                        character: 20,
454                    },
455                    end: LocationData {
456                        line: 0,
457                        character: 21,
458                    },
459                }),
460                Token::Text(
461                    "https://example.com/".to_string(),
462                    Location {
463                        start: LocationData {
464                            line: 0,
465                            character: 22,
466                        },
467                        end: LocationData {
468                            line: 0,
469                            character: 42,
470                        },
471                    }
472                ),
473                Token::SquareBracketClose(Location {
474                    start: LocationData {
475                        line: 0,
476                        character: 42,
477                    },
478                    end: LocationData {
479                        line: 0,
480                        character: 43,
481                    },
482                }),
483                Token::EOF(Location {
484                    start: LocationData {
485                        line: 0,
486                        character: 43,
487                    },
488                    end: LocationData {
489                        line: 0,
490                        character: 43,
491                    },
492                }),
493            ]
494        );
495        Ok(())
496    }
497
498    #[test]
499    fn test_split_nesting_commands() -> Result<()> {
500        let pwd = std::env::current_dir()?;
501        let uri = pwd
502            .join("text/split_nesting_commands.[]")
503            .to_string_lossy()
504            .to_string();
505        let tokens = tokenize(uri.clone())?;
506        assert_eq!(
507            tokens,
508            vec![
509                Token::Text(
510                    "Hello, ".to_string(),
511                    Location {
512                        start: LocationData {
513                            line: 0,
514                            character: 0,
515                        },
516                        end: LocationData {
517                            line: 0,
518                            character: 7,
519                        },
520                    }
521                ),
522                Token::SquareBracketOpen(Location {
523                    start: LocationData {
524                        line: 0,
525                        character: 7,
526                    },
527                    end: LocationData {
528                        line: 0,
529                        character: 8,
530                    },
531                }),
532                Token::Module(
533                    "std".to_string(),
534                    Location {
535                        start: LocationData {
536                            line: 0,
537                            character: 8,
538                        },
539                        end: LocationData {
540                            line: 0,
541                            character: 11,
542                        },
543                    }
544                ),
545                Token::Dot(Location {
546                    start: LocationData {
547                        line: 0,
548                        character: 11,
549                    },
550                    end: LocationData {
551                        line: 0,
552                        character: 12,
553                    },
554                }),
555                Token::Ident(
556                    "*".to_string(),
557                    Location {
558                        start: LocationData {
559                            line: 0,
560                            character: 12,
561                        },
562                        end: LocationData {
563                            line: 0,
564                            character: 13,
565                        },
566                    }
567                ),
568                Token::SquareBracketOpen(Location {
569                    start: LocationData {
570                        line: 0,
571                        character: 14,
572                    },
573                    end: LocationData {
574                        line: 0,
575                        character: 15,
576                    },
577                }),
578                Token::Module(
579                    "std".to_string(),
580                    Location {
581                        start: LocationData {
582                            line: 0,
583                            character: 15,
584                        },
585                        end: LocationData {
586                            line: 0,
587                            character: 18,
588                        },
589                    }
590                ),
591                Token::Dot(Location {
592                    start: LocationData {
593                        line: 0,
594                        character: 18,
595                    },
596                    end: LocationData {
597                        line: 0,
598                        character: 19,
599                    },
600                }),
601                Token::Ident(
602                    "@".to_string(),
603                    Location {
604                        start: LocationData {
605                            line: 0,
606                            character: 19,
607                        },
608                        end: LocationData {
609                            line: 0,
610                            character: 20,
611                        },
612                    }
613                ),
614                Token::Text(
615                    "World!".to_string(),
616                    Location {
617                        start: LocationData {
618                            line: 0,
619                            character: 21,
620                        },
621                        end: LocationData {
622                            line: 0,
623                            character: 27,
624                        },
625                    }
626                ),
627                Token::Comma(Location {
628                    start: LocationData {
629                        line: 0,
630                        character: 27,
631                    },
632                    end: LocationData {
633                        line: 0,
634                        character: 28,
635                    },
636                }),
637                Token::Text(
638                    "https://example.com/".to_string(),
639                    Location {
640                        start: LocationData {
641                            line: 0,
642                            character: 29,
643                        },
644                        end: LocationData {
645                            line: 0,
646                            character: 49,
647                        },
648                    }
649                ),
650                Token::SquareBracketClose(Location {
651                    start: LocationData {
652                        line: 0,
653                        character: 49,
654                    },
655                    end: LocationData {
656                        line: 0,
657                        character: 50,
658                    },
659                }),
660                Token::SquareBracketClose(Location {
661                    start: LocationData {
662                        line: 0,
663                        character: 50,
664                    },
665                    end: LocationData {
666                        line: 0,
667                        character: 51,
668                    },
669                }),
670                Token::EOF(Location {
671                    start: LocationData {
672                        line: 0,
673                        character: 51,
674                    },
675                    end: LocationData {
676                        line: 0,
677                        character: 51,
678                    },
679                }),
680            ]
681        );
682        Ok(())
683    }
684
685    #[test]
686    fn test_split_newlines() -> Result<()> {
687        let pwd = std::env::current_dir()?;
688        let uri = pwd
689            .join("text/split_newlines.[]")
690            .to_string_lossy()
691            .to_string();
692        let tokens = tokenize(uri.clone())?;
693
694        assert_eq!(
695            tokens,
696            vec![
697                Token::Text(
698                    "Hello,".to_string(),
699                    Location {
700                        start: LocationData {
701                            line: 0,
702                            character: 0,
703                        },
704                        end: LocationData {
705                            line: 0,
706                            character: 6,
707                        },
708                    }
709                ),
710                Token::NewLine(Location {
711                    start: LocationData {
712                        line: 0,
713                        character: 6,
714                    },
715                    end: LocationData {
716                        line: 0,
717                        character: 7,
718                    },
719                }),
720                Token::Text(
721                    "World,".to_string(),
722                    Location {
723                        start: LocationData {
724                            line: 1,
725                            character: 0,
726                        },
727                        end: LocationData {
728                            line: 1,
729                            character: 6,
730                        },
731                    }
732                ),
733                Token::NewLine(Location {
734                    start: LocationData {
735                        line: 1,
736                        character: 6,
737                    },
738                    end: LocationData {
739                        line: 1,
740                        character: 7,
741                    },
742                }),
743                Token::CurlyBracketOpen(Location {
744                    start: LocationData {
745                        line: 2,
746                        character: 0,
747                    },
748                    end: LocationData {
749                        line: 2,
750                        character: 1,
751                    },
752                }),
753                Token::Module(
754                    "std".to_string(),
755                    Location {
756                        start: LocationData {
757                            line: 2,
758                            character: 1,
759                        },
760                        end: LocationData {
761                            line: 2,
762                            character: 4,
763                        },
764                    }
765                ),
766                Token::Dot(Location {
767                    start: LocationData {
768                        line: 2,
769                        character: 4,
770                    },
771                    end: LocationData {
772                        line: 2,
773                        character: 5,
774                    },
775                }),
776                Token::Ident(
777                    "**".to_string(),
778                    Location {
779                        start: LocationData {
780                            line: 2,
781                            character: 5,
782                        },
783                        end: LocationData {
784                            line: 2,
785                            character: 7,
786                        },
787                    }
788                ),
789                Token::Text(
790                    "Contact".to_string(),
791                    Location {
792                        start: LocationData {
793                            line: 2,
794                            character: 8,
795                        },
796                        end: LocationData {
797                            line: 2,
798                            character: 15,
799                        },
800                    }
801                ),
802                Token::CurlyBracketClose(Location {
803                    start: LocationData {
804                        line: 2,
805                        character: 15,
806                    },
807                    end: LocationData {
808                        line: 2,
809                        character: 16,
810                    },
811                }),
812                Token::NewLine(Location {
813                    start: LocationData {
814                        line: 2,
815                        character: 16,
816                    },
817                    end: LocationData {
818                        line: 2,
819                        character: 17,
820                    },
821                }),
822                Token::SquareBracketOpen(Location {
823                    start: LocationData {
824                        line: 3,
825                        character: 0,
826                    },
827                    end: LocationData {
828                        line: 3,
829                        character: 1,
830                    },
831                }),
832                Token::Module(
833                    "std".to_string(),
834                    Location {
835                        start: LocationData {
836                            line: 3,
837                            character: 1,
838                        },
839                        end: LocationData {
840                            line: 3,
841                            character: 4,
842                        },
843                    }
844                ),
845                Token::Dot(Location {
846                    start: LocationData {
847                        line: 3,
848                        character: 4,
849                    },
850                    end: LocationData {
851                        line: 3,
852                        character: 5,
853                    },
854                }),
855                Token::Ident(
856                    "@".to_string(),
857                    Location {
858                        start: LocationData {
859                            line: 3,
860                            character: 5,
861                        },
862                        end: LocationData {
863                            line: 3,
864                            character: 6,
865                        },
866                    }
867                ),
868                Token::Text(
869                    "My website".to_string(),
870                    Location {
871                        start: LocationData {
872                            line: 3,
873                            character: 7,
874                        },
875                        end: LocationData {
876                            line: 3,
877                            character: 17,
878                        },
879                    }
880                ),
881                Token::Comma(Location {
882                    start: LocationData {
883                        line: 3,
884                        character: 17,
885                    },
886                    end: LocationData {
887                        line: 3,
888                        character: 18,
889                    },
890                }),
891                Token::Text(
892                    "https://example.com/".to_string(),
893                    Location {
894                        start: LocationData {
895                            line: 3,
896                            character: 19,
897                        },
898                        end: LocationData {
899                            line: 3,
900                            character: 39,
901                        },
902                    }
903                ),
904                Token::SquareBracketClose(Location {
905                    start: LocationData {
906                        line: 3,
907                        character: 39,
908                    },
909                    end: LocationData {
910                        line: 3,
911                        character: 40,
912                    },
913                }),
914                Token::NewLine(Location {
915                    start: LocationData {
916                        line: 3,
917                        character: 40,
918                    },
919                    end: LocationData {
920                        line: 3,
921                        character: 41,
922                    },
923                }),
924                Token::NewLine(Location {
925                    start: LocationData {
926                        line: 4,
927                        character: 0,
928                    },
929                    end: LocationData {
930                        line: 4,
931                        character: 1,
932                    },
933                }),
934                Token::Text(
935                    "2023.12.28".to_string(),
936                    Location {
937                        start: LocationData {
938                            line: 5,
939                            character: 0,
940                        },
941                        end: LocationData {
942                            line: 5,
943                            character: 10,
944                        },
945                    }
946                ),
947                Token::NewLine(Location {
948                    start: LocationData {
949                        line: 5,
950                        character: 10,
951                    },
952                    end: LocationData {
953                        line: 5,
954                        character: 11,
955                    },
956                }),
957                Token::EOF(Location {
958                    start: LocationData {
959                        line: 6,
960                        character: 0,
961                    },
962                    end: LocationData {
963                        line: 6,
964                        character: 0,
965                    },
966                }),
967            ]
968        );
969        Ok(())
970    }
971
972    #[test]
973    fn test_split_japanese_and_emoji() -> Result<()> {
974        let pwd = std::env::current_dir()?;
975        let uri = pwd
976            .join("text/split_japanese_and_emoji.[]")
977            .to_string_lossy()
978            .to_string();
979        let tokens = tokenize(uri.clone())?;
980        assert_eq!(
981            tokens,
982            vec![
983                Token::Text(
984                    "こんにちは!🇯🇵".to_string(),
985                    Location {
986                        start: LocationData {
987                            line: 0,
988                            character: 0,
989                        },
990                        end: LocationData {
991                            line: 0,
992                            character: 7,
993                        },
994                    }
995                ),
996                Token::EOF(Location {
997                    start: LocationData {
998                        line: 0,
999                        character: 7,
1000                    },
1001                    end: LocationData {
1002                        line: 0,
1003                        character: 7,
1004                    },
1005                }),
1006            ]
1007        );
1008        Ok(())
1009    }
1010}