brack_tokenizer/tokenize.rs
1use anyhow::Result;
2use std::{fs::File, io::Read, path::Path};
3
4use crate::{dispatch::dispatch, tokenizer::Tokenizer, tokens::Token};
5
6pub fn tokenize<P: AsRef<Path>>(path: P) -> Result<Vec<Token>> {
7 let mut file = File::open(&path)?;
8 let mut text = String::new();
9 file.read_to_string(&mut text)?;
10 let t = Tokenizer {
11 line: Some(0),
12 column: Some(0),
13 token_start_line: Some(0),
14 token_start_column: Some(0),
15 untreated: Some(text),
16 ..Default::default()
17 };
18 Ok(dispatch(&t))
19}
20
#[cfg(test)]
mod tests {
    use super::tokenize;
    use crate::tokens::{Location, LocationData, Token};
    use anyhow::Result;

    /// Builds the `Location` of a token that starts and ends on the same line.
    ///
    /// `span!(line, start, end)` covers columns `start..end` of `line`.
    /// This is a macro rather than a function so the integer literals take
    /// whatever types the `LocationData` fields declare.
    macro_rules! span {
        ($line:expr, $start:expr, $end:expr) => {
            Location {
                start: LocationData {
                    line: $line,
                    character: $start,
                },
                end: LocationData {
                    line: $line,
                    character: $end,
                },
            }
        };
    }

    /// Tokenizes a fixture file given its path relative to the current
    /// working directory.
    ///
    /// The path is handed to `tokenize` as a `PathBuf`, so it is never forced
    /// through a lossy UTF-8 conversion.
    fn tokenize_fixture(relative_path: &str) -> Result<Vec<Token>> {
        let path = std::env::current_dir()?.join(relative_path);
        tokenize(path)
    }

    #[test]
    fn test_split_no_commands() -> Result<()> {
        let tokens = tokenize_fixture("text/split_no_commands.[]")?;
        assert_eq!(
            tokens,
            vec![
                Token::Text("Hello, World!".to_string(), span!(0, 0, 13)),
                Token::EOF(span!(0, 13, 13)),
            ]
        );
        Ok(())
    }

    #[test]
    fn test_split_commands_with_an_argument_includes_square_brackets() -> Result<()> {
        let tokens =
            tokenize_fixture("text/split_commands_with_an_argument_includes_square_brackets.[]")?;
        assert_eq!(
            tokens,
            vec![
                Token::Text("Hello, ".to_string(), span!(0, 0, 7)),
                Token::SquareBracketOpen(span!(0, 7, 8)),
                Token::Module("std".to_string(), span!(0, 8, 11)),
                Token::Dot(span!(0, 11, 12)),
                Token::Ident("*".to_string(), span!(0, 12, 13)),
                Token::Text("World!".to_string(), span!(0, 14, 20)),
                Token::SquareBracketClose(span!(0, 20, 21)),
                Token::EOF(span!(0, 21, 21)),
            ]
        );
        Ok(())
    }

    #[test]
    fn test_split_commands_with_an_argument_includes_curly_brackets() -> Result<()> {
        let tokens =
            tokenize_fixture("text/split_commands_with_an_argument_includes_curly_brackets.[]")?;
        assert_eq!(
            tokens,
            vec![
                Token::Text("Hello, ".to_string(), span!(0, 0, 7)),
                Token::CurlyBracketOpen(span!(0, 7, 8)),
                Token::Module("std".to_string(), span!(0, 8, 11)),
                Token::Dot(span!(0, 11, 12)),
                Token::Ident("*".to_string(), span!(0, 12, 13)),
                Token::Text("World!".to_string(), span!(0, 14, 20)),
                Token::CurlyBracketClose(span!(0, 20, 21)),
                Token::EOF(span!(0, 21, 21)),
            ]
        );
        Ok(())
    }

    #[test]
    fn test_split_commands_with_an_argument_includes_angle_brackets() -> Result<()> {
        let tokens =
            tokenize_fixture("text/split_commands_with_an_argument_includes_angle_brackets.[]")?;
        assert_eq!(
            tokens,
            vec![
                Token::Text("Hello, ".to_string(), span!(0, 0, 7)),
                Token::AngleBracketOpen(span!(0, 7, 8)),
                Token::Ident("*".to_string(), span!(0, 8, 9)),
                Token::Text("World!".to_string(), span!(0, 10, 16)),
                Token::AngleBracketClose(span!(0, 16, 17)),
                Token::EOF(span!(0, 17, 17)),
            ]
        );
        Ok(())
    }

    #[test]
    fn test_split_commands_with_two_arguments_includes_square_brackets() -> Result<()> {
        let tokens =
            tokenize_fixture("text/split_commands_with_two_arguments_includes_square_brackets.[]")?;
        assert_eq!(
            tokens,
            vec![
                Token::Text("Hello, ".to_string(), span!(0, 0, 7)),
                Token::SquareBracketOpen(span!(0, 7, 8)),
                Token::Module("std".to_string(), span!(0, 8, 11)),
                Token::Dot(span!(0, 11, 12)),
                Token::Ident("@".to_string(), span!(0, 12, 13)),
                Token::Text("World!".to_string(), span!(0, 14, 20)),
                Token::Comma(span!(0, 20, 21)),
                Token::Text("https://example.com/".to_string(), span!(0, 22, 42)),
                Token::SquareBracketClose(span!(0, 42, 43)),
                Token::EOF(span!(0, 43, 43)),
            ]
        );
        Ok(())
    }

    #[test]
    fn test_split_nesting_commands() -> Result<()> {
        let tokens = tokenize_fixture("text/split_nesting_commands.[]")?;
        assert_eq!(
            tokens,
            vec![
                Token::Text("Hello, ".to_string(), span!(0, 0, 7)),
                // Outer command.
                Token::SquareBracketOpen(span!(0, 7, 8)),
                Token::Module("std".to_string(), span!(0, 8, 11)),
                Token::Dot(span!(0, 11, 12)),
                Token::Ident("*".to_string(), span!(0, 12, 13)),
                // Inner command, nested as the outer command's argument.
                Token::SquareBracketOpen(span!(0, 14, 15)),
                Token::Module("std".to_string(), span!(0, 15, 18)),
                Token::Dot(span!(0, 18, 19)),
                Token::Ident("@".to_string(), span!(0, 19, 20)),
                Token::Text("World!".to_string(), span!(0, 21, 27)),
                Token::Comma(span!(0, 27, 28)),
                Token::Text("https://example.com/".to_string(), span!(0, 29, 49)),
                Token::SquareBracketClose(span!(0, 49, 50)),
                Token::SquareBracketClose(span!(0, 50, 51)),
                Token::EOF(span!(0, 51, 51)),
            ]
        );
        Ok(())
    }

    #[test]
    fn test_split_newlines() -> Result<()> {
        let tokens = tokenize_fixture("text/split_newlines.[]")?;

        assert_eq!(
            tokens,
            vec![
                // Line 0
                Token::Text("Hello,".to_string(), span!(0, 0, 6)),
                Token::NewLine(span!(0, 6, 7)),
                // Line 1
                Token::Text("World,".to_string(), span!(1, 0, 6)),
                Token::NewLine(span!(1, 6, 7)),
                // Line 2
                Token::CurlyBracketOpen(span!(2, 0, 1)),
                Token::Module("std".to_string(), span!(2, 1, 4)),
                Token::Dot(span!(2, 4, 5)),
                Token::Ident("**".to_string(), span!(2, 5, 7)),
                Token::Text("Contact".to_string(), span!(2, 8, 15)),
                Token::CurlyBracketClose(span!(2, 15, 16)),
                Token::NewLine(span!(2, 16, 17)),
                // Line 3
                Token::SquareBracketOpen(span!(3, 0, 1)),
                Token::Module("std".to_string(), span!(3, 1, 4)),
                Token::Dot(span!(3, 4, 5)),
                Token::Ident("@".to_string(), span!(3, 5, 6)),
                Token::Text("My website".to_string(), span!(3, 7, 17)),
                Token::Comma(span!(3, 17, 18)),
                Token::Text("https://example.com/".to_string(), span!(3, 19, 39)),
                Token::SquareBracketClose(span!(3, 39, 40)),
                Token::NewLine(span!(3, 40, 41)),
                // Line 4 (only a newline)
                Token::NewLine(span!(4, 0, 1)),
                // Line 5
                Token::Text("2023.12.28".to_string(), span!(5, 0, 10)),
                Token::NewLine(span!(5, 10, 11)),
                // Line 6: EOF sits at the start of the line after the final newline.
                Token::EOF(span!(6, 0, 0)),
            ]
        );
        Ok(())
    }

    #[test]
    fn test_split_japanese_and_emoji() -> Result<()> {
        let tokens = tokenize_fixture("text/split_japanese_and_emoji.[]")?;
        assert_eq!(
            tokens,
            vec![
                // The expected end column of 7 is five kana + '!' + the flag,
                // i.e. the flag emoji (two code points) is counted once.
                Token::Text("こんにちは!🇯🇵".to_string(), span!(0, 0, 7)),
                Token::EOF(span!(0, 7, 7)),
            ]
        );
        Ok(())
    }
}