ass_core/tokenizer/
tokens.rs

//! Token definitions for ASS script tokenization
//!
//! Provides zero-copy token types for lexical analysis of ASS subtitle scripts.
//! All tokens maintain references to the original source text via lifetime parameters.
//!
//! # Token Design
//!
//! - Zero-copy via `&'a str` spans referencing source
//! - Location tracking for error reporting and editor integration
//! - Semantic token types for context-aware parsing
//! - Efficient discriminant matching for hot parsing paths
//!
//! # Example
//!
//! ```rust
//! use ass_core::tokenizer::{Token, TokenType};
//!
//! let source = "[Script Info]";
//! // Token would be created by tokenizer with span referencing source
//! let token = Token {
//!     token_type: TokenType::SectionHeader,
//!     span: &source[0..12], // "[Script Info"
//!     line: 1,
//!     column: 1,
//! };
//! ```
27
28use core::fmt;
29
30#[cfg(not(feature = "std"))]
31extern crate alloc;
32/// Token produced by ASS tokenizer with zero-copy span
33///
34/// Represents a lexical unit in ASS script with location information.
35/// The span references the original source text to avoid allocations.
36#[derive(Debug, Clone, PartialEq, Eq)]
37pub struct Token<'a> {
38    /// Token type discriminant
39    pub token_type: TokenType,
40
41    /// Zero-copy span referencing source text
42    pub span: &'a str,
43
44    /// Line number where token starts (1-based)
45    pub line: usize,
46
47    /// Column number where token starts (1-based)
48    pub column: usize,
49}
50
51impl<'a> Token<'a> {
52    /// Create new token with full location information
53    #[must_use]
54    pub const fn new(token_type: TokenType, span: &'a str, line: usize, column: usize) -> Self {
55        Self {
56            token_type,
57            span,
58            line,
59            column,
60        }
61    }
62
63    /// Get token length in characters
64    #[must_use]
65    pub fn len(&self) -> usize {
66        self.span.chars().count()
67    }
68
69    /// Check if token is empty (should not happen in normal tokenization)
70    #[must_use]
71    pub const fn is_empty(&self) -> bool {
72        self.span.is_empty()
73    }
74
75    /// Get end column position
76    #[must_use]
77    pub fn end_column(&self) -> usize {
78        self.column + self.len()
79    }
80
81    /// Check if this token represents whitespace
82    #[must_use]
83    pub const fn is_whitespace(&self) -> bool {
84        matches!(self.token_type, TokenType::Whitespace)
85    }
86
87    /// Check if this token represents a delimiter
88    #[must_use]
89    pub const fn is_delimiter(&self) -> bool {
90        matches!(
91            self.token_type,
92            TokenType::Comma
93                | TokenType::Colon
94                | TokenType::SectionOpen
95                | TokenType::SectionClose
96                | TokenType::OverrideOpen
97                | TokenType::OverrideClose
98        )
99    }
100
101    /// Check if this token represents content (text, numbers, etc.)
102    #[must_use]
103    pub const fn is_content(&self) -> bool {
104        matches!(
105            self.token_type,
106            TokenType::Text
107                | TokenType::Number
108                | TokenType::HexValue
109                | TokenType::SectionName
110                | TokenType::OverrideBlock
111        )
112    }
113
114    /// Validate that span references valid UTF-8
115    #[must_use]
116    pub const fn validate_utf8(&self) -> bool {
117        true
118    }
119}
120
121impl fmt::Display for Token<'_> {
122    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
123        write!(
124            f,
125            "{:?}@{}:{} '{}'",
126            self.token_type, self.line, self.column, self.span
127        )
128    }
129}
130
/// Semantic category of a lexical unit in an ASS script.
///
/// A plain, `Copy` discriminant intended for cheap pattern matching.
/// Variants are ordered roughly by parsing frequency for optimization.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// Plain text content
    Text,

    /// Numeric value, integer or float
    Number,

    /// Hexadecimal value (colors, etc.)
    HexValue,

    /// Field separator `:`
    Colon,

    /// Value separator `,`
    Comma,

    /// Line ending (`\n`, `\r\n`)
    Newline,

    /// Opening bracket `[` of a section header
    SectionOpen,

    /// Closing bracket `]` of a section header
    SectionClose,

    /// Name found between the section brackets
    SectionName,

    /// Section header as one complete token
    SectionHeader,

    /// Opening brace `{` of a style override
    OverrideOpen,

    /// Closing brace `}` of a style override
    OverrideClose,

    /// Content of a style override block
    OverrideBlock,

    /// Comment line (`;` or `!:`)
    Comment,

    /// Whitespace (spaces, tabs)
    Whitespace,

    /// Drawing mode scale indicator (`\p`)
    DrawingScale,

    /// Line of UU-encoded data
    UuEncodedLine,

    /// Font filename declaration
    FontFilename,

    /// Graphic filename declaration
    GraphicFilename,

    /// Format declaration line
    FormatLine,

    /// Event type (Dialogue, Comment, etc.)
    EventType,

    /// Time value (`H:MM:SS.CC`)
    TimeValue,

    /// Boolean value (-1, 0, 1)
    BooleanValue,

    /// Percentage value (scale, alpha)
    PercentageValue,

    /// String literal (quoted text)
    StringLiteral,

    /// Invalid or unrecognized token
    Invalid,

    /// End of file marker
    Eof,
}

impl TokenType {
    /// Whether this type is a delimiter: colon, comma, or one of the
    /// section/override open and close markers.
    #[must_use]
    pub const fn is_delimiter(self) -> bool {
        matches!(
            self,
            Self::SectionOpen
                | Self::SectionClose
                | Self::OverrideOpen
                | Self::OverrideClose
                | Self::Colon
                | Self::Comma
        )
    }

    /// Whether this type is structural: section header, section brackets,
    /// format line, or newline.
    #[must_use]
    pub const fn is_structural(self) -> bool {
        matches!(
            self,
            Self::Newline
                | Self::FormatLine
                | Self::SectionHeader
                | Self::SectionOpen
                | Self::SectionClose
        )
    }

    /// Whether this type is data content: text, number, hex, time, boolean,
    /// percentage, or string literal.
    #[must_use]
    pub const fn is_content(self) -> bool {
        matches!(
            self,
            Self::StringLiteral
                | Self::PercentageValue
                | Self::BooleanValue
                | Self::TimeValue
                | Self::HexValue
                | Self::Number
                | Self::Text
        )
    }

    /// Whether a parser may skip this type: whitespace or comment.
    #[must_use]
    pub const fn is_skippable(self) -> bool {
        matches!(self, Self::Comment | Self::Whitespace)
    }

    /// Lower-case, human-readable label for use in error messages.
    ///
    /// The match is exhaustive on purpose, so adding a variant forces a
    /// label to be chosen here.
    #[must_use]
    pub const fn name(self) -> &'static str {
        match self {
            // Plain values
            Self::Text => "text",
            Self::Number => "number",
            Self::HexValue => "hex value",
            Self::TimeValue => "time value",
            Self::BooleanValue => "boolean value",
            Self::PercentageValue => "percentage value",
            Self::StringLiteral => "string literal",

            // Separators and line structure
            Self::Colon => "colon",
            Self::Comma => "comma",
            Self::Newline => "newline",
            Self::Whitespace => "whitespace",
            Self::Comment => "comment",

            // Section headers
            Self::SectionOpen => "section open",
            Self::SectionClose => "section close",
            Self::SectionName => "section name",
            Self::SectionHeader => "section header",

            // Style overrides
            Self::OverrideOpen => "override open",
            Self::OverrideClose => "override close",
            Self::OverrideBlock => "override block",
            Self::DrawingScale => "drawing scale",

            // Line-level declarations
            Self::UuEncodedLine => "UU-encoded line",
            Self::FontFilename => "font filename",
            Self::GraphicFilename => "graphic filename",
            Self::FormatLine => "format line",
            Self::EventType => "event type",

            // Sentinels
            Self::Invalid => "invalid token",
            Self::Eof => "end of file",
        }
    }
}

impl fmt::Display for TokenType {
    /// Writes the label returned by [`TokenType::name`].
    ///
    /// Width and alignment flags on the formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.name())
    }
}
308
/// Role a delimiter character plays, for context-aware tokenization.
///
/// The same character can mean different things in different sections;
/// this enum names the role so the tokenizer can ask which characters are
/// expected for it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DelimiterType {
    /// Separates key from value in key-value pairs
    FieldSeparator,

    /// Separates values in CSV format
    ValueSeparator,

    /// Marks a section boundary
    SectionBoundary,

    /// Marks a style override boundary
    OverrideBoundary,

    /// Introduces a comment
    CommentMarker,

    /// Terminates a line
    LineTerminator,

    /// Separates drawing commands
    DrawingSeparator,

    /// Separates time components
    TimeSeparator,

    /// Separates color components
    ColorSeparator,
}

impl DelimiterType {
    /// Characters expected for this delimiter role.
    #[must_use]
    pub const fn chars(self) -> &'static [char] {
        match self {
            // Single-character roles
            Self::FieldSeparator => &[':'],
            Self::ValueSeparator => &[','],
            Self::CommentMarker => &[';'],

            // Paired open/close markers
            Self::SectionBoundary => &['[', ']'],
            Self::OverrideBoundary => &['{', '}'],

            // Roles with two alternatives
            Self::LineTerminator => &['\n', '\r'],
            Self::DrawingSeparator => &[' ', '\t'],
            Self::TimeSeparator => &[':', '.'],
            Self::ColorSeparator => &['&', 'H'],
        }
    }

    /// Whether `ch` is one of the characters listed by [`DelimiterType::chars`].
    #[must_use]
    pub fn matches(self, ch: char) -> bool {
        let accepted = self.chars();
        accepted.contains(&ch)
    }
}
366
/// Cursor into the source text, used for streaming tokenization.
///
/// Tracks the byte offset together with the 1-based line and column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenPosition {
    /// Byte offset in source
    pub offset: usize,

    /// Line number (1-based)
    pub line: usize,

    /// Column number (1-based)
    pub column: usize,
}

impl TokenPosition {
    /// Build a position from explicit components; values are stored as given.
    #[must_use]
    pub const fn new(offset: usize, line: usize, column: usize) -> Self {
        Self {
            offset,
            line,
            column,
        }
    }

    /// Position at the very start of the input: offset 0, line 1, column 1.
    #[must_use]
    pub const fn start() -> Self {
        Self {
            offset: 0,
            line: 1,
            column: 1,
        }
    }

    /// Position after consuming `ch`.
    ///
    /// The offset grows by the UTF-8 byte length of `ch`. Only `'\n'` starts
    /// a new line (line + 1, column reset to 1); every other character,
    /// `'\r'` included, advances the column by one.
    #[must_use]
    pub const fn advance(self, ch: char) -> Self {
        let offset = self.offset + ch.len_utf8();
        if ch == '\n' {
            Self {
                offset,
                line: self.line + 1,
                column: 1,
            }
        } else {
            Self {
                offset,
                line: self.line,
                column: self.column + 1,
            }
        }
    }

    /// Position after consuming every character of `s`, in order.
    ///
    /// Equivalent to calling [`TokenPosition::advance`] once per `char`.
    #[must_use]
    pub fn advance_by_str(self, s: &str) -> Self {
        s.chars().fold(self, Self::advance)
    }
}

impl Default for TokenPosition {
    /// Same as [`TokenPosition::start`].
    fn default() -> Self {
        Self::start()
    }
}
425
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(not(feature = "std"))]
    use alloc::format;

    /// Flatten a position into `(offset, line, column)` for compact asserts.
    fn parts(pos: TokenPosition) -> (usize, usize, usize) {
        (pos.offset, pos.line, pos.column)
    }

    #[test]
    fn token_creation() {
        let token = Token::new(TokenType::Text, "test", 1, 5);

        assert_eq!(token.token_type, TokenType::Text);
        assert_eq!(token.span, "test");
        assert_eq!(token.line, 1);
        assert_eq!(token.column, 5);
        assert_eq!(token.len(), 4);
        assert_eq!(token.end_column(), 9);
    }

    #[test]
    fn token_empty_check() {
        assert!(Token::new(TokenType::Text, "", 1, 1).is_empty());
        assert!(!Token::new(TokenType::Text, "text", 1, 1).is_empty());
    }

    #[test]
    fn token_type_checks() {
        assert!(TokenType::Comma.is_delimiter());
        assert!(TokenType::SectionHeader.is_structural());
        assert!(TokenType::Text.is_content());
        for ty in [TokenType::Whitespace, TokenType::Comment] {
            assert!(ty.is_skippable(), "{ty:?}");
        }
    }

    #[test]
    fn token_classification() {
        let text = Token::new(TokenType::Text, "hello", 1, 1);
        assert!(text.is_content());
        assert!(!text.is_delimiter());
        assert!(!text.is_whitespace());

        let comma = Token::new(TokenType::Comma, ",", 1, 6);
        assert!(comma.is_delimiter());
        assert!(!comma.is_content());

        let space = Token::new(TokenType::Whitespace, " ", 1, 7);
        assert!(space.is_whitespace());
        assert!(!space.is_content());
    }

    #[test]
    fn delimiter_type_matching() {
        let cases = [
            (DelimiterType::FieldSeparator, ':', true),
            (DelimiterType::ValueSeparator, ',', true),
            (DelimiterType::SectionBoundary, '[', true),
            (DelimiterType::SectionBoundary, ']', true),
            (DelimiterType::LineTerminator, '\n', true),
            (DelimiterType::FieldSeparator, ',', false),
            (DelimiterType::ValueSeparator, ':', false),
        ];
        for (delimiter, ch, expected) in cases {
            assert_eq!(delimiter.matches(ch), expected, "{delimiter:?} vs {ch:?}");
        }
    }

    #[test]
    fn token_position_advance() {
        let start = TokenPosition::start();
        assert_eq!(parts(start), (0, 1, 1));

        let after_letter = start.advance('a');
        assert_eq!(parts(after_letter), (1, 1, 2));

        let after_newline = after_letter.advance('\n');
        assert_eq!(parts(after_newline), (2, 2, 1));
    }

    #[test]
    fn token_position_advance_string() {
        let end = TokenPosition::start().advance_by_str("hello\nworld");

        assert_eq!(end.line, 2);
        assert_eq!(end.column, 6); // "world" = 5 chars + 1
        assert_eq!(end.offset, 11); // "hello\nworld".len()
    }

    #[test]
    fn token_type_names() {
        let expected = [
            (TokenType::Text, "text"),
            (TokenType::Number, "number"),
            (TokenType::HexValue, "hex value"),
            (TokenType::Invalid, "invalid token"),
        ];
        for (ty, label) in expected {
            assert_eq!(ty.name(), label);
        }
    }

    #[test]
    fn token_display() {
        let token = Token::new(TokenType::Text, "hello", 2, 5);
        let rendered = format!("{token}");
        for fragment in ["Text", "2:5", "hello"] {
            assert!(rendered.contains(fragment), "missing {fragment:?}");
        }
    }

    #[test]
    fn token_utf8_validation() {
        for span in ["valid utf8", "🎵"] {
            assert!(Token::new(TokenType::Text, span, 1, 1).validate_utf8());
        }
    }

    #[test]
    fn delimiter_token_types_are_delimiters() {
        let delimiters = [
            TokenType::Colon,
            TokenType::Comma,
            TokenType::SectionOpen,
            TokenType::SectionClose,
            TokenType::OverrideOpen,
            TokenType::OverrideClose,
        ];
        for ty in delimiters {
            assert!(ty.is_delimiter(), "{ty:?}");
        }
    }

    #[test]
    fn non_delimiter_token_types_are_not_delimiters() {
        let others = [
            TokenType::Text,
            TokenType::Number,
            TokenType::HexValue,
            TokenType::SectionName,
            TokenType::SectionHeader,
            TokenType::OverrideBlock,
            TokenType::Comment,
            TokenType::Whitespace,
            TokenType::Newline,
            TokenType::DrawingScale,
            TokenType::UuEncodedLine,
            TokenType::FontFilename,
            TokenType::GraphicFilename,
            TokenType::FormatLine,
            TokenType::EventType,
            TokenType::TimeValue,
            TokenType::BooleanValue,
            TokenType::PercentageValue,
            TokenType::StringLiteral,
            TokenType::Invalid,
            TokenType::Eof,
        ];
        for ty in others {
            assert!(!ty.is_delimiter(), "{ty:?}");
        }
    }

    #[test]
    fn structural_token_types_are_structural() {
        let structural = [
            TokenType::SectionHeader,
            TokenType::SectionOpen,
            TokenType::SectionClose,
            TokenType::FormatLine,
            TokenType::Newline,
        ];
        for ty in structural {
            assert!(ty.is_structural(), "{ty:?}");
        }
    }

    #[test]
    fn non_structural_token_types_are_not_structural() {
        let others = [
            TokenType::Text,
            TokenType::Number,
            TokenType::HexValue,
            TokenType::Colon,
            TokenType::Comma,
            TokenType::SectionName,
            TokenType::OverrideOpen,
            TokenType::OverrideClose,
            TokenType::OverrideBlock,
            TokenType::Comment,
            TokenType::Whitespace,
            TokenType::DrawingScale,
            TokenType::UuEncodedLine,
            TokenType::FontFilename,
            TokenType::GraphicFilename,
            TokenType::EventType,
            TokenType::TimeValue,
            TokenType::BooleanValue,
            TokenType::PercentageValue,
            TokenType::StringLiteral,
            TokenType::Invalid,
            TokenType::Eof,
        ];
        for ty in others {
            assert!(!ty.is_structural(), "{ty:?}");
        }
    }

    #[test]
    fn content_token_types_are_content() {
        let content = [
            TokenType::Text,
            TokenType::Number,
            TokenType::HexValue,
            TokenType::TimeValue,
            TokenType::BooleanValue,
            TokenType::PercentageValue,
            TokenType::StringLiteral,
        ];
        for ty in content {
            assert!(ty.is_content(), "{ty:?}");
        }
    }

    #[test]
    fn non_content_token_types_are_not_content() {
        let others = [
            TokenType::Colon,
            TokenType::Comma,
            TokenType::Newline,
            TokenType::SectionOpen,
            TokenType::SectionClose,
            TokenType::SectionName,
            TokenType::SectionHeader,
            TokenType::OverrideOpen,
            TokenType::OverrideClose,
            TokenType::OverrideBlock,
            TokenType::Comment,
            TokenType::Whitespace,
            TokenType::DrawingScale,
            TokenType::UuEncodedLine,
            TokenType::FontFilename,
            TokenType::GraphicFilename,
            TokenType::FormatLine,
            TokenType::EventType,
            TokenType::Invalid,
            TokenType::Eof,
        ];
        for ty in others {
            assert!(!ty.is_content(), "{ty:?}");
        }
    }

    #[test]
    fn skippable_token_types_are_skippable() {
        for ty in [TokenType::Whitespace, TokenType::Comment] {
            assert!(ty.is_skippable(), "{ty:?}");
        }
    }

    #[test]
    fn basic_non_skippable_token_types() {
        let basics = [
            TokenType::Text,
            TokenType::Number,
            TokenType::HexValue,
            TokenType::Colon,
            TokenType::Comma,
            TokenType::Newline,
        ];
        for ty in basics {
            assert!(!ty.is_skippable(), "{ty:?}");
        }
    }

    #[test]
    fn section_non_skippable_token_types() {
        let section = [
            TokenType::SectionOpen,
            TokenType::SectionClose,
            TokenType::SectionName,
            TokenType::SectionHeader,
        ];
        for ty in section {
            assert!(!ty.is_skippable(), "{ty:?}");
        }
    }

    #[test]
    fn override_non_skippable_token_types() {
        let overrides = [
            TokenType::OverrideOpen,
            TokenType::OverrideClose,
            TokenType::OverrideBlock,
        ];
        for ty in overrides {
            assert!(!ty.is_skippable(), "{ty:?}");
        }
    }

    #[test]
    fn special_non_skippable_token_types() {
        let special = [
            TokenType::DrawingScale,
            TokenType::UuEncodedLine,
            TokenType::FontFilename,
            TokenType::GraphicFilename,
            TokenType::FormatLine,
            TokenType::EventType,
            TokenType::TimeValue,
            TokenType::BooleanValue,
            TokenType::PercentageValue,
            TokenType::StringLiteral,
            TokenType::Invalid,
            TokenType::Eof,
        ];
        for ty in special {
            assert!(!ty.is_skippable(), "{ty:?}");
        }
    }

    #[test]
    fn basic_token_type_names() {
        let expected = [
            (TokenType::Text, "text"),
            (TokenType::Number, "number"),
            (TokenType::HexValue, "hex value"),
            (TokenType::Colon, "colon"),
            (TokenType::Comma, "comma"),
            (TokenType::Newline, "newline"),
            (TokenType::Invalid, "invalid token"),
            (TokenType::Eof, "end of file"),
        ];
        for (ty, label) in expected {
            assert_eq!(ty.name(), label);
        }
    }

    #[test]
    fn section_token_type_names() {
        let expected = [
            (TokenType::SectionOpen, "section open"),
            (TokenType::SectionClose, "section close"),
            (TokenType::SectionName, "section name"),
            (TokenType::SectionHeader, "section header"),
        ];
        for (ty, label) in expected {
            assert_eq!(ty.name(), label);
        }
    }

    #[test]
    fn override_token_type_names() {
        let expected = [
            (TokenType::OverrideOpen, "override open"),
            (TokenType::OverrideClose, "override close"),
            (TokenType::OverrideBlock, "override block"),
        ];
        for (ty, label) in expected {
            assert_eq!(ty.name(), label);
        }
    }

    #[test]
    fn special_token_type_names() {
        let expected = [
            (TokenType::Comment, "comment"),
            (TokenType::Whitespace, "whitespace"),
            (TokenType::DrawingScale, "drawing scale"),
            (TokenType::UuEncodedLine, "UU-encoded line"),
            (TokenType::FontFilename, "font filename"),
            (TokenType::GraphicFilename, "graphic filename"),
            (TokenType::FormatLine, "format line"),
            (TokenType::EventType, "event type"),
            (TokenType::TimeValue, "time value"),
            (TokenType::BooleanValue, "boolean value"),
            (TokenType::PercentageValue, "percentage value"),
            (TokenType::StringLiteral, "string literal"),
        ];
        for (ty, label) in expected {
            assert_eq!(ty.name(), label);
        }
    }

    #[test]
    fn token_type_display() {
        let expected = [
            (TokenType::Text, "text"),
            (TokenType::Number, "number"),
            (TokenType::Invalid, "invalid token"),
            (TokenType::Eof, "end of file"),
        ];
        for (ty, rendered) in expected {
            assert_eq!(format!("{ty}"), rendered);
        }
    }

    #[test]
    fn all_delimiter_types_chars() {
        let expected: [(DelimiterType, &[char]); 9] = [
            (DelimiterType::FieldSeparator, &[':']),
            (DelimiterType::ValueSeparator, &[',']),
            (DelimiterType::SectionBoundary, &['[', ']']),
            (DelimiterType::OverrideBoundary, &['{', '}']),
            (DelimiterType::CommentMarker, &[';']),
            (DelimiterType::LineTerminator, &['\n', '\r']),
            (DelimiterType::DrawingSeparator, &[' ', '\t']),
            (DelimiterType::TimeSeparator, &[':', '.']),
            (DelimiterType::ColorSeparator, &['&', 'H']),
        ];
        for (delimiter, chars) in expected {
            assert_eq!(delimiter.chars(), chars, "{delimiter:?}");
        }
    }

    #[test]
    fn field_and_value_separator_matching() {
        let cases = [
            (DelimiterType::FieldSeparator, ':', true),
            (DelimiterType::FieldSeparator, ',', false),
            (DelimiterType::FieldSeparator, '[', false),
            (DelimiterType::ValueSeparator, ',', true),
            (DelimiterType::ValueSeparator, ':', false),
            (DelimiterType::ValueSeparator, '[', false),
        ];
        for (delimiter, ch, expected) in cases {
            assert_eq!(delimiter.matches(ch), expected, "{delimiter:?} vs {ch:?}");
        }
    }

    #[test]
    fn boundary_delimiter_matching() {
        let cases = [
            (DelimiterType::SectionBoundary, '[', true),
            (DelimiterType::SectionBoundary, ']', true),
            (DelimiterType::SectionBoundary, '{', false),
            (DelimiterType::OverrideBoundary, '{', true),
            (DelimiterType::OverrideBoundary, '}', true),
            (DelimiterType::OverrideBoundary, '[', false),
        ];
        for (delimiter, ch, expected) in cases {
            assert_eq!(delimiter.matches(ch), expected, "{delimiter:?} vs {ch:?}");
        }
    }

    #[test]
    fn line_and_comment_delimiter_matching() {
        let cases = [
            (DelimiterType::CommentMarker, ';', true),
            (DelimiterType::CommentMarker, '#', false),
            (DelimiterType::LineTerminator, '\n', true),
            (DelimiterType::LineTerminator, '\r', true),
            (DelimiterType::LineTerminator, '\t', false),
        ];
        for (delimiter, ch, expected) in cases {
            assert_eq!(delimiter.matches(ch), expected, "{delimiter:?} vs {ch:?}");
        }
    }

    #[test]
    fn special_delimiter_matching() {
        let cases = [
            (DelimiterType::DrawingSeparator, ' ', true),
            (DelimiterType::DrawingSeparator, '\t', true),
            (DelimiterType::DrawingSeparator, '\n', false),
            (DelimiterType::TimeSeparator, ':', true),
            (DelimiterType::TimeSeparator, '.', true),
            (DelimiterType::TimeSeparator, ',', false),
            (DelimiterType::ColorSeparator, '&', true),
            (DelimiterType::ColorSeparator, 'H', true),
            (DelimiterType::ColorSeparator, '#', false),
        ];
        for (delimiter, ch, expected) in cases {
            assert_eq!(delimiter.matches(ch), expected, "{delimiter:?} vs {ch:?}");
        }
    }

    #[test]
    fn token_position_edge_cases() {
        // Explicit constructor stores its arguments unchanged
        assert_eq!(parts(TokenPosition::new(100, 50, 25)), (100, 50, 25));

        // `Default` and `start` both describe the beginning of the input
        assert_eq!(parts(TokenPosition::default()), (0, 1, 1));
        assert_eq!(parts(TokenPosition::start()), (0, 1, 1));
    }

    #[test]
    fn token_position_unicode_advance() {
        // A 4-byte UTF-8 character moves the offset by 4 but the column by 1
        let after_note = TokenPosition::start().advance('🎵');
        assert_eq!(parts(after_note), (4, 1, 2));

        // ASCII after Unicode
        let after_ascii = after_note.advance('a');
        assert_eq!(parts(after_ascii), (5, 1, 3));

        // Newline after Unicode
        let after_newline = after_ascii.advance('\n');
        assert_eq!(parts(after_newline), (6, 2, 1));
    }

    #[test]
    fn token_position_advance_by_str_edge_cases() {
        // Empty string leaves the position untouched
        let unchanged = TokenPosition::start().advance_by_str("");
        assert_eq!(parts(unchanged), (0, 1, 1));

        // String with only newlines
        let after_newlines = unchanged.advance_by_str("\n\n\n");
        assert_eq!(parts(after_newlines), (3, 4, 1));

        // Mixed content with Unicode: previous 3 bytes + 19 bytes of input,
        // ending on column "test".len() + 1
        let after_mixed = after_newlines.advance_by_str("hello🎵world\ntest");
        assert_eq!(parts(after_mixed), (3 + 19, 5, 5));
    }

    #[test]
    fn token_unicode_length() {
        // (span, expected char count, expected end column when starting at 1)
        let cases = [
            ("🎵🎶🎤", 3, 4),
            ("hello🎵world", 11, 12),
            ("", 0, 1),
        ];
        for (span, chars, end_column) in cases {
            let token = Token::new(TokenType::Text, span, 1, 1);
            assert_eq!(token.len(), chars, "{span:?}");
            assert_eq!(token.end_column(), end_column, "{span:?}");
        }
    }

    #[test]
    fn token_comprehensive_classification() {
        // Instance methods agree with the TokenType methods for these types
        for (ty, span) in [(TokenType::Text, "text"), (TokenType::Comma, ",")] {
            let token = Token::new(ty, span, 1, 1);
            assert_eq!(token.is_content(), ty.is_content(), "{ty:?}");
            assert_eq!(token.is_delimiter(), ty.is_delimiter(), "{ty:?}");
        }

        let whitespace_token = Token::new(TokenType::Whitespace, " ", 1, 1);
        assert!(whitespace_token.is_whitespace());
    }

    #[test]
    fn token_equality_and_cloning() {
        let reference = Token::new(TokenType::Text, "test", 1, 5);
        let same = Token::new(TokenType::Text, "test", 1, 5);
        let other_type = Token::new(TokenType::Number, "test", 1, 5);
        let other_span = Token::new(TokenType::Text, "different", 1, 5);

        assert_eq!(reference, same);
        assert_ne!(reference, other_type);
        assert_ne!(reference, other_span);

        let cloned = reference.clone();
        assert_eq!(reference, cloned);
    }

    #[test]
    fn token_debug_formatting() {
        let token = Token::new(TokenType::SectionHeader, "[Script Info]", 2, 1);
        let debug_output = format!("{token:?}");
        for fragment in ["SectionHeader", "[Script Info]", "line: 2", "column: 1"] {
            assert!(debug_output.contains(fragment), "missing {fragment:?}");
        }
    }
}
ass_core/tokenizer/tokens.rs

ass_core/tokenizer/
tokens.rs