pub struct TokenizerConfig {
pub keywords: HashMap<String, TokenType>,
pub single_tokens: HashMap<char, TokenType>,
pub quotes: HashMap<String, String>,
pub identifiers: HashMap<char, char>,
pub comments: HashMap<String, Option<String>>,
pub string_escapes: Vec<char>,
pub nested_comments: bool,
pub escape_follow_chars: Vec<char>,
pub b_prefix_is_byte_string: bool,
pub numeric_literals: HashMap<String, String>,
pub identifiers_can_start_with_digit: bool,
pub hex_number_strings: bool,
pub hex_string_is_integer_type: bool,
pub string_escapes_allowed_in_raw_strings: bool,
}
Tokenizer configuration for a dialect
Fields
keywords: HashMap<String, TokenType>
    Keywords mapping (uppercase keyword -> token type).

single_tokens: HashMap<char, TokenType>
    Single-character tokens.

quotes: HashMap<String, String>
    String quote delimiters (start -> end).

identifiers: HashMap<char, char>
    Identifier quote characters (start -> end).

comments: HashMap<String, Option<String>>
    Comment delimiters (start -> optional end; None indicates a line comment).

string_escapes: Vec<char>
    String escape characters.

nested_comments: bool
    Whether nested block comments are supported.
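When nested comments are enabled, a closing `*/` only ends the comment once every inner `/* … */` has been closed. A minimal depth-counting sketch of that behavior (illustrative names, not the crate's actual scanner):

```rust
/// Illustrative depth-counting scan for nested block comments.
/// `s` starts just after the opening "/*"; returns the index just past
/// the matching "*/", or None if the comment is unterminated.
fn comment_end(s: &str) -> Option<usize> {
    let b = s.as_bytes();
    let mut depth = 1;
    let mut i = 0;
    while i < b.len() {
        if i + 1 < b.len() && b[i] == b'/' && b[i + 1] == b'*' {
            depth += 1; // nested open
            i += 2;
        } else if i + 1 < b.len() && b[i] == b'*' && b[i + 1] == b'/' {
            depth -= 1; // close
            i += 2;
            if depth == 0 {
                return Some(i);
            }
        } else {
            i += 1;
        }
    }
    None // unterminated comment
}

fn main() {
    // With nesting, the inner /* b */ does not end the outer comment.
    assert_eq!(comment_end("a /* b */ c */"), Some(14));
    assert_eq!(comment_end("plain */"), Some(8));
    assert_eq!(comment_end("never closed"), None);
}
```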
escape_follow_chars: Vec<char>
    Valid escape follow characters (for MySQL-style escaping). When a backslash is followed by a character NOT in this list, the backslash is discarded. When the list is empty, the backslash is preserved for all unrecognized escape sequences.
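The two rules above can be sketched as a single decision on the character following the backslash (the follow set below is a hypothetical MySQL-like example, not the crate's default):

```rust
/// Illustrative handling of a backslash escape, given the configured
/// follow characters. Returns the text that replaces "\<next>".
fn apply_escape(follow_chars: &[char], next: char) -> String {
    if follow_chars.is_empty() || follow_chars.contains(&next) {
        format!("\\{}", next) // recognized (or no list configured): keep the backslash
    } else {
        next.to_string() // MySQL-style: unrecognized escape drops the backslash
    }
}

fn main() {
    // Hypothetical MySQL-like follow set.
    let follow = ['n', 't', '\\', '\''];
    assert_eq!(apply_escape(&follow, 'n'), "\\n"); // in the list: kept
    assert_eq!(apply_escape(&follow, 'q'), "q");   // not in the list: discarded
    assert_eq!(apply_escape(&[], 'q'), "\\q");     // empty list: preserved
}
```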
b_prefix_is_byte_string: bool
    Whether b'…' is a byte string (true, as in BigQuery) or a bit string (false, as in standard SQL). Defaults to false (bit string).
numeric_literals: HashMap<String, String>
    Numeric literal suffixes (uppercase suffix -> type name), e.g. {"L": "BIGINT", "S": "SMALLINT"}. Used by Hive/Spark to parse 1L as CAST(1 AS BIGINT).
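The suffix lookup can be sketched as follows, using the example table above ({"L": "BIGINT", "S": "SMALLINT"}); `rewrite_numeric` is a hypothetical helper, not the crate's API:

```rust
use std::collections::HashMap;

/// Illustrative rewrite of a suffixed numeric literal into a CAST,
/// using the example suffix table above (hypothetical helper).
fn rewrite_numeric(literal: &str) -> Option<String> {
    let numeric_literals: HashMap<&str, &str> =
        [("L", "BIGINT"), ("S", "SMALLINT")].into_iter().collect();
    // Split off the final character as a candidate suffix.
    let (digits, suffix) = literal.split_at(literal.len() - 1);
    if !digits.chars().all(|c| c.is_ascii_digit()) {
        return None;
    }
    numeric_literals
        .get(suffix.to_uppercase().as_str())
        .map(|ty| format!("CAST({} AS {})", digits, ty))
}

fn main() {
    assert_eq!(rewrite_numeric("1L"), Some("CAST(1 AS BIGINT)".to_string()));
    assert_eq!(rewrite_numeric("2S"), Some("CAST(2 AS SMALLINT)".to_string()));
    assert_eq!(rewrite_numeric("10"), None); // "0" is not a registered suffix
}
```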
identifiers_can_start_with_digit: bool
    Whether unquoted identifiers can start with a digit (e.g. 1a, 1_a). When true, a number followed by letters/underscores is treated as an identifier. Used by Hive, Spark, MySQL, and ClickHouse.
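The effect of the flag on a digit-led word can be sketched like this (the token names below are illustrative):

```rust
/// Illustrative classification of a digit-led word, per the
/// identifiers_can_start_with_digit flag (hypothetical helper).
fn classify(word: &str, digit_idents: bool) -> &'static str {
    let starts_with_digit = word.chars().next().map_or(false, |c| c.is_ascii_digit());
    let has_ident_chars = word.chars().any(|c| c.is_ascii_alphabetic() || c == '_');
    if starts_with_digit && has_ident_chars {
        // The whole word is one identifier, or it splits at the first letter.
        if digit_idents { "Identifier" } else { "Number + Identifier" }
    } else if starts_with_digit {
        "Number"
    } else {
        "Identifier"
    }
}

fn main() {
    assert_eq!(classify("1_a", true), "Identifier");          // Hive/MySQL style
    assert_eq!(classify("1a", false), "Number + Identifier"); // strict split
    assert_eq!(classify("123", true), "Number");
}
```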
hex_number_strings: bool
    Whether a 0x/0X prefix is treated as a hex literal. When true, 0XCC is tokenized as a single hex token instead of Number("0") + Identifier("XCC"). Used by BigQuery, SQLite, and Teradata.
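Recognizing the prefix amounts to checking that everything after 0x/0X is a hex digit; a minimal sketch (hypothetical helper, not the crate's scanner):

```rust
/// Illustrative 0x/0X recognition, per the hex_number_strings flag.
fn is_hex_literal(word: &str) -> bool {
    let rest = match word.strip_prefix("0x").or_else(|| word.strip_prefix("0X")) {
        Some(r) => r,
        None => return false,
    };
    // At least one digit, and all of them must be hex.
    !rest.is_empty() && rest.chars().all(|c| c.is_ascii_hexdigit())
}

fn main() {
    assert!(is_hex_literal("0XCC")); // one hex token when the flag is on
    assert!(is_hex_literal("0xcc"));
    assert!(!is_hex_literal("0XZZ")); // not hex digits
    assert!(!is_hex_literal("99"));   // no prefix
}
```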
hex_string_is_integer_type: bool
    Whether hex string literals from the 0x prefix represent integer values. When true (BigQuery), 0xA is tokenized as HexNumber (an integer in hex notation). When false (SQLite, Teradata), 0xCC is tokenized as HexString (a binary/blob value).
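The two interpretations of the digits after 0x can be sketched side by side (hypothetical helpers, not the crate's API):

```rust
/// Illustrative BigQuery-style reading: the hex digits are an integer.
fn as_hex_number(digits: &str) -> Option<i64> {
    i64::from_str_radix(digits, 16).ok()
}

/// Illustrative SQLite/Teradata-style reading: pairs of hex digits
/// decode to raw bytes (a binary/blob value).
fn as_hex_string(digits: &str) -> Option<Vec<u8>> {
    if digits.len() % 2 != 0 {
        return None; // an odd digit count cannot form whole bytes
    }
    (0..digits.len())
        .step_by(2)
        .map(|i| u8::from_str_radix(&digits[i..i + 2], 16).ok())
        .collect()
}

fn main() {
    assert_eq!(as_hex_number("A"), Some(10));          // 0xA as an integer
    assert_eq!(as_hex_string("CC"), Some(vec![0xCC])); // 0xCC as a blob
    assert_eq!(as_hex_string("C"), None);              // odd length
}
```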
string_escapes_allowed_in_raw_strings: bool
    Whether string escape sequences (like \') are allowed in raw strings. When true (the BigQuery default), \' inside r'…' escapes the quote. When false (Spark/Databricks), backslashes in raw strings are always literal. Mirrors Python sqlglot's STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS (default True).
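The difference shows up when scanning for the closing quote of a raw string; a minimal sketch of the two behaviors (hypothetical helper, not the crate's scanner):

```rust
/// Illustrative scan for the closing quote of a raw string r'…'.
/// `body` is the text after the opening quote; returns the index of
/// the closing quote, or None if the string is unterminated.
fn find_close(body: &str, escapes_allowed: bool) -> Option<usize> {
    let chars: Vec<char> = body.chars().collect();
    let mut i = 0;
    while i < chars.len() {
        if escapes_allowed && chars[i] == '\\' && i + 1 < chars.len() {
            i += 2; // BigQuery-style: \' does not terminate the string
            continue;
        }
        if chars[i] == '\'' {
            return Some(i);
        }
        i += 1; // Spark-style: the backslash is just a literal character
    }
    None
}

fn main() {
    let body = "a\\'b'"; // i.e. the source text  a\'b'
    assert_eq!(find_close(body, true), Some(4));  // escape skips the quote
    assert_eq!(find_close(body, false), Some(2)); // backslash is literal
}
```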
Trait Implementations
impl Clone for TokenizerConfig

    fn clone(&self) -> TokenizerConfig

    fn clone_from(&mut self, source: &Self)
        Performs copy-assignment from source.