Struct TokenizerMap

Source

pub struct TokenizerMap {
    pub id: String,
    pub version: String,
    pub vocab_size: i64,
    pub vocab: Option<HashMap<String, u32>>,
    pub tokens: Option<HashMap<String, String>>,
    pub encoder: Option<String>,
    pub merges: Option<Vec<String>>,
    pub pre_tokenizer_pattern: Option<String>,
    pub pre_tokenizer_program: Option<PreTokProgram>,
    pub byte_fallback_start: Option<i64>,
    pub byte_fallback_end: Option<i64>,
    pub special_tokens: Option<HashMap<String, u32>>,
    pub tool_calling: Option<ToolCallingBlock>,
    pub published_at: Option<String>,
}

Expand description

A per-model tokenizer dialect — the data needed to encode text into token IDs and decode IDs back to text.

Maps are immutable once published; a new model version publishes a new map at a new URL with a new sha256 hash.

Schema v2: TokenizerMap::vocab is the raw HuggingFace tokenizer.json form (byte-level GPT-2-encoded chars or ▁-prefixed metaspace strings). TokenizerMap::tokens is the legacy v1 field, kept for backwards compatibility — the crate::Detokenizer reads from whichever is present.

Fields§

§id: String

Stable, globally unique tokenizer identifier (e.g. "qwen/qwen2").

§version: String

Schema version. "2" for v2 maps; "1" for legacy v1.

§vocab_size: i64

Total number of token IDs in the vocabulary.

§vocab: Option<HashMap<String, u32>>

Vocabulary as { raw_token_text → id }. v2 schema field.

§tokens: Option<HashMap<String, String>>

Legacy v1 vocabulary as { id_string → decoded_text }.

§encoder: Option<String>

Encoder family: "byte_level", "metaspace", or omitted (identity).

§merges: Option<Vec<String>>

BPE merges in priority order (lower index = higher priority).

§pre_tokenizer_pattern: Option<String>

Pre-tokenizer regex pattern. Required for byte_level BPE when pre_tokenizer_program is absent.

§pre_tokenizer_program: Option<PreTokProgram>

Compiled pre-tokenizer program. Preferred over pre_tokenizer_pattern when present — the runtime executes the ops directly with no regex engine, which unblocks the GPT-2-family maps whose (?i:...) and (?!\S) syntax the regex crate doesn’t support. See crate::pretok_program::PreTokProgram and spec/PRETOKENIZER_PROGRAM.md.

§byte_fallback_start: Option<i64>

First ID in the byte-fallback range (inclusive). SentencePiece only.

§byte_fallback_end: Option<i64>

Last ID in the byte-fallback range (inclusive). SentencePiece only.

§special_tokens: Option<HashMap<String, u32>>

Named special tokens. Skipped during text rendering by default.

§tool_calling: Option<ToolCallingBlock>

Per-model tool-calling convention. Optional; populated by @codecai/maps-cli when it detects a known chat-template signature. Absent on maps generated before this block existed; readers MUST treat absence as “convention not declared” rather than as an error. See spec/PROTOCOL.md § “Tool-call calling conventions in the map”.

§published_at: Option<String>

ISO 8601 publish timestamp. Informational.

Struct TokenizerMap

Fields§

Implementations§

impl TokenizerMap

pub fn from_json(json: &[u8]) -> Result<Self, TokenizerMapError>

pub fn from_json_str(json: &str) -> Result<Self, TokenizerMapError>

pub fn verify_sha256(bytes: &[u8], expected: &str) -> Result<String, (String, String)>

pub fn validate(map: &Self) -> Result<(), TokenizerMapError>

Trait Implementations§

impl Clone for TokenizerMap

fn clone(&self) -> TokenizerMap

fn clone_from(&mut self, source: &Self)

impl Debug for TokenizerMap

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for TokenizerMap

fn default() -> TokenizerMap

impl<'de> Deserialize<'de> for TokenizerMap

fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where __D: Deserializer<'de>,

impl Serialize for TokenizerMap

fn serialize<__S>(&self, __serializer: __S) -> Result<__S::Ok, __S::Error> where __S: Serializer,

Auto Trait Implementations§

impl Freeze for TokenizerMap

impl RefUnwindSafe for TokenizerMap

impl Send for TokenizerMap

impl Sync for TokenizerMap

impl Unpin for TokenizerMap

impl UnsafeUnpin for TokenizerMap

impl UnwindSafe for TokenizerMap

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> PolicyExt for T where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P> where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P> where T: Policy<B, E>, P: Policy<B, E>,

impl<T> Same for T

type Output = T

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<T> DeserializeOwned for T where T: for<'de> Deserialize<'de>,

Struct TokenizerMap

fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>,

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T> PolicyExt for T
where T: ?Sized,

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> DeserializeOwned for T
where T: for<'de> Deserialize<'de>,