pub struct MiDataset {
pub tokens: Vec<usize>,
pub tokenizer: MiTokenizer,
}
MI-tokenized dataset. Single-pass mutual information tokenizer (BA-37). ~500x faster than BPE on large corpora. φ-threshold significance gating.
Fields§
§tokens: Vec<usize>
§tokenizer: MiTokenizer
Implementations§
Source§impl MiDataset
impl MiDataset
pub fn from_text(text: &str, target_vocab: usize) -> Self
pub fn from_jsonl(path: &Path, target_vocab: usize) -> Result<Self>
pub fn from_file(path: &Path, target_vocab: usize) -> Result<Self>
Source§pub fn from_dir(path: &Path, target_vocab: usize) -> Result<Self>
pub fn from_dir(path: &Path, target_vocab: usize) -> Result<Self>
Load from a directory of .txt files with MI tokenization.
Source§pub fn from_path(path: &Path, target_vocab: usize) -> Result<Self>
pub fn from_path(path: &Path, target_vocab: usize) -> Result<Self>
Auto-detect format: .jsonl, .txt file, or directory of .txt files.
pub fn vocab_size(&self) -> usize
pub fn len(&self) -> usize
pub fn decode(&self, tokens: &[usize]) -> String
Auto Trait Implementations§
impl Freeze for MiDataset
impl RefUnwindSafe for MiDataset
impl Send for MiDataset
impl Sync for MiDataset
impl Unpin for MiDataset
impl UnsafeUnpin for MiDataset
impl UnwindSafe for MiDataset
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more