Expand description
TrustformeRS Tokenizers - Tokenization library for transformer models
Re-exports
pub use advanced_vocab_intelligence::ActionableRecommendation;pub use advanced_vocab_intelligence::CompressionAnalysis;pub use advanced_vocab_intelligence::CompressionOpportunity;pub use advanced_vocab_intelligence::CompressionOpportunityType;pub use advanced_vocab_intelligence::CrossLingualAnalysis;pub use advanced_vocab_intelligence::DeclineToken;pub use advanced_vocab_intelligence::DomainAnalysis;pub use advanced_vocab_intelligence::DomainDistribution;pub use advanced_vocab_intelligence::EvolutionAnalysis;pub use advanced_vocab_intelligence::EvolutionPrediction;pub use advanced_vocab_intelligence::ImplementationDifficulty;pub use advanced_vocab_intelligence::LanguageCoverage;pub use advanced_vocab_intelligence::MergeRisk;pub use advanced_vocab_intelligence::MultilingualOpportunity;pub use advanced_vocab_intelligence::RecommendationCategory;pub use advanced_vocab_intelligence::RecommendationPriority;pub use advanced_vocab_intelligence::RedundantTokenGroup;pub use advanced_vocab_intelligence::RiskAssessment;pub use advanced_vocab_intelligence::RiskLevel;pub use advanced_vocab_intelligence::SemanticAnalysis;pub use advanced_vocab_intelligence::SemanticCluster;pub use advanced_vocab_intelligence::TrendingToken;pub use advanced_vocab_intelligence::VocabIntelligenceAnalyzer;pub use advanced_vocab_intelligence::VocabIntelligenceConfig;pub use advanced_vocab_intelligence::VocabIntelligenceResult;pub use alignment::AlignedSpan;pub use alignment::AlignmentConfig;pub use alignment::AlignmentEngine;pub use alignment::AlignmentStats;pub use alignment::TokenAlignment;pub use alignment::Word;pub use arabic::ArabicMode;pub use arabic::ArabicTokenizer;pub use arabic::ArabicTokenizerConfig;pub use arabic::MorphologicalAnalysis;pub use arabic::TokenizationStats;pub use async_tokenizer::AsyncTokenizer;pub use async_tokenizer::AsyncTokenizerConfig;pub use async_tokenizer::AsyncTokenizerWrapper;pub use async_tokenizer::ConfigurableAsyncTokenizer;pub use 
benchmark_utils::BenchmarkConfig;pub use benchmark_utils::BenchmarkResult as TokenizerBenchmarkResult;pub use benchmark_utils::TokenizerBenchmark;pub use binary_format::BinaryConfig;pub use binary_format::BinaryHeader;pub use binary_format::BinarySerializer;pub use binary_format::BinaryTokenizer;pub use binary_format::BinaryUtils;pub use binary_format::NormalizationRule as BinaryNormalizationRule;pub use binary_format::PreTokenizationRule as BinaryPreTokenizationRule;pub use binary_format::TokenizerConverter;pub use bio::BioAnalysis;pub use bio::BioToken;pub use bio::BioTokenMetadata;pub use bio::BioTokenType;pub use bio::BioTokenizer;pub use bio::BioTokenizerConfig;pub use bpe::BPETokenizer;pub use canine::CanineTokenizer;pub use char::CharTokenizer;pub use chemical::ChemicalAnalysis;pub use chemical::ChemicalToken;pub use chemical::ChemicalTokenMetadata;pub use chemical::ChemicalTokenType;pub use chemical::ChemicalTokenizer;pub use chemical::ChemicalTokenizerConfig;pub use chinese::ChineseTokenizer;pub use chinese::ChineseTokenizerConfig;pub use code_tokenizer::CodeToken;pub use code_tokenizer::CodeTokenType;pub use code_tokenizer::CodeTokenizer;pub use code_tokenizer::CodeTokenizerConfig;pub use code_tokenizer::CommentPatterns;pub use code_tokenizer::Language;pub use code_tokenizer::LiteralType;pub use code_tokenizer::TokenPosition;pub use compressed_vocab::CompressedVocab;pub use compressed_vocab::CompressedVocabStats;pub use compressed_vocab::PrefixTrie;pub use coverage::CharacterCoverage;pub use coverage::CoverageAnalyzer;pub use coverage::CoverageConfig;pub use coverage::CoverageExample;pub use coverage::CoverageReport;pub use coverage::CoverageReportExporter;pub use coverage::CoverageThresholds;pub use coverage::CoverageWarning;pub use coverage::PerformanceMetrics;pub use coverage::QualityMetrics;pub use coverage::ReportFormat;pub use coverage::TokenDistribution;pub use coverage::VocabularyCoverage;pub use custom::CustomVocabTokenizer;pub use 
custom::CustomVocabTokenizerBuilder;pub use custom_format::CustomFormatConverter;pub use custom_format::CustomFormatTokenizer;pub use custom_format::CustomSpecialToken;pub use custom_format::CustomToken;pub use custom_format::CustomTokenizerFormat;pub use custom_format::CustomVocabulary;pub use custom_format::NormalizationRule as CustomNormalizationRule;pub use custom_format::NormalizationType;pub use custom_format::PostProcessingRule;pub use custom_format::PostProcessingType;pub use custom_format::PreTokenizationRule as CustomPreTokenizationRule;pub use custom_format::PreTokenizationType;pub use custom_format::SpecialTokenType;pub use custom_format::VocabularyType;pub use fairseq::FairseqDictionaryBuilder;pub use fairseq::FairseqTokenizer;pub use japanese::JapaneseMode;pub use japanese::JapaneseTokenizer;pub use japanese::JapaneseTokenizerConfig;pub use korean::KoreanMode;pub use korean::KoreanTokenizer;pub use korean::KoreanTokenizerConfig;pub use math_tokenizer::MathAnalysis;pub use math_tokenizer::MathToken;pub use math_tokenizer::MathTokenType;pub use math_tokenizer::MathTokenizer;pub use math_tokenizer::MathTokenizerConfig;pub use messagepack_serialization::MessagePackConfig;pub use messagepack_serialization::MessagePackMergeRule;pub use messagepack_serialization::MessagePackNormalizationRule;pub use messagepack_serialization::MessagePackSerializer;pub use messagepack_serialization::MessagePackTokenizedInput;pub use messagepack_serialization::MessagePackTokenizerConfig;pub use messagepack_serialization::MessagePackTokenizerMetadata;pub use messagepack_serialization::MessagePackUtils;pub use messagepack_serialization::MessagePackVocabEntry;pub use minimal_perfect_hash::EfficiencyComparison;pub use minimal_perfect_hash::MemoryUsage;pub use minimal_perfect_hash::MinimalPerfectHash;pub use minimal_perfect_hash::MinimalPerfectHashConfig;pub use minimal_perfect_hash::MinimalPerfectHashVocab;pub use mmap_vocab::MemoryStats;pub use mmap_vocab::MmapVocab;pub use 
mmap_vocab::TokenIterator;pub use multimodal::AudioFrame;pub use multimodal::FusionStrategy;pub use multimodal::GraphData;pub use multimodal::ImagePatch;pub use multimodal::ModalityType;pub use multimodal::MultimodalConfig;pub use multimodal::MultimodalInput;pub use multimodal::MultimodalToken;pub use multimodal::MultimodalTokenMetadata;pub use multimodal::MultimodalTokenizedInput;pub use multimodal::MultimodalTokenizer;pub use multimodal::MultimodalUtils;pub use multimodal::TableData;pub use multimodal::VideoFrame;pub use music::MusicAnalysis;pub use music::MusicToken;pub use music::MusicTokenMetadata;pub use music::MusicTokenType;pub use music::MusicTokenizer;pub use music::MusicTokenizerConfig;pub use parallel::BatchTokenizer;pub use parallel::BatchedTokenizedInput;pub use parallel::ParallelTokenizer;pub use performance_profiler::BenchmarkResult;pub use performance_profiler::ExportFormat;pub use performance_profiler::MemoryStats as ProfilerMemoryStats;pub use performance_profiler::PerformanceProfiler;pub use performance_profiler::ProfilerConfig;pub use performance_profiler::ProfilingReport;pub use performance_profiler::ProfilingSummary;pub use performance_profiler::ThroughputStats;pub use performance_profiler::TimingStats;pub use performance_profiler::TokenizerComparison as ProfilerComparison;pub use protobuf_serialization::ProtobufConvertible;pub use protobuf_serialization::ProtobufExportConfig;pub use protobuf_serialization::ProtobufExporter;pub use protobuf_serialization::ProtobufFormat;pub use protobuf_serialization::ProtobufMergeRule;pub use protobuf_serialization::ProtobufNormalizationRule;pub use protobuf_serialization::ProtobufSerializer;pub use protobuf_serialization::ProtobufTokenizedInput;pub use protobuf_serialization::ProtobufTokenizerMetadata;pub use protobuf_serialization::ProtobufTokenizerModel;pub use protobuf_serialization::ProtobufVocabEntry;pub use sentencepiece::SentencePieceTokenizer;pub use sequence_packing::AdvancedSequencePacker;pub use 
sequence_packing::PackedSequence;pub use sequence_packing::PackingConfig;pub use sequence_packing::PackingInfo;pub use sequence_packing::PackingStats;pub use sequence_packing::PackingStrategy;pub use sequence_packing::SequencePacker;pub use shared_vocab_pool::PooledVocab;pub use shared_vocab_pool::VocabPoolConfig;pub use shared_vocab_pool::VocabPoolStats;pub use simd::SimdTokenizer;pub use special_tokens::AdvancedTemplate;pub use special_tokens::ConversationMessage;pub use special_tokens::PlaceholderProcessor;pub use special_tokens::PlaceholderToken;pub use special_tokens::PlaceholderType;pub use special_tokens::SpecialTokenConfig;pub use special_tokens::SpecialTokenManager;pub use streaming::BatchedStreamingTokenizer;pub use streaming::StreamingTokenizer;pub use streaming::TextFileIterator;pub use subword_regularization::SubwordRegularizationConfig;pub use subword_regularization::SubwordRegularizer;pub use subword_regularization::UnigramSubwordRegularizer;pub use test_infrastructure::BenchmarkResults;pub use test_infrastructure::CrossValidationResults;pub use test_infrastructure::CrossValidationRunner;pub use test_infrastructure::FuzzingResults;pub use test_infrastructure::InconsistencySeverity;pub use test_infrastructure::RegressionResults;pub use test_infrastructure::TestCaseGenerator;pub use test_infrastructure::TestConfig;pub use test_infrastructure::TestReportUtils;pub use test_infrastructure::TestResult;pub use test_infrastructure::TestRunner;pub use test_infrastructure::TestSuiteResult;pub use thai::ThaiMode;pub use thai::ThaiTokenizer;pub use thai::ThaiTokenizerConfig;pub use tiktoken::TiktokenTokenizer;pub use tokenization_debugger::CharacterAnalysis;pub use tokenization_debugger::CompressionStats;pub use tokenization_debugger::DebugAnalysis;pub use tokenization_debugger::DebugSession;pub use tokenization_debugger::DebuggerConfig;pub use tokenization_debugger::DetectedIssue;pub use tokenization_debugger::IssueSeverity;pub use 
tokenization_debugger::IssueType;pub use tokenization_debugger::PatternAnalysis;pub use tokenization_debugger::PerformanceStats;pub use tokenization_debugger::TokenizationDebugger;pub use tokenization_debugger::TokenizationResult;pub use tokenizer::TokenizedInputWithAlignment;pub use tokenizer::TokenizedInputWithOffsets;pub use tokenizer::TokenizerImpl;pub use tokenizer::TokenizerWrapper;pub use training::CoverageAnalysis;pub use training::DistributedTrainingCoordinator;pub use training::LanguageDetectionResult;pub use training::LanguageDetector;pub use training::StreamingTrainer;pub use training::TokenDistributionAnalyzer;pub use training::TokenDistributionResult;pub use training::TrainingCheckpoint;pub use unigram::UnigramTokenizer;pub use visualization::ComparisonStats;pub use visualization::TokenInfo;pub use visualization::TokenVisualization;pub use visualization::TokenVisualizer;pub use visualization::TokenizationStats as VisualizationTokenizationStats;pub use visualization::TokenizerComparison;pub use visualization::VisualizationConfig;pub use vocab::FlexibleVocab;pub use vocab::LazyVocab;pub use vocab::MergeStrategy;pub use vocab::Vocab;pub use vocab_analyzer::CharacterPattern;pub use vocab_analyzer::CoverageAnalysis as VocabCoverageAnalysis;pub use vocab_analyzer::FrequencyAnalysis;pub use vocab_analyzer::IssueSeverity as VocabIssueSeverity;pub use vocab_analyzer::LanguageDistribution;pub use vocab_analyzer::SubwordPattern;pub use vocab_analyzer::VocabAnalysisConfig;pub use vocab_analyzer::VocabAnalysisResult;pub use vocab_analyzer::VocabAnalyzer;pub use vocab_analyzer::VocabBasicStats;pub use vocab_analyzer::VocabDebugUtils;pub use vocab_analyzer::VocabIssue;pub use vocab_analyzer::VocabIssueType;pub use wordpiece::WordPieceTokenizer;pub use zero_copy::ZeroCopyBuilder;pub use zero_copy::ZeroCopyHeader;pub use zero_copy::ZeroCopyMemoryStats;pub use zero_copy::ZeroCopyTokenizer;pub use zero_copy::ZeroCopyUtils;pub use zero_copy::ZeroCopyVocabEntry;
Modules
- advanced_vocab_intelligence
- Advanced Vocabulary Intelligence System for TrustformeRS Tokenizers
- alignment
- arabic
- Arabic tokenization support for TrustformeRS
- async_tokenizer
- benchmark_utils
- Benchmark utilities for tokenization performance measurement
- binary_format
- bio
- Biological sequence tokenizer for TrustformeRS
- bpe
- canine
- char
- chemical
- Chemical notation tokenizer for TrustformeRS
- chinese
- code_tokenizer
- compressed_vocab
- coverage
- Coverage reporting for TrustformeRS tokenizers
- custom
- custom_format
- fairseq
- japanese
- korean
- math_tokenizer
- messagepack_serialization
- minimal_perfect_hash
- mmap_vocab
- multimodal
- Multimodal tokenization for TrustformeRS
- music
- Music notation tokenizer for TrustformeRS
- normalizer
- parallel
- performance_profiler
- protobuf_serialization
- regex_tokenizer
- sentencepiece
- sequence_packing
- shared_vocab_pool
- simd
- special_tokens
- streaming
- subword_regularization
- test_infrastructure
- thai
- tiktoken
- tokenization_debugger
- tokenizer
- training
- Tokenizer Training Module Organization
- unigram
- visualization
- vocab
- Vocabulary types and implementations
- vocab_analyzer
- wordpiece
- zero_copy