Skip to main content

Crate trustformers_tokenizers

Crate trustformers_tokenizers 

Source
Expand description

TrustformeRS Tokenizers — a tokenization library for transformer models

Re-exports§

pub use advanced_vocab_intelligence::ActionableRecommendation;
pub use advanced_vocab_intelligence::CompressionAnalysis;
pub use advanced_vocab_intelligence::CompressionOpportunity;
pub use advanced_vocab_intelligence::CompressionOpportunityType;
pub use advanced_vocab_intelligence::CrossLingualAnalysis;
pub use advanced_vocab_intelligence::DeclineToken;
pub use advanced_vocab_intelligence::DomainAnalysis;
pub use advanced_vocab_intelligence::DomainDistribution;
pub use advanced_vocab_intelligence::EvolutionAnalysis;
pub use advanced_vocab_intelligence::EvolutionPrediction;
pub use advanced_vocab_intelligence::ImplementationDifficulty;
pub use advanced_vocab_intelligence::LanguageCoverage;
pub use advanced_vocab_intelligence::MergeRisk;
pub use advanced_vocab_intelligence::MultilingualOpportunity;
pub use advanced_vocab_intelligence::RecommendationCategory;
pub use advanced_vocab_intelligence::RecommendationPriority;
pub use advanced_vocab_intelligence::RedundantTokenGroup;
pub use advanced_vocab_intelligence::RiskAssessment;
pub use advanced_vocab_intelligence::RiskLevel;
pub use advanced_vocab_intelligence::SemanticAnalysis;
pub use advanced_vocab_intelligence::SemanticCluster;
pub use advanced_vocab_intelligence::TrendingToken;
pub use advanced_vocab_intelligence::VocabIntelligenceAnalyzer;
pub use advanced_vocab_intelligence::VocabIntelligenceConfig;
pub use advanced_vocab_intelligence::VocabIntelligenceResult;
pub use alignment::AlignedSpan;
pub use alignment::AlignmentConfig;
pub use alignment::AlignmentEngine;
pub use alignment::AlignmentStats;
pub use alignment::TokenAlignment;
pub use alignment::Word;
pub use arabic::ArabicMode;
pub use arabic::ArabicTokenizer;
pub use arabic::ArabicTokenizerConfig;
pub use arabic::MorphologicalAnalysis;
pub use arabic::TokenizationStats;
pub use async_tokenizer::AsyncTokenizer;
pub use async_tokenizer::AsyncTokenizerConfig;
pub use async_tokenizer::AsyncTokenizerWrapper;
pub use async_tokenizer::ConfigurableAsyncTokenizer;
pub use benchmark_utils::BenchmarkConfig;
pub use benchmark_utils::BenchmarkResult as TokenizerBenchmarkResult;
pub use benchmark_utils::TokenizerBenchmark;
pub use binary_format::BinaryConfig;
pub use binary_format::BinaryHeader;
pub use binary_format::BinarySerializer;
pub use binary_format::BinaryTokenizer;
pub use binary_format::BinaryUtils;
pub use binary_format::NormalizationRule as BinaryNormalizationRule;
pub use binary_format::PreTokenizationRule as BinaryPreTokenizationRule;
pub use binary_format::TokenizerConverter;
pub use bio::BioAnalysis;
pub use bio::BioToken;
pub use bio::BioTokenMetadata;
pub use bio::BioTokenType;
pub use bio::BioTokenizer;
pub use bio::BioTokenizerConfig;
pub use bpe::BPETokenizer;
pub use canine::CanineTokenizer;
pub use char::CharTokenizer;
pub use chemical::ChemicalAnalysis;
pub use chemical::ChemicalToken;
pub use chemical::ChemicalTokenMetadata;
pub use chemical::ChemicalTokenType;
pub use chemical::ChemicalTokenizer;
pub use chemical::ChemicalTokenizerConfig;
pub use chinese::ChineseTokenizer;
pub use chinese::ChineseTokenizerConfig;
pub use code_tokenizer::CodeToken;
pub use code_tokenizer::CodeTokenType;
pub use code_tokenizer::CodeTokenizer;
pub use code_tokenizer::CodeTokenizerConfig;
pub use code_tokenizer::CommentPatterns;
pub use code_tokenizer::Language;
pub use code_tokenizer::LiteralType;
pub use code_tokenizer::TokenPosition;
pub use compressed_vocab::CompressedVocab;
pub use compressed_vocab::CompressedVocabStats;
pub use compressed_vocab::PrefixTrie;
pub use coverage::CharacterCoverage;
pub use coverage::CoverageAnalyzer;
pub use coverage::CoverageConfig;
pub use coverage::CoverageExample;
pub use coverage::CoverageReport;
pub use coverage::CoverageReportExporter;
pub use coverage::CoverageThresholds;
pub use coverage::CoverageWarning;
pub use coverage::PerformanceMetrics;
pub use coverage::QualityMetrics;
pub use coverage::ReportFormat;
pub use coverage::TokenDistribution;
pub use coverage::VocabularyCoverage;
pub use custom::CustomVocabTokenizer;
pub use custom::CustomVocabTokenizerBuilder;
pub use custom_format::CustomFormatConverter;
pub use custom_format::CustomFormatTokenizer;
pub use custom_format::CustomSpecialToken;
pub use custom_format::CustomToken;
pub use custom_format::CustomTokenizerFormat;
pub use custom_format::CustomVocabulary;
pub use custom_format::NormalizationRule as CustomNormalizationRule;
pub use custom_format::NormalizationType;
pub use custom_format::PostProcessingRule;
pub use custom_format::PostProcessingType;
pub use custom_format::PreTokenizationRule as CustomPreTokenizationRule;
pub use custom_format::PreTokenizationType;
pub use custom_format::SpecialTokenType;
pub use custom_format::VocabularyType;
pub use fairseq::FairseqDictionaryBuilder;
pub use fairseq::FairseqTokenizer;
pub use japanese::JapaneseMode;
pub use japanese::JapaneseTokenizer;
pub use japanese::JapaneseTokenizerConfig;
pub use korean::KoreanMode;
pub use korean::KoreanTokenizer;
pub use korean::KoreanTokenizerConfig;
pub use math_tokenizer::MathAnalysis;
pub use math_tokenizer::MathToken;
pub use math_tokenizer::MathTokenType;
pub use math_tokenizer::MathTokenizer;
pub use math_tokenizer::MathTokenizerConfig;
pub use messagepack_serialization::MessagePackConfig;
pub use messagepack_serialization::MessagePackMergeRule;
pub use messagepack_serialization::MessagePackNormalizationRule;
pub use messagepack_serialization::MessagePackSerializer;
pub use messagepack_serialization::MessagePackTokenizedInput;
pub use messagepack_serialization::MessagePackTokenizerConfig;
pub use messagepack_serialization::MessagePackTokenizerMetadata;
pub use messagepack_serialization::MessagePackUtils;
pub use messagepack_serialization::MessagePackVocabEntry;
pub use minimal_perfect_hash::EfficiencyComparison;
pub use minimal_perfect_hash::MemoryUsage;
pub use minimal_perfect_hash::MinimalPerfectHash;
pub use minimal_perfect_hash::MinimalPerfectHashConfig;
pub use minimal_perfect_hash::MinimalPerfectHashVocab;
pub use mmap_vocab::MemoryStats;
pub use mmap_vocab::MmapVocab;
pub use mmap_vocab::TokenIterator;
pub use multimodal::AudioFrame;
pub use multimodal::FusionStrategy;
pub use multimodal::GraphData;
pub use multimodal::ImagePatch;
pub use multimodal::ModalityType;
pub use multimodal::MultimodalConfig;
pub use multimodal::MultimodalInput;
pub use multimodal::MultimodalToken;
pub use multimodal::MultimodalTokenMetadata;
pub use multimodal::MultimodalTokenizedInput;
pub use multimodal::MultimodalTokenizer;
pub use multimodal::MultimodalUtils;
pub use multimodal::TableData;
pub use multimodal::VideoFrame;
pub use music::MusicAnalysis;
pub use music::MusicToken;
pub use music::MusicTokenMetadata;
pub use music::MusicTokenType;
pub use music::MusicTokenizer;
pub use music::MusicTokenizerConfig;
pub use parallel::BatchTokenizer;
pub use parallel::BatchedTokenizedInput;
pub use parallel::ParallelTokenizer;
pub use performance_profiler::BenchmarkResult;
pub use performance_profiler::ExportFormat;
pub use performance_profiler::MemoryStats as ProfilerMemoryStats;
pub use performance_profiler::PerformanceProfiler;
pub use performance_profiler::ProfilerConfig;
pub use performance_profiler::ProfilingReport;
pub use performance_profiler::ProfilingSummary;
pub use performance_profiler::ThroughputStats;
pub use performance_profiler::TimingStats;
pub use performance_profiler::TokenizerComparison as ProfilerComparison;
pub use protobuf_serialization::ProtobufConvertible;
pub use protobuf_serialization::ProtobufExportConfig;
pub use protobuf_serialization::ProtobufExporter;
pub use protobuf_serialization::ProtobufFormat;
pub use protobuf_serialization::ProtobufMergeRule;
pub use protobuf_serialization::ProtobufNormalizationRule;
pub use protobuf_serialization::ProtobufSerializer;
pub use protobuf_serialization::ProtobufTokenizedInput;
pub use protobuf_serialization::ProtobufTokenizerMetadata;
pub use protobuf_serialization::ProtobufTokenizerModel;
pub use protobuf_serialization::ProtobufVocabEntry;
pub use sentencepiece::SentencePieceTokenizer;
pub use sequence_packing::AdvancedSequencePacker;
pub use sequence_packing::PackedSequence;
pub use sequence_packing::PackingConfig;
pub use sequence_packing::PackingInfo;
pub use sequence_packing::PackingStats;
pub use sequence_packing::PackingStrategy;
pub use sequence_packing::SequencePacker;
pub use shared_vocab_pool::PooledVocab;
pub use shared_vocab_pool::SharedVocabPool;
pub use shared_vocab_pool::VocabPoolConfig;
pub use shared_vocab_pool::VocabPoolStats;
pub use simd::SimdTokenizer;
pub use special_tokens::AdvancedTemplate;
pub use special_tokens::ConversationMessage;
pub use special_tokens::PlaceholderProcessor;
pub use special_tokens::PlaceholderToken;
pub use special_tokens::PlaceholderType;
pub use special_tokens::SpecialTokenConfig;
pub use special_tokens::SpecialTokenManager;
pub use streaming::BatchedStreamingTokenizer;
pub use streaming::StreamingTokenizer;
pub use streaming::TextFileIterator;
pub use subword_regularization::SubwordRegularizationConfig;
pub use subword_regularization::SubwordRegularizer;
pub use subword_regularization::UnigramSubwordRegularizer;
pub use test_infrastructure::BenchmarkResults;
pub use test_infrastructure::CrossValidationResults;
pub use test_infrastructure::CrossValidationRunner;
pub use test_infrastructure::FuzzingResults;
pub use test_infrastructure::InconsistencySeverity;
pub use test_infrastructure::RegressionResults;
pub use test_infrastructure::TestCaseGenerator;
pub use test_infrastructure::TestConfig;
pub use test_infrastructure::TestReportUtils;
pub use test_infrastructure::TestResult;
pub use test_infrastructure::TestRunner;
pub use test_infrastructure::TestSuiteResult;
pub use thai::ThaiMode;
pub use thai::ThaiTokenizer;
pub use thai::ThaiTokenizerConfig;
pub use tiktoken::TiktokenTokenizer;
pub use tokenization_debugger::CharacterAnalysis;
pub use tokenization_debugger::CompressionStats;
pub use tokenization_debugger::DebugAnalysis;
pub use tokenization_debugger::DebugSession;
pub use tokenization_debugger::DebuggerConfig;
pub use tokenization_debugger::DetectedIssue;
pub use tokenization_debugger::IssueSeverity;
pub use tokenization_debugger::IssueType;
pub use tokenization_debugger::PatternAnalysis;
pub use tokenization_debugger::PerformanceStats;
pub use tokenization_debugger::TokenizationDebugger;
pub use tokenization_debugger::TokenizationResult;
pub use tokenizer::TokenizedInputWithAlignment;
pub use tokenizer::TokenizedInputWithOffsets;
pub use tokenizer::TokenizerImpl;
pub use tokenizer::TokenizerWrapper;
pub use training::CoverageAnalysis;
pub use training::DistributedTrainingCoordinator;
pub use training::LanguageDetectionResult;
pub use training::LanguageDetector;
pub use training::StreamingTrainer;
pub use training::TokenDistributionAnalyzer;
pub use training::TokenDistributionResult;
pub use training::TrainingCheckpoint;
pub use unigram::UnigramTokenizer;
pub use visualization::ComparisonStats;
pub use visualization::TokenInfo;
pub use visualization::TokenVisualization;
pub use visualization::TokenVisualizer;
pub use visualization::TokenizationStats as VisualizationTokenizationStats;
pub use visualization::TokenizerComparison;
pub use visualization::VisualizationConfig;
pub use vocab::FlexibleVocab;
pub use vocab::LazyVocab;
pub use vocab::MergeStrategy;
pub use vocab::Vocab;
pub use vocab_analyzer::CharacterPattern;
pub use vocab_analyzer::CoverageAnalysis as VocabCoverageAnalysis;
pub use vocab_analyzer::FrequencyAnalysis;
pub use vocab_analyzer::IssueSeverity as VocabIssueSeverity;
pub use vocab_analyzer::LanguageDistribution;
pub use vocab_analyzer::SubwordPattern;
pub use vocab_analyzer::VocabAnalysisConfig;
pub use vocab_analyzer::VocabAnalysisResult;
pub use vocab_analyzer::VocabAnalyzer;
pub use vocab_analyzer::VocabBasicStats;
pub use vocab_analyzer::VocabDebugUtils;
pub use vocab_analyzer::VocabIssue;
pub use vocab_analyzer::VocabIssueType;
pub use wordpiece::WordPieceTokenizer;
pub use zero_copy::ZeroCopyBuilder;
pub use zero_copy::ZeroCopyHeader;
pub use zero_copy::ZeroCopyMemoryStats;
pub use zero_copy::ZeroCopyTokenizer;
pub use zero_copy::ZeroCopyUtils;
pub use zero_copy::ZeroCopyVocabEntry;

Modules§

advanced_vocab_intelligence
Advanced Vocabulary Intelligence System for TrustformeRS Tokenizers
alignment
arabic
Arabic tokenization support for TrustformeRS
async_tokenizer
benchmark_utils
Benchmark utilities for tokenization performance measurement
binary_format
bio
Biological sequence tokenizer for TrustformeRS
bpe
canine
char
chemical
Chemical notation tokenizer for TrustformeRS
chinese
code_tokenizer
compressed_vocab
coverage
Coverage reporting for TrustformeRS tokenizers
custom
custom_format
fairseq
japanese
korean
math_tokenizer
messagepack_serialization
minimal_perfect_hash
mmap_vocab
multimodal
Multimodal tokenization for TrustformeRS
music
Music notation tokenizer for TrustformeRS
normalizer
parallel
performance_profiler
protobuf_serialization
regex_tokenizer
sentencepiece
sequence_packing
shared_vocab_pool
simd
special_tokens
streaming
subword_regularization
test_infrastructure
thai
tiktoken
tokenization_debugger
tokenizer
training
Tokenizer training utilities and module organization
unigram
visualization
vocab
Vocabulary types and implementations
vocab_analyzer
wordpiece
zero_copy