Expand description
Natural language processing operations for ToRSh
This crate provides PyTorch-compatible NLP functionality including:
- Tokenization (BPE, WordPiece, SentencePiece)
- Text embeddings (Word2Vec, GloVe, FastText)
- Text generation and beam search
- Pre-trained language models
- Text datasets and data loaders
- Analysis tools (sentiment, coherence, fluency)
Built on top of the SciRS2 ecosystem for high-performance text processing.
§Examples
ⓘ
use torsh_text::tokenization::*;
// Create a tokenizer
let tokenizer = BPETokenizer::from_pretrained("gpt2")?;
let tokens = tokenizer.encode("Hello, world!")?;
Re-exports§
- pub use generation::BeamHypothesis;
- pub use generation::BeamSearchDecoder;
- pub use generation::GenerationConfig as TextGenerationConfig;
- pub use generation::NGramRepetitionFilter;
- pub use generation::RepetitionPenalty;
- pub use generation::TextGenerator;
- pub use generation::TextSampler;
- pub use scirs2_ops::advanced_analytics::compute_advanced_stats;
- pub use scirs2_ops::advanced_analytics::AdvancedTextSampler;
- pub use scirs2_ops::advanced_analytics::AdvancedTextStats;
- pub use scirs2_ops::advanced_analytics::ComplexityAnalyzer;
- pub use scirs2_ops::advanced_analytics::ComplexityMetrics;
- pub use scirs2_ops::performance::PerformanceMetrics;
- pub use scirs2_ops::performance::PerformanceMonitor;
- pub use scirs2_text_integration::advanced_ops::cluster_documents;
- pub use scirs2_text_integration::advanced_ops::extract_topics;
- pub use scirs2_text_integration::advanced_ops::paraphrase_text;
- pub use scirs2_text_integration::ClassificationResult;
- pub use scirs2_text_integration::ClusterResult;
- pub use scirs2_text_integration::DeviceType as TextDeviceType;
- pub use scirs2_text_integration::EntityType;
- pub use scirs2_text_integration::LanguageDetection;
- pub use scirs2_text_integration::LanguageModel;
- pub use scirs2_text_integration::NamedEntity;
- pub use scirs2_text_integration::PrecisionLevel;
- pub use scirs2_text_integration::SciRS2TextProcessor;
- pub use scirs2_text_integration::SentimentLabel;
- pub use scirs2_text_integration::SentimentResult;
- pub use scirs2_text_integration::TextConfig;
- pub use scirs2_text_integration::TextEmbeddings;
- pub use scirs2_text_integration::Topic;
- pub use utils::clean_text; (Deprecated)
- pub use utils::count_words;
- pub use utils::label_encode;
- pub use utils::normalize_text; (Deprecated)
- pub use utils::one_hot_encode;
- pub use utils::pad_and_truncate_sequences;
- pub use utils::pad_sequence;
- pub use utils::split_sentences; (Deprecated)
- pub use utils::truncate_sequence;
- pub use utils::BatchProcessor;
- pub use utils::BatchTextStats;
- pub use utils::CustomStep;
- pub use utils::MaxLengthTruncateStep;
- pub use utils::MinLengthFilterStep;
- pub use utils::OptimizedBatchOps;
- pub use utils::PaddingStrategy;
- pub use utils::PreprocessingStats;
- pub use utils::PreprocessingUtils;
- pub use utils::RemoveExtraWhitespaceStep;
- pub use utils::StreamingBatchProcessor;
- pub use utils::TextAugmenter;
- pub use utils::TextCleaner;
- pub use utils::TextNormalizer;
- pub use utils::TextPreprocessingPipeline;
- pub use utils::TruncationStrategy;
- pub use analysis::*;
- pub use convenience::*;
- pub use datasets::*;
- pub use embeddings::*;
- pub use models::*;
- pub use scirs2_ops::*;
- pub use tokenization::*;
- pub use vocab::*;
Modules§
- analysis
- convenience
- Convenience utilities for common text processing tasks
- datasets
- Auto-generated module structure
- embeddings
- generation
- models
- prelude
- Prelude module for torsh-text
- scirs2_ops
- SciRS2 integration for text operations
- scirs2_text_integration
- Comprehensive scirs2-text integration for advanced NLP
- tokenization
- utils
- vocab
Macros§
- preprocessing_pipeline
- Convenience macro for creating a preprocessing pipeline
- quick_process
- Quick text processing function for common use cases
- vocabulary
- Convenience macro for creating a vocabulary with special tokens