// torsh_text/lib.rs
//! Natural language processing operations for ToRSh
//!
//! This crate provides PyTorch-compatible NLP functionality including:
//! - Tokenization (BPE, WordPiece, SentencePiece)
//! - Text embeddings (Word2Vec, GloVe, FastText)
//! - Text generation and beam search
//! - Pre-trained language models
//! - Text datasets and data loaders
//! - Analysis tools (sentiment, coherence, fluency)
//!
//! Built on top of the SciRS2 ecosystem for high-performance text processing.
//!
//! # Examples
//!
//! ```rust,ignore
//! use torsh_text::tokenization::*;
//!
//! // Create a tokenizer
//! let tokenizer = BPETokenizer::from_pretrained("gpt2")?;
//! let tokens = tokenizer.encode("Hello, world!")?;
//! ```
22
23#![allow(clippy::too_many_arguments)]
24#![allow(clippy::module_inception)]
25#![allow(clippy::large_enum_variant)]
26// Allow dead_code for intentional placeholders and future implementations
27#![allow(dead_code)]
28// Note: Some unused imports remain from auto-generated code - will clean up in future refactoring
29
30pub mod analysis;
31pub mod convenience;
32pub mod datasets;
33pub mod embeddings;
34pub mod generation;
35// pub mod metrics;  // Has complex import issues - needs significant refactoring (deferred)
36pub mod models;
37pub mod prelude;
38pub mod scirs2_ops;
39pub mod scirs2_text_integration; // Re-enabled for checking
40pub mod tokenization;
41pub mod utils;
42pub mod vocab;
43
44#[cfg(test)]
45mod test_utils;
46
47pub use analysis::*;
48pub use convenience::*;
49pub use datasets::*;
50pub use embeddings::*;
51pub use generation::{
52    BeamHypothesis, BeamSearchDecoder, GenerationConfig as TextGenerationConfig,
53    NGramRepetitionFilter, RepetitionPenalty, TextGenerator, TextSampler,
54};
55// pub use metrics::*;  // Disabled - needs significant refactoring (deferred)
56pub use models::*;
57pub use scirs2_ops::advanced_analytics::{
58    compute_advanced_stats, AdvancedTextSampler, AdvancedTextStats, ComplexityAnalyzer,
59    ComplexityMetrics,
60};
61pub use scirs2_ops::performance::{PerformanceMetrics, PerformanceMonitor};
62pub use scirs2_ops::*;
63pub use scirs2_text_integration::{
64    advanced_ops::{cluster_documents, extract_topics, paraphrase_text},
65    ClassificationResult, ClusterResult, DeviceType as TextDeviceType, EntityType,
66    LanguageDetection, LanguageModel, NamedEntity, PrecisionLevel, SciRS2TextProcessor,
67    SentimentLabel, SentimentResult, TextConfig, TextEmbeddings, Topic,
68};
69pub use tokenization::*;
70// Note: utils::PreprocessingStep (trait) is excluded to avoid ambiguity with datasets::PreprocessingStep (enum).
71// Access the trait via `torsh_text::utils::PreprocessingStep` or through the prelude.
72#[allow(deprecated)]
73pub use utils::{
74    clean_text, count_words, label_encode, normalize_text, one_hot_encode,
75    pad_and_truncate_sequences, pad_sequence, split_sentences, truncate_sequence, BatchProcessor,
76    BatchTextStats, CustomStep, MaxLengthTruncateStep, MinLengthFilterStep, OptimizedBatchOps,
77    PaddingStrategy, PreprocessingStats, PreprocessingUtils, RemoveExtraWhitespaceStep,
78    StreamingBatchProcessor, TextAugmenter, TextCleaner, TextNormalizer, TextPreprocessingPipeline,
79    TruncationStrategy,
80};
81pub use vocab::*;
82
83// Version information
84pub const VERSION: &str = env!("CARGO_PKG_VERSION");
85pub const VERSION_MAJOR: u32 = 0;
86pub const VERSION_MINOR: u32 = 1;
87pub const VERSION_PATCH: u32 = 0;
88
89#[derive(Debug, thiserror::Error)]
90pub enum TextError {
91    #[error("Tokenization error: {0}")]
92    TokenizationError(String),
93
94    #[error("Model error: {0}")]
95    ModelError(String),
96
97    #[error("Vocabulary error: {0}")]
98    VocabError(String),
99
100    #[error("Dataset error: {0}")]
101    DatasetError(String),
102
103    #[error("Validation error: {0}")]
104    ValidationError(String),
105
106    #[error("Empty input provided where non-empty input is required")]
107    EmptyInput,
108
109    #[error("Invalid parameter: {parameter} = {value}, expected {expected}")]
110    InvalidParameter {
111        parameter: String,
112        value: String,
113        expected: String,
114    },
115
116    #[error("Processing failed for {item}: {reason}")]
117    ProcessingError { item: String, reason: String },
118
119    #[error("Configuration error: {0}")]
120    ConfigurationError(String),
121
122    #[error("IO error: {0}")]
123    IoError(#[from] std::io::Error),
124
125    #[error("Tensor error: {0}")]
126    TensorError(#[from] torsh_core::TorshError),
127
128    #[error("Other error: {0}")]
129    Other(#[from] anyhow::Error),
130}
131
132pub type Result<T> = std::result::Result<T, TextError>;
133
134impl From<TextError> for torsh_core::TorshError {
135    fn from(error: TextError) -> Self {
136        torsh_core::TorshError::Other(error.to_string())
137    }
138}