// scirs2_text/lib.rs

#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
#![allow(dead_code)]
//! # SciRS2 Text - Natural Language Processing
//!
//! **scirs2-text** provides comprehensive text processing and NLP capabilities,
//! offering tokenization, TF-IDF vectorization, word embeddings, sentiment analysis,
//! topic modeling, and text classification with SIMD acceleration and parallel processing.
//!
//! ## 🎯 Key Features
//!
//! - **Tokenization**: Word, sentence, N-gram, BPE, regex tokenizers
//! - **Vectorization**: TF-IDF, count vectorizers, word embeddings
//! - **Text Processing**: Stemming, lemmatization, normalization, stopword removal
//! - **Embeddings**: Word2Vec (Skip-gram, CBOW), GloVe loading
//! - **Similarity**: Cosine, Jaccard, Levenshtein, phonetic algorithms (see the sketch after this list)
//! - **NLP**: Sentiment analysis, topic modeling (LDA), text classification
//! - **Performance**: SIMD operations, parallel processing, sparse matrices
//!
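//! For a quick feel of the similarity utilities, the crate root re-exports string-distance
//! helpers such as `levenshtein_distance` and `jaccard_similarity`. The snippet below is a
//! minimal sketch and is marked `ignore` because the exact signatures and return types are
//! assumptions rather than documented API:
//!
//! ```rust,ignore
//! use scirs2_text::{jaccard_similarity, levenshtein_distance};
//!
//! // Edit distance between two words (return type assumed).
//! let edits = levenshtein_distance("kitten", "sitting");
//!
//! // Overlap between two short documents (return type assumed).
//! let overlap = jaccard_similarity("the quick brown fox", "the quick red fox");
//!
//! println!("edits = {edits:?}, overlap = {overlap:?}");
//! ```
//!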
//! ## 📦 Module Overview
//!
//! | SciRS2 Module | Python Equivalent | Description |
//! |---------------|-------------------|-------------|
//! | `tokenize` | `nltk.tokenize` | Text tokenization utilities |
//! | `vectorize` | `sklearn.feature_extraction.text.TfidfVectorizer` | TF-IDF and count vectorization |
//! | `embeddings` | `gensim.models.Word2Vec` | Word embeddings (Word2Vec) |
//! | `sentiment` | `nltk.sentiment` | Sentiment analysis |
//! | `topic_modeling` | `sklearn.decomposition.LatentDirichletAllocation` | Topic modeling (LDA) |
//! | `stemming` | `nltk.stem` | Stemming and lemmatization |
//!
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-text = "0.1.2"
//! ```
//!
//! ```rust,no_run
//! use scirs2_text::{tokenize::WordTokenizer, vectorize::TfidfVectorizer, Tokenizer, Vectorizer};
//!
//! // Tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world!").unwrap();
//!
//! // TF-IDF vectorization
//! let docs = vec!["Hello world", "Good morning world"];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&docs).unwrap();
//! ```
//!
//! ## 🔒 Version: 0.1.2 (January 15, 2026)
//!
//! ## Extended Example
//!
//! ```rust
//! use scirs2_text::{
//!     tokenize::WordTokenizer,
//!     vectorize::TfidfVectorizer,
//!     sentiment::LexiconSentimentAnalyzer,
//!     Tokenizer, Vectorizer
//! };
//!
//! // Basic tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world! This is a test.").unwrap();
//!
//! // TF-IDF vectorization
//! let documents = vec![
//!     "The quick brown fox jumps over the lazy dog",
//!     "A quick brown dog outpaces a quick fox",
//!     "The lazy dog sleeps all day"
//! ];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&documents).unwrap();
//!
//! // Sentiment analysis
//! let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
//! let sentiment = analyzer.analyze("I love this library!").unwrap();
//! println!("Sentiment: {:?}", sentiment.sentiment);
//! ```
//!
//! ## Architecture
//!
//! The module is organized into focused sub-modules (a short composition sketch follows this list):
//!
//! - [`tokenize`]: Text tokenization utilities
//! - [`vectorize`]: Document vectorization and TF-IDF
//! - [`embeddings`]: Word embedding training and utilities
//! - [`sentiment`]: Sentiment analysis tools
//! - [`topic_modeling`]: Topic modeling with LDA
//! - [`string_metrics`]: String similarity and distance metrics
//! - [`preprocess`]: Text cleaning and normalization
//! - [`stemming`]: Stemming and lemmatization
//! - [`parallel`]: Parallel processing utilities
//! - [`simd_ops`]: SIMD-accelerated operations
//!
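//! As a sketch of how these sub-modules compose, the snippet below tokenizes a sentence and
//! stems each token. It is marked `ignore` because the `PorterStemmer` constructor and the
//! stemming method name are assumptions based on the re-exported type names, not on the
//! module documentation:
//!
//! ```rust,ignore
//! use scirs2_text::{PorterStemmer, Stemmer, Tokenizer, tokenize::WordTokenizer};
//!
//! // Tokenize first (this part mirrors the Quick Start above)...
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("running runners ran").unwrap();
//!
//! // ...then reduce each token to a stem (method name assumed).
//! let stemmer = PorterStemmer::new();
//! let stems: Vec<_> = tokens.iter().map(|t| stemmer.stem(t)).collect();
//! println!("{stems:?}");
//! ```
//!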
//! ## Performance
//!
//! SciRS2 Text is designed for high performance:
//!
//! - SIMD acceleration for string operations
//! - Parallel processing for large document collections
//! - Memory-efficient sparse matrix representations (see the sketch after this list)
//! - Zero-copy string processing where possible
//! - Optimized algorithms with complexity guarantees
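//!
//! For large corpora, the sparse vectorizers are intended to avoid materializing dense
//! term-document matrices. The sketch below assumes `SparseTfidfVectorizer` follows the same
//! `fit_transform` pattern as `TfidfVectorizer`; the constructor, return type, and the
//! `sparse_cosine_similarity` signature are assumptions, so the example is marked `ignore`:
//!
//! ```rust,ignore
//! use scirs2_text::{sparse_cosine_similarity, SparseTfidfVectorizer};
//!
//! let docs = vec!["the quick brown fox", "the lazy dog", "a quick brown dog"];
//!
//! // Fit a sparse TF-IDF model (constructor assumed).
//! let mut vectorizer = SparseTfidfVectorizer::default();
//! let rows = vectorizer.fit_transform(&docs).unwrap();
//!
//! // Compare the first two documents on their sparse vectors (signature assumed).
//! let sim = sparse_cosine_similarity(&rows[0], &rows[1]);
//! println!("similarity = {sim:?}");
//! ```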
#![warn(missing_docs)]

pub mod classification;
pub mod cleansing;
pub mod distance;
pub mod domain_processors;
pub mod embeddings;
pub mod enhanced_vectorize;
pub mod error;
pub mod huggingface_compat;
pub mod information_extraction;
pub mod ml_integration;
pub mod ml_sentiment;
pub mod model_registry;
pub mod multilingual;
pub mod neural_architectures;
pub mod parallel;
pub mod paraphrasing;
pub mod performance;
pub mod pos_tagging;
pub mod preprocess;
pub mod semantic_similarity;
pub mod sentiment;
pub mod simd_ops;
pub mod sparse;
pub mod sparse_vectorize;
pub mod spelling;
pub mod stemming;
pub mod streaming;
pub mod string_metrics;
pub mod summarization;
pub mod text_coordinator;
pub mod text_statistics;
pub mod token_filter;
pub mod tokenize;
pub mod topic_coherence;
pub mod topic_modeling;
pub mod transformer;
pub mod utils;
pub mod vectorize;
pub mod visualization;
pub mod vocabulary;
pub mod weighted_distance;

// Re-export commonly used items
pub use classification::{
    TextClassificationMetrics, TextClassificationPipeline, TextDataset, TextFeatureSelector,
};
pub use cleansing::{
    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
    replace_urls, strip_html_tags, AdvancedTextCleaner,
};
pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
pub use domain_processors::{
    Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
    MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
    ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
};
pub use embeddings::{Word2Vec, Word2VecAlgorithm, Word2VecConfig};
pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
pub use error::{Result, TextError};
pub use huggingface_compat::{
    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
};
pub use information_extraction::{
    AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
    CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
    EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
    InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
    MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
    StructuredDocumentInformation, TemporalExtractor, Topic,
};
pub use ml_integration::{
    BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
};
pub use ml_sentiment::{
    ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
};
pub use model_registry::{
    ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
    SerializableModelData,
};
pub use multilingual::{
    Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
    StopWords,
};
pub use neural_architectures::{
    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
    ResidualBlock1D, SelfAttention, TextCNN,
};
pub use parallel::{
    ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
};
pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
pub use performance::{
    AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
    PerformanceSummary, PerformanceThresholds,
};
pub use pos_tagging::{
    PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
};
pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
pub use semantic_similarity::{
    LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
    WordMoversDistance,
};
pub use sentiment::{
    LexiconSentimentAnalyzer, RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon,
    SentimentResult, SentimentRules, SentimentWordCounts,
};
pub use simd_ops::{
    AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
    TextProcessingResult,
};
pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
pub use sparse_vectorize::{
    sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
};
pub use spelling::{
    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
};
pub use stemming::{
    LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
    RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
};
pub use streaming::{
    AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
    ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
};
pub use string_metrics::{
    AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
    PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
};
pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
pub use text_coordinator::{
    AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
    AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
};
pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
pub use token_filter::{
    CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
    TokenFilter,
};
pub use tokenize::{
    bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
    CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
    WhitespaceTokenizer, WordTokenizer,
};
pub use topic_coherence::{TopicCoherence, TopicDiversity};
pub use topic_modeling::{
    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
};
pub use transformer::{
    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
    TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
    TransformerEncoderLayer, TransformerModel,
};
pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
pub use visualization::{
    AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
    TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
};
pub use vocabulary::Vocabulary;
pub use weighted_distance::{
    DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
    WeightedStringMetric,
};