// scirs2_text/lib.rs

#![allow(deprecated)]
#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
#![allow(dead_code)]
//! # SciRS2 Text - Natural Language Processing
//!
//! **scirs2-text** provides comprehensive text processing and NLP capabilities:
//! tokenization, TF-IDF vectorization, word embeddings, sentiment analysis,
//! topic modeling, and text classification, all with SIMD acceleration and parallel processing.
//!
//! ## 🎯 Key Features
//!
//! - **Tokenization**: Word, sentence, N-gram, BPE, and regex tokenizers
//! - **Vectorization**: TF-IDF, count vectorizers, word embeddings
//! - **Text Processing**: Stemming, lemmatization, normalization, stopword removal
//! - **Embeddings**: Word2Vec (Skip-gram, CBOW), GloVe loading
//! - **Similarity**: Cosine, Jaccard, Levenshtein, and phonetic algorithms (see the sketch below)
//! - **NLP**: Sentiment analysis, topic modeling (LDA), text classification
//! - **Performance**: SIMD operations, parallel processing, sparse matrices
//!
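//! For example, the string-similarity helpers re-exported at the crate root can be
//! called directly on string slices (a sketch only: the exact signatures and return
//! types shown here are assumptions, so check the [`distance`] module docs):
//!
//! ```rust,ignore
//! use scirs2_text::{jaccard_similarity, levenshtein_distance};
//!
//! // Assumed signature: fn levenshtein_distance(a: &str, b: &str) -> usize
//! let edits = levenshtein_distance("kitten", "sitting"); // classic example: 3 edits
//!
//! // Assumed signature: fn jaccard_similarity(a: &str, b: &str) -> f64
//! let overlap = jaccard_similarity("the quick fox", "the lazy fox");
//! ```
//!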
//! ## 📦 Module Overview
//!
//! | SciRS2 Module | Python Equivalent | Description |
//! |---------------|-------------------|-------------|
//! | `tokenize` | `nltk.tokenize` | Text tokenization utilities |
//! | `vectorize` | `sklearn.feature_extraction.text.TfidfVectorizer` | TF-IDF and count vectorization |
//! | `embeddings` | `gensim.models.Word2Vec` | Word embeddings (Word2Vec) |
//! | `sentiment` | `nltk.sentiment` | Sentiment analysis |
//! | `topic_modeling` | `sklearn.decomposition.LatentDirichletAllocation` | Topic modeling (LDA) |
//! | `stemming` | `nltk.stem` | Stemming and lemmatization |
//!
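//! As a small taste of the `stemming` module mapped above (a sketch only: the
//! constructor and the `Stemmer` trait method are assumptions based on the names
//! this crate re-exports):
//!
//! ```rust,ignore
//! use scirs2_text::{PorterStemmer, Stemmer};
//!
//! // Assumed API: a PorterStemmer constructor plus a Stemmer::stem method.
//! let stemmer = PorterStemmer::new();
//! let stem = stemmer.stem("running"); // Porter stemming maps "running" to "run"
//! ```
//!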
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-text = "0.1.0-rc.2"
//! ```
//!
//! ```rust,no_run
//! use scirs2_text::{tokenize::WordTokenizer, vectorize::TfidfVectorizer, Tokenizer, Vectorizer};
//!
//! // Tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world!").unwrap();
//!
//! // TF-IDF vectorization
//! let docs = vec!["Hello world", "Good morning world"];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&docs).unwrap();
//! ```
//!
//! ## 🔒 Version: 0.1.0-rc.2 (October 03, 2025)
//!
//! ## Examples
//!
//! ```rust
//! use scirs2_text::{
//!     tokenize::WordTokenizer,
//!     vectorize::TfidfVectorizer,
//!     sentiment::LexiconSentimentAnalyzer,
//!     Tokenizer, Vectorizer
//! };
//!
//! // Basic tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world! This is a test.").unwrap();
//!
//! // TF-IDF vectorization
//! let documents = vec![
//!     "The quick brown fox jumps over the lazy dog",
//!     "A quick brown dog outpaces a quick fox",
//!     "The lazy dog sleeps all day"
//! ];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&documents).unwrap();
//!
//! // Sentiment analysis
//! let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
//! let sentiment = analyzer.analyze("I love this library!").unwrap();
//! println!("Sentiment: {:?}", sentiment.sentiment);
//! ```
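//!
//! Word embeddings follow the same construct-then-fit pattern (a sketch only: the
//! `Word2VecConfig` fields and the training and lookup methods are assumptions, so
//! consult the [`embeddings`] module docs):
//!
//! ```rust,ignore
//! use scirs2_text::{Word2Vec, Word2VecAlgorithm, Word2VecConfig};
//!
//! // Assumed API: a config selecting the algorithm, a trainer over a corpus,
//! // and a per-token vector lookup.
//! let config = Word2VecConfig {
//!     algorithm: Word2VecAlgorithm::SkipGram,
//!     ..Default::default()
//! };
//! let mut model = Word2Vec::new(config);
//! model.train(&["the quick brown fox", "the lazy dog"]).unwrap();
//! let vector = model.get_vector("fox").unwrap();
//! ```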
//!
//! ## Architecture
//!
//! The crate is organized into focused sub-modules:
//!
//! - [`tokenize`]: Text tokenization utilities
//! - [`vectorize`]: Document vectorization and TF-IDF
//! - [`embeddings`]: Word embedding training and utilities
//! - [`sentiment`]: Sentiment analysis tools
//! - [`topic_modeling`]: Topic modeling with LDA
//! - [`string_metrics`]: String similarity and distance metrics
//! - [`preprocess`]: Text cleaning and normalization
//! - [`stemming`]: Stemming and lemmatization
//! - [`parallel`]: Parallel processing utilities
//! - [`simd_ops`]: SIMD-accelerated operations
//!
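//! These sub-modules are designed to compose: a typical pipeline normalizes text,
//! tokenizes it, and then filters tokens before vectorizing. A minimal sketch,
//! assuming the `TextNormalizer` trait exposes a `normalize` method (check the
//! [`preprocess`] docs for the exact signature):
//!
//! ```rust,ignore
//! use scirs2_text::{BasicNormalizer, TextNormalizer, Tokenizer, WordTokenizer};
//!
//! // Assumed API: TextNormalizer::normalize(&self, &str) -> Result<String>.
//! let normalizer = BasicNormalizer::default();
//! let clean = normalizer.normalize("  Hello, WORLD!  ").unwrap();
//!
//! // Tokenize the normalized text with the same tokenizer used above.
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize(&clean).unwrap();
//! ```
//!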
//! ## Performance
//!
//! SciRS2 Text is designed for high performance:
//!
//! - SIMD acceleration for string operations
//! - Parallel processing for large document collections (see the sketch below)
//! - Memory-efficient sparse matrix representations
//! - Zero-copy string processing where possible
//! - Optimized algorithms with complexity guarantees
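//!
//! For large corpora, the [`parallel`] module wraps these operations in batch form
//! (a sketch only: the `ParallelTokenizer` constructor and batch method are
//! assumptions based on the re-exported names):
//!
//! ```rust,ignore
//! use scirs2_text::{ParallelTokenizer, WordTokenizer};
//!
//! // Assumed API: wrap any Tokenizer and fan documents out across threads.
//! let parallel = ParallelTokenizer::new(WordTokenizer::default());
//! let token_lists = parallel.tokenize_batch(&["doc one", "doc two"]).unwrap();
//! ```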

#![warn(missing_docs)]

pub mod classification;
pub mod cleansing;
pub mod distance;
pub mod domain_processors;
pub mod embeddings;
pub mod enhanced_vectorize;
pub mod error;
pub mod huggingface_compat;
pub mod information_extraction;
pub mod ml_integration;
pub mod ml_sentiment;
pub mod model_registry;
pub mod multilingual;
pub mod neural_architectures;
pub mod parallel;
pub mod performance;
pub mod pos_tagging;
pub mod preprocess;
pub mod semantic_similarity;
pub mod sentiment;
pub mod simd_ops;
pub mod sparse;
pub mod sparse_vectorize;
pub mod spelling;
pub mod stemming;
pub mod streaming;
pub mod string_metrics;
pub mod summarization;
pub mod text_coordinator;
pub mod text_statistics;
pub mod token_filter;
pub mod tokenize;
pub mod topic_coherence;
pub mod topic_modeling;
pub mod transformer;
pub mod utils;
pub mod vectorize;
pub mod visualization;
pub mod vocabulary;
pub mod weighted_distance;

// Re-export commonly used items
pub use classification::{
    TextClassificationMetrics, TextClassificationPipeline, TextDataset, TextFeatureSelector,
};
pub use cleansing::{
    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
    replace_urls, strip_html_tags, AdvancedTextCleaner,
};
pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
pub use domain_processors::{
    Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
    MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
    ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
};
pub use embeddings::{Word2Vec, Word2VecAlgorithm, Word2VecConfig};
pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
pub use error::{Result, TextError};
pub use huggingface_compat::{
    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
};
pub use information_extraction::{
    AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
    CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
    EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
    InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
    MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
    StructuredDocumentInformation, TemporalExtractor, Topic,
};
pub use ml_integration::{
    BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
};
pub use ml_sentiment::{
    ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
};
pub use model_registry::{
    ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
    SerializableModelData,
};
pub use multilingual::{
    Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
    StopWords,
};
pub use neural_architectures::{
    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
    ResidualBlock1D, SelfAttention, TextCNN,
};
pub use parallel::{
    ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
};
pub use performance::{
    AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
    PerformanceSummary, PerformanceThresholds,
};
pub use pos_tagging::{
    PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
};
pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
pub use semantic_similarity::{
    LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
    WordMoversDistance,
};
pub use sentiment::{
    LexiconSentimentAnalyzer, RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon,
    SentimentResult, SentimentRules, SentimentWordCounts,
};
pub use simd_ops::{
    AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
    TextProcessingResult,
};
pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
pub use sparse_vectorize::{
    sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
};
pub use spelling::{
    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
};
pub use stemming::{
    LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
    RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
};
pub use streaming::{
    AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
    ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
};
pub use string_metrics::{
    AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
    PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
};
pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
pub use text_coordinator::{
    AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
    AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
};
pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
pub use token_filter::{
    CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
    TokenFilter,
};
pub use tokenize::{
    bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
    CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
    WhitespaceTokenizer, WordTokenizer,
};
pub use topic_coherence::{TopicCoherence, TopicDiversity};
pub use topic_modeling::{
    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
};
pub use transformer::{
    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
    TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
    TransformerEncoderLayer, TransformerModel,
};
pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
pub use visualization::{
    AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
    TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
};
pub use vocabulary::Vocabulary;
pub use weighted_distance::{
    DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
    WeightedStringMetric,
};