// scirs2_text — crate root (lib.rs)
// Crate-wide lint allowances. NOTE(review): `dead_code` in particular can mask
// unused internals — revisit these periodically and scope them tighter if possible.
#![allow(clippy::manual_strip)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::cloned_ref_to_slice_refs)]
#![allow(dead_code)]
//! # SciRS2 Text - Natural Language Processing
//!
//! **scirs2-text** provides comprehensive text processing and NLP capabilities,
//! offering tokenization, TF-IDF vectorization, word embeddings, sentiment analysis,
//! topic modeling, and text classification with SIMD acceleration and parallel processing.
//!
//! ## 🎯 Key Features
//!
//! - **Tokenization**: Word, sentence, N-gram, BPE, regex tokenizers
//! - **Vectorization**: TF-IDF, count vectorizers, word embeddings
//! - **Text Processing**: Stemming, lemmatization, normalization, stopword removal
//! - **Embeddings**: Word2Vec (Skip-gram, CBOW), GloVe loading
//! - **Similarity**: Cosine, Jaccard, Levenshtein, phonetic algorithms
//! - **NLP**: Sentiment analysis, topic modeling (LDA), text classification
//! - **Performance**: SIMD operations, parallel processing, sparse matrices
//!
//! ## 📦 Module Overview
//!
//! | SciRS2 Module | Python Equivalent | Description |
//! |---------------|-------------------|-------------|
//! | `tokenize` | `nltk.tokenize` | Text tokenization utilities |
//! | `vectorize` | `sklearn.feature_extraction.text.TfidfVectorizer` | TF-IDF and count vectorization |
//! | `embeddings` | `gensim.models.Word2Vec` | Word embeddings (Word2Vec) |
//! | `sentiment` | `nltk.sentiment` | Sentiment analysis |
//! | `topic_modeling` | `sklearn.decomposition.LatentDirichletAllocation` | Topic modeling (LDA) |
//! | `stemming` | `nltk.stem` | Stemming and lemmatization |
//!
//! ## 🚀 Quick Start
//!
//! ```toml
//! [dependencies]
//! scirs2-text = "0.4.0"
//! ```
//!
//! ```rust,no_run
//! use scirs2_text::{tokenize::WordTokenizer, vectorize::TfidfVectorizer, Tokenizer, Vectorizer};
//!
//! // Tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world!").unwrap();
//!
//! // TF-IDF vectorization
//! let docs = vec!["Hello world", "Good morning world"];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&docs).unwrap();
//! ```
//!
//! ## 🔒 Version: 0.1.5 (January 15, 2026)
//!
//! ## Extended Example
//!
//! ```rust
//! use scirs2_text::{
//!     tokenize::WordTokenizer,
//!     vectorize::TfidfVectorizer,
//!     sentiment::LexiconSentimentAnalyzer,
//!     Tokenizer, Vectorizer
//! };
//!
//! // Basic tokenization
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world! This is a test.").unwrap();
//!
//! // TF-IDF vectorization
//! let documents = vec![
//!     "The quick brown fox jumps over the lazy dog",
//!     "A quick brown dog outpaces a quick fox",
//!     "The lazy dog sleeps all day"
//! ];
//! let mut vectorizer = TfidfVectorizer::new(false, true, Some("l2".to_string()));
//! let matrix = vectorizer.fit_transform(&documents).unwrap();
//!
//! // Sentiment analysis
//! let analyzer = LexiconSentimentAnalyzer::with_basiclexicon();
//! let sentiment = analyzer.analyze("I love this library!").unwrap();
//! println!("Sentiment: {:?}", sentiment.sentiment);
//! ```
//!
//! ## Architecture
//!
//! The module is organized into focused sub-modules:
//!
//! - [`tokenize`]: Text tokenization utilities
//! - [`vectorize`]: Document vectorization and TF-IDF
//! - [`embeddings`]: Word embedding training and utilities
//! - [`sentiment`]: Sentiment analysis tools
//! - [`topic_modeling`]: Topic modeling with LDA
//! - [`string_metrics`]: String similarity and distance metrics
//! - [`preprocess`]: Text cleaning and normalization
//! - [`stemming`]: Stemming and lemmatization
//! - [`parallel`]: Parallel processing utilities
//! - [`simd_ops`]: SIMD-accelerated operations
//!
//! ## Performance
//!
//! SciRS2 Text is designed for high performance:
//!
//! - SIMD acceleration for string operations
//! - Parallel processing for large document collections
//! - Memory-efficient sparse matrix representations
//! - Zero-copy string processing where possible
//! - Optimized algorithms with complexity guarantees

// NOTE(review): the toml snippet above says `0.4.0` while the version heading
// says `0.1.5` — one of the two is stale; confirm against Cargo.toml.
#![warn(missing_docs)]
111/// Batch tokenization with padding and attention masks.
112pub mod batch_tokenizer;
113pub mod classification;
114pub mod cleansing;
115pub mod distance;
116pub mod domain_processors;
117pub mod embeddings;
118pub mod enhanced_vectorize;
119pub mod error;
120pub mod evaluation;
121/// GPT-2 byte-level BPE tokenizer.
122pub mod gpt_bpe;
123pub mod huggingface_compat;
124pub mod information_extraction;
125pub mod language_model;
126pub mod lemmatization;
127pub mod ml_integration;
128pub mod ml_sentiment;
129pub mod model_registry;
130pub mod multilingual;
131pub mod neural_architectures;
132pub mod parallel;
133pub mod paraphrasing;
134pub mod performance;
135pub mod pipeline;
136pub mod pos_tagging;
137pub mod preprocess;
138pub mod semantic_similarity;
139/// SentencePiece Unigram Language Model tokenizer.
140pub mod sentencepiece;
141pub mod sentiment;
142pub mod simd_ops;
143pub mod sparse;
144pub mod sparse_vectorize;
145pub mod spelling;
146pub mod stemming;
147pub mod streaming;
148pub mod string_metrics;
149pub mod summarization;
150pub mod text_coordinator;
151pub mod text_statistics;
152pub mod token_filter;
153pub mod tokenize;
154pub mod tokenizer;
155pub mod topic_coherence;
156pub mod topic_modeling;
157pub mod transformer;
158pub mod utils;
159pub mod vectorize;
160pub mod visualization;
161pub mod vocabulary;
162pub mod weighted_distance;
163
164// New text processing modules
165pub mod keyword_extraction;
166pub mod language_detection;
167pub mod named_entity_recognition;
168pub mod text_similarity;
169pub mod text_summarization;
170
171// BERT fine-tuning
172pub mod bert_finetune;
173// Cross-lingual NER and transliteration
174pub mod crosslingual;
175// Correlated topic model
176pub mod ctm;
177// Dynamic topic model
178pub mod dtm;
179// Sentence embeddings
180pub mod sentence_embeddings;
181// Semantic similarity with embedding-based search
182pub mod similarity;
183// Tokenizer implementations (HuggingFace, byte-level BPE)
184pub mod tokenizers;
185// Transliteration
186pub mod transliteration;
187
188// Re-export commonly used items
189pub use classification::{
190    cross_validate_nb, BernoulliNaiveBayes, CrossValidationResult, FeatureHasher, FoldResult,
191    MultiLabelClassifier, MultiLabelPrediction, MultinomialNaiveBayes, TextClassificationMetrics,
192    TextClassificationPipeline, TextDataset, TextFeatureSelector, TfidfCosineClassifier,
193};
194pub use cleansing::{
195    expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
196    normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
197    replace_urls, strip_html_tags, AdvancedTextCleaner,
198};
199pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
200pub use domain_processors::{
201    Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
202    MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
203    ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
204};
205pub use embeddings::{
206    embedding_cosine_similarity,
207    fasttext::{FastText, FastTextConfig},
208    glove::{CooccurrenceMatrix, GloVe, GloVeTrainer, GloVeTrainerConfig},
209    pairwise_similarity, Word2Vec, Word2VecAlgorithm, Word2VecConfig, WordEmbedding,
210};
211pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
212pub use error::{Result, TextError};
213pub use huggingface_compat::{
214    ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
215    FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
216    HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
217    TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
218};
219pub use information_extraction::{
220    AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
221    CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
222    EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
223    InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
224    MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
225    StructuredDocumentInformation, TemporalExtractor, Topic,
226};
227pub use language_model::{NgramModel, SmoothingMethod};
228pub use lemmatization::{Lemmatizer, RuleBasedLemmatizer, WordNetLemmatizer};
229pub use ml_integration::{
230    BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
231};
232pub use ml_sentiment::{
233    ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
234};
235pub use model_registry::{
236    ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
237    SerializableModelData,
238};
239pub use multilingual::{
240    Language, LanguageDetectionResult, LanguageDetector, MultilingualProcessor, ProcessedText,
241    StopWords,
242};
243pub use neural_architectures::{
244    ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
245    GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
246    MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
247    ResidualBlock1D, SelfAttention, TextCNN,
248};
249pub use parallel::{
250    ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
251};
252pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
253pub use performance::{
254    AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
255    PerformanceSummary, PerformanceThresholds,
256};
257pub use pipeline::{
258    basic_pipeline, lemmatization_pipeline, ngram_pipeline, stemming_pipeline, BatchProcessor,
259    NlpPipeline, PipelineBuilder, PipelineStep,
260};
261pub use pos_tagging::{
262    PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
263};
264pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
265pub use semantic_similarity::{
266    LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
267    WordMoversDistance,
268};
269pub use sentiment::{
270    aggregate_sentiment, analyze_and_aggregate, AggregatedSentiment, AspectSentiment,
271    AspectSentimentAnalyzer, LexiconSentimentAnalyzer, NaiveBayesSentiment,
272    RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon, SentimentResult, SentimentRules,
273    SentimentWordCounts, VaderResult, VaderSentimentAnalyzer,
274};
275pub use simd_ops::{
276    AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
277    TextProcessingResult,
278};
279pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
280pub use sparse_vectorize::{
281    sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
282};
283pub use spelling::{
284    DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
285    SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
286};
287pub use stemming::{
288    LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
289    RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
290};
291pub use streaming::{
292    AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
293    ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
294};
295pub use string_metrics::{
296    AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
297    PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
298};
299pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
300pub use text_coordinator::{
301    AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
302    AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
303};
304pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
305pub use token_filter::{
306    CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
307    TokenFilter,
308};
309pub use tokenize::{
310    bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
311    CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
312    WhitespaceTokenizer, WordTokenizer,
313};
314pub use tokenizer::{
315    BPETokenizer, SimpleCharTokenizer, SimpleWhitespaceTokenizer, TransformerTokenizer,
316    WordPieceTokenizer,
317};
318pub use topic_coherence::{TopicCoherence, TopicDiversity};
319pub use topic_modeling::{
320    LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
321};
322pub use transformer::{
323    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
324    TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
325    TransformerEncoderLayer, TransformerModel,
326};
327pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
328pub use visualization::{
329    AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
330    TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
331};
332pub use vocabulary::Vocabulary;
333pub use weighted_distance::{
334    DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
335    WeightedStringMetric,
336};
337
338// Re-exports for new modules
339pub use keyword_extraction::{
340    extract_keywords, Keyword, KeywordMethod, RakeKeywordExtractor, TextRankKeywordExtractor,
341    TfIdfKeywordExtractor,
342};
343pub use language_detection::{
344    detect_language, detect_language_with_strategy, DetectedLanguage, DetectionStrategy,
345    LanguageDetectionOutput,
346};
347pub use named_entity_recognition::{extract_entities, NerEntity, NerEntityType, NerPatternConfig};
348pub use text_similarity::{
349    bm25_score, char_ngram_jaccard_similarity, edit_distance_similarity, jaccard_token_similarity,
350    text_similarity, tfidf_cosine_similarity, Bm25Config, Bm25Scorer, SimilarityMethod,
351    SimilarityResult, TfIdfCosineSimilarity,
352};
353pub use text_summarization::{
354    score_position, score_textrank, score_tfidf, summarize, ScoredSentence, SummarizationMethod,
355};