//! Crate root: declares the public submodules and re-exports a selection of
//! their items so that callers can import them directly from the crate.
//!
//! Judging by the module names this is a text-processing / NLP library; see
//! the individual modules for the actual APIs.

// Crate-wide lint configuration. Being inner attributes at the crate root,
// these apply to every module in the crate, not only to this file.
// NOTE(review): the reason each lint is allowed is not recorded here — confirm
// it is still needed and, where practical, narrow it to the offending items.

// Clippy: manual prefix/suffix stripping where `strip_prefix`/`strip_suffix` would do.
#![allow(clippy::manual_strip)]
// Clippy: `for i in 0..n` loops whose index is only used to index a collection.
#![allow(clippy::needless_range_loop)]
// Clippy: `if`/`else` whose two branches have identical bodies.
#![allow(clippy::if_same_then_else)]
// Clippy: slice references built from a clone, e.g. `&[x.clone()]`.
#![allow(clippy::cloned_ref_to_slice_refs)]
// rustc: unused items are not reported anywhere in the crate.
#![allow(dead_code)]
// rustc: warn about public items that have no documentation.
#![warn(missing_docs)]
// Public submodules. Each `pub mod` exposes the module under the crate path
// of the same name; a subset of their items is additionally re-exported from
// the crate root by the `pub use` statements further down in this file.
//
// NOTE(review): several modules have closely related names, for example
//   `language_model` / `language_models`,
//   `tokenize` / `tokenizer` / `tokenization` / `tokenizers`,
//   `topic` / `topic_model` / `topic_modeling`,
//   `pos_tagging` / `pos_tagging_original`,
//   `ner` / `named_entity_recognition`,
//   `keywords` / `keyword_extraction`,
//   `multilingual` / `multilingual_ext`,
//   `preprocess` / `text_preprocess`.
// Whether they overlap cannot be told from this file — check the module
// contents before adding new code to one of them.
pub mod batch_tokenizer;
pub mod classification;
pub mod cleansing;
pub mod distance;
pub mod domain_processors;
pub mod embeddings;
pub mod enhanced_vectorize;
pub mod error;
pub mod evaluation;
pub mod gpt_bpe;
pub mod huggingface_compat;
pub mod information_extraction;
pub mod language_model;
pub mod language_models;
pub mod lemmatization;
pub mod ml_integration;
pub mod ml_sentiment;
pub mod model_registry;
pub mod multilingual;
pub mod neural_architectures;
pub mod parallel;
pub mod paraphrasing;
pub mod performance;
pub mod pipeline;
pub mod pos_tagging;
pub mod preprocess;
pub mod semantic_similarity;
pub mod sentencepiece;
pub mod sentiment;
pub mod simd_ops;
pub mod sparse;
pub mod sparse_vectorize;
pub mod spelling;
pub mod stemming;
pub mod streaming;
pub mod string_metrics;
pub mod summarization;
pub mod text_coordinator;
pub mod text_statistics;
pub mod token_filter;
pub mod tokenize;
pub mod tokenizer;
pub mod topic_coherence;
pub mod topic_modeling;
pub mod transformer;
pub mod utils;
pub mod vectorize;
pub mod visualization;
pub mod vocabulary;
pub mod weighted_distance;
pub mod keyword_extraction;
pub mod language_detection;
pub mod named_entity_recognition;
pub mod text_similarity;
pub mod text_summarization;
pub mod bert_finetune;
pub mod crosslingual;
pub mod ctm;
pub mod dtm;
pub mod hdp;
pub mod sentence_embeddings;
pub mod similarity;
pub mod tokenization;
pub mod tokenizers;
pub mod transliteration;
pub mod topic;
// None of the modules from here to the end of this list has items re-exported
// by the `pub use` statements visible in this file; they are reachable through
// their module path only.
pub mod abstractive_summary;
pub mod advanced_classification;
pub mod advanced_distance;
pub mod alignment;
pub mod bpe_tokenizer;
pub mod coreference;
pub mod dialog;
pub mod discourse;
pub mod doc_similarity;
pub mod event_extraction;
pub mod keywords;
pub mod multilingual_ext;
pub mod ner;
pub mod pos_tagging_original;
pub mod question_answering;
pub mod regex_lite;
pub mod segmentation;
pub mod summarize_advanced;
pub mod text_classification;
pub mod text_preprocess;
pub mod topic_model;
pub use classification::{
cross_validate_nb, BernoulliNaiveBayes, CrossValidationResult, FeatureHasher, FoldResult,
MultiLabelClassifier, MultiLabelPrediction, MultinomialNaiveBayes, TextClassificationMetrics,
TextClassificationPipeline, TextDataset, TextFeatureSelector, TfidfCosineClassifier,
};
pub use cleansing::{
expand_contractions, normalize_currencies, normalize_numbers, normalize_ordinals,
normalize_percentages, normalize_unicode, normalize_whitespace, remove_accents, replace_emails,
replace_urls, strip_html_tags, AdvancedTextCleaner,
};
pub use distance::{cosine_similarity, jaccard_similarity, levenshtein_distance};
pub use domain_processors::{
Domain, DomainProcessorConfig, FinancialTextProcessor, LegalTextProcessor,
MedicalTextProcessor, NewsTextProcessor, PatentTextProcessor, ProcessedDomainText,
ScientificTextProcessor, SocialMediaTextProcessor, UnifiedDomainProcessor,
};
// Re-exports from `embeddings`, including items that live in its `fasttext`
// and `glove` submodules; all of them are flattened to the crate root.
// Items from `embeddings::sentence_encoder` are re-exported separately, under
// aliases, later in this file.
pub use embeddings::{
embedding_cosine_similarity,
fasttext::{FastText, FastTextConfig},
glove::{CooccurrenceMatrix, GloVe, GloVeTrainer, GloVeTrainerConfig},
pairwise_similarity, Word2Vec, Word2VecAlgorithm, Word2VecConfig, WordEmbedding,
};
pub use enhanced_vectorize::{EnhancedCountVectorizer, EnhancedTfidfVectorizer};
pub use error::{Result, TextError};
// Re-exports from `huggingface_compat`.
// `TextClassificationPipeline` is renamed to `HfTextClassificationPipeline`
// here because `classification::TextClassificationPipeline` is already
// re-exported at the crate root under the unaliased name, and two re-exports
// cannot share one name in the same namespace.
pub use huggingface_compat::{
ClassificationResult, FeatureExtractionPipeline, FillMaskPipeline, FillMaskResult,
FormatConverter, HfConfig, HfEncodedInput, HfHub, HfModelAdapter, HfPipeline, HfTokenizer,
HfTokenizerConfig, QuestionAnsweringPipeline, QuestionAnsweringResult,
TextClassificationPipeline as HfTextClassificationPipeline, ZeroShotClassificationPipeline,
};
// Re-exports from `information_extraction`.
// `Topic` keeps its unaliased name at the crate root here; the `Topic` from
// `topic_modeling` is re-exported as `LdaTopic` instead.
pub use information_extraction::{
AdvancedExtractedInformation, AdvancedExtractionPipeline, ConfidenceScorer, CoreferenceChain,
CoreferenceMention, CoreferenceResolver, DocumentInformationExtractor, DocumentSummary, Entity,
EntityCluster, EntityLinker, EntityType, Event, ExtractedInformation,
InformationExtractionPipeline, KeyPhraseExtractor, KnowledgeBaseEntry, LinkedEntity,
MentionType, PatternExtractor, Relation, RelationExtractor, RuleBasedNER,
StructuredDocumentInformation, TemporalExtractor, Topic,
};
pub use language_model::{NgramModel, SmoothingMethod};
pub use lemmatization::{Lemmatizer, RuleBasedLemmatizer, WordNetLemmatizer};
pub use ml_integration::{
BatchTextProcessor, FeatureExtractionMode, MLTextPreprocessor, TextFeatures, TextMLPipeline,
};
pub use ml_sentiment::{
ClassMetrics, EvaluationMetrics, MLSentimentAnalyzer, MLSentimentConfig, TrainingMetrics,
};
pub use model_registry::{
ModelMetadata, ModelRegistry, ModelType, PrebuiltModels, RegistrableModel,
SerializableModelData,
};
// Re-exports from `multilingual`.
// `Transliterator` is renamed to `MultilingualTransliterator` because the
// unaliased name `Transliterator` at the crate root is taken by the re-export
// from `transliteration`.
pub use multilingual::{
is_cjk_char, is_combining_mark, is_cyrillic, Language, LanguageDetectionResult,
LanguageDetector, MultilingualProcessor, ProcessedText, ScriptFamily, StopWords,
Transliterator as MultilingualTransliterator, UnicodeTokenizer, UnicodeTokenizerConfig,
};
// Re-exports from `neural_architectures`.
// `LayerNorm` and `MultiHeadAttention` are renamed with a `Neural` prefix
// because the unaliased names at the crate root are taken by the re-exports
// from `transformer`.
pub use neural_architectures::{
ActivationFunction, AdditiveAttention, BiLSTM, CNNLSTMHybrid, Conv1D, CrossAttention, Dropout,
GRUCell, LSTMCell, LayerNorm as NeuralLayerNorm, MaxPool1D,
MultiHeadAttention as NeuralMultiHeadAttention, MultiScaleCNN, PositionwiseFeedForward,
ResidualBlock1D, SelfAttention, TextCNN,
};
pub use parallel::{
ParallelCorpusProcessor, ParallelTextProcessor, ParallelTokenizer, ParallelVectorizer,
};
pub use paraphrasing::{ParaphraseConfig, ParaphraseResult, ParaphraseStrategy, Paraphraser};
pub use performance::{
AdvancedPerformanceMonitor, DetailedPerformanceReport, OptimizationRecommendation,
PerformanceSummary, PerformanceThresholds,
};
pub use pipeline::{
basic_pipeline, lemmatization_pipeline, ngram_pipeline, stemming_pipeline, BatchProcessor,
NlpPipeline, PipelineBuilder, PipelineStep,
};
pub use pos_tagging::{
PosAwareLemmatizer, PosTagResult, PosTagger, PosTaggerConfig, PosTaggingResult,
};
pub use preprocess::{BasicNormalizer, BasicTextCleaner, TextCleaner, TextNormalizer};
pub use semantic_similarity::{
LcsSimilarity, SemanticSimilarityEnsemble, SoftCosineSimilarity, WeightedJaccard,
WordMoversDistance,
};
pub use sentiment::{
aggregate_sentiment, analyze_and_aggregate, AggregatedSentiment, AspectSentiment,
AspectSentimentAnalyzer, LexiconSentimentAnalyzer, NaiveBayesSentiment,
RuleBasedSentimentAnalyzer, Sentiment, SentimentLexicon, SentimentResult, SentimentRules,
SentimentWordCounts, VaderResult, VaderSentimentAnalyzer,
};
pub use simd_ops::{
AdvancedSIMDTextProcessor, SimdEditDistance, SimdStringOps, SimdTextAnalyzer,
TextProcessingResult,
};
pub use sparse::{CsrMatrix, DokMatrix, SparseMatrixBuilder, SparseVector};
pub use sparse_vectorize::{
sparse_cosine_similarity, MemoryStats, SparseCountVectorizer, SparseTfidfVectorizer,
};
// Re-exports from `spelling`.
// NOTE(review): `NGramModel` here and `NgramModel` (re-exported from
// `language_model`) differ only in capitalization. They are distinct
// identifiers, so both compile, but they are easy to confuse at call sites.
pub use spelling::{
DictionaryCorrector, DictionaryCorrectorConfig, EditOp, ErrorModel, NGramModel,
SpellingCorrector, StatisticalCorrector, StatisticalCorrectorConfig,
};
pub use stemming::{
LancasterStemmer, LemmatizerConfig, PorterStemmer, PosTag, RuleLemmatizer,
RuleLemmatizerBuilder, SimpleLemmatizer, SnowballStemmer, Stemmer,
};
pub use streaming::{
AdvancedStreamingMetrics, AdvancedStreamingProcessor, ChunkedCorpusReader, MemoryMappedCorpus,
ProgressTracker, StreamingTextProcessor, StreamingVectorizer,
};
pub use string_metrics::{
AlignmentResult, DamerauLevenshteinMetric, Metaphone, NeedlemanWunsch, Nysiis,
PhoneticAlgorithm, SmithWaterman, Soundex, StringMetric,
};
pub use summarization::{CentroidSummarizer, KeywordExtractor, TextRank};
pub use text_coordinator::{
AdvancedBatchClassificationResult, AdvancedSemanticSimilarityResult, AdvancedTextConfig,
AdvancedTextCoordinator, AdvancedTextResult, AdvancedTopicModelingResult,
};
pub use text_statistics::{ReadabilityMetrics, TextMetrics, TextStatistics};
pub use token_filter::{
CompositeFilter, CustomFilter, FrequencyFilter, LengthFilter, RegexFilter, StopwordsFilter,
TokenFilter,
};
// Re-exports from `tokenize` (including its `bpe` submodule) and, separately,
// from `tokenizer`.
// NOTE(review): `BpeTokenizer` (from `tokenize::bpe`) and `BPETokenizer` (from
// `tokenizer`) differ only in capitalization. They are distinct identifiers,
// so both compile, but they are easy to confuse at call sites.
pub use tokenize::{
bpe::{BpeConfig, BpeTokenizer, BpeVocabulary},
CharacterTokenizer, NgramTokenizer, RegexTokenizer, SentenceTokenizer, Tokenizer,
WhitespaceTokenizer, WordTokenizer,
};
pub use tokenizer::{
BPETokenizer, SimpleCharTokenizer, SimpleWhitespaceTokenizer, TransformerTokenizer,
WordPieceTokenizer,
};
pub use topic_coherence::{TopicCoherence, TopicDiversity};
// Re-exports from `topic_modeling`.
// `Topic` is renamed to `LdaTopic` because the unaliased name `Topic` at the
// crate root is taken by the re-export from `information_extraction`.
pub use topic_modeling::{
LatentDirichletAllocation, LdaBuilder, LdaConfig, LdaLearningMethod, Topic as LdaTopic,
};
pub use transformer::{
FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TokenEmbedding,
TransformerConfig, TransformerDecoder, TransformerDecoderLayer, TransformerEncoder,
TransformerEncoderLayer, TransformerModel,
};
pub use vectorize::{CountVectorizer, TfidfVectorizer, Vectorizer};
pub use visualization::{
AttentionVisualizer, Color, ColorScheme, EmbeddingVisualizer, SentimentVisualizer,
TextAnalyticsDashboard, TopicVisualizer, VisualizationConfig, WordCloud,
};
pub use vocabulary::Vocabulary;
pub use weighted_distance::{
DamerauLevenshteinWeights, LevenshteinWeights, WeightedDamerauLevenshtein, WeightedLevenshtein,
WeightedStringMetric,
};
pub use keyword_extraction::{
extract_keywords, Keyword, KeywordMethod, RakeKeywordExtractor, TextRankKeywordExtractor,
TfIdfKeywordExtractor,
};
pub use language_detection::{
detect_language, detect_language_with_strategy, DetectedLanguage, DetectionStrategy,
LanguageDetectionOutput,
};
pub use named_entity_recognition::{extract_entities, NerEntity, NerEntityType, NerPatternConfig};
pub use text_similarity::{
bm25_score, char_ngram_jaccard_similarity, edit_distance_similarity, jaccard_token_similarity,
text_similarity, tfidf_cosine_similarity, Bm25Config, Bm25Scorer, SimilarityMethod,
SimilarityResult, TfIdfCosineSimilarity,
};
pub use text_summarization::{
score_position, score_textrank, score_tfidf, summarize, ScoredSentence, SummarizationMethod,
};
// Re-exports from the top-level `hdp` and `sentence_embeddings` modules keep
// their original names at the crate root.
pub use hdp::{HdpConfig, HdpModel, HdpResult};
pub use sentence_embeddings::{
SentenceEncoder, SentenceEncoderConfig, SentenceEncoderPooling, SimCSELoss, SimCSETrainer,
};
// `topic::hdp` also has an `HdpConfig`; it is renamed to `HdpAutoConfig` so it
// does not collide with the `hdp::HdpConfig` re-export above.
pub use topic::hdp::{
Hdp, HdpConfig as HdpAutoConfig, HdpState, HdpTopicConfig, HdpTopicModel, TopicError,
};
// Every item from `embeddings::sentence_encoder` except `SimCseConfig` is
// re-exported under an alias. `SentenceEncoder` must be renamed because
// `sentence_embeddings::SentenceEncoder` already holds that name at the
// crate root.
// NOTE(review): the other aliases do not collide with any re-export visible in
// this file (`SimCseTrainer` differs from `SimCSETrainer` by capitalization
// only) — presumably renamed for clarity; confirm.
pub use embeddings::sentence_encoder::{
PoolingStrategy as SentencePoolingStrategy, SemanticSimilarity as EmbeddingSearch,
SentenceEncoder as ProjSentenceEncoder, SimCseConfig, SimCseTrainer as ProjSimCseTrainer,
};
// Re-exports from `transliteration`.
// `Transliterator` keeps its unaliased name here; the `Transliterator` from
// `multilingual` is re-exported as `MultilingualTransliterator` instead.
pub use transliteration::{
CyrillicScheme, CyrillicTransliterator, HepburnTransliterator, PinyinStyle,
PinyinTransliterator, Transliterator,
};
// `neural_nlp` is declared here, next to its re-exports, rather than in the
// module list at the top of the file.
pub mod neural_nlp;
// `TransformerEncoderConfig` is re-exported as `NeuralEncoderConfig`.
// NOTE(review): no other `TransformerEncoderConfig` is re-exported in the
// visible part of this file, so the rename is presumably for clarity next to
// `transformer::TransformerConfig` — confirm.
pub use neural_nlp::{
AttentionHeatmap, AttentionVisualization, BertClassifier, BertClassifierConfig, NerTag,
NeuralNer, NeuralNerConfig, TransformerEncoderConfig as NeuralEncoderConfig,
TransformerTextEncoder,
};