"""Tests for the scirs2 text-processing Python bindings.

Covers tokenization, vectorization, stemming, cleansing, sentiment analysis,
string similarity, text statistics, normalization, and edge cases.
"""
import numpy as np
import pytest
import scirs2
class TestTokenization:
def test_word_tokenizer_basic(self):
tokenizer = scirs2.WordTokenizer(lowercase=True)
text = "Hello World! This is a test."
tokens = tokenizer.tokenize(text)
assert len(tokens) > 0
assert "hello" in tokens or "world" in tokens
def test_word_tokenizer_batch(self):
tokenizer = scirs2.WordTokenizer(lowercase=True)
texts = ["First sentence.", "Second sentence.", "Third sentence."]
tokens_batch = tokenizer.tokenize_batch(texts)
assert len(tokens_batch) == 3
assert all(len(tokens) > 0 for tokens in tokens_batch)
def test_word_tokenizer_case_sensitive(self):
tokenizer = scirs2.WordTokenizer(lowercase=False)
text = "Hello WORLD"
tokens = tokenizer.tokenize(text)
assert any(t[0].isupper() for t in tokens if t)
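    # Hedged companion check: a sketch assuming WordTokenizer(lowercase=True)
    # lowercases every emitted token, which the basic test above relies on but
    # never verifies directly.
    def test_word_tokenizer_lowercase_property(self):
        tokenizer = scirs2.WordTokenizer(lowercase=True)
        tokens = tokenizer.tokenize("MiXeD Case TEXT")
        # Every token should come back fully lowercased.
        assert all(t == t.lower() for t in tokens)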
def test_sentence_tokenizer(self):
tokenizer = scirs2.SentenceTokenizer()
text = "This is the first sentence. This is the second! And the third?"
sentences = tokenizer.tokenize(text)
assert len(sentences) >= 2
def test_character_tokenizer(self):
tokenizer = scirs2.CharacterTokenizer()
text = "abc"
chars = tokenizer.tokenize(text)
assert len(chars) == 3
assert chars == ["a", "b", "c"]
def test_whitespace_tokenizer(self):
tokenizer = scirs2.WhitespaceTokenizer()
text = "word1 word2 word3"
tokens = tokenizer.tokenize(text)
assert len(tokens) == 3
assert "word1" in tokens
def test_regex_tokenizer(self):
tokenizer = scirs2.RegexTokenizer(pattern=r"\w+")
text = "Hello, World! 123"
tokens = tokenizer.tokenize(text)
assert len(tokens) >= 2
def test_ngram_tokenizer(self):
tokenizer = scirs2.NgramTokenizer(n=2)
text = "one two three"
ngrams = tokenizer.tokenize(text)
assert len(ngrams) >= 1
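    # Reference sketch (pure Python, no scirs2 call): the standard word-level
    # n-gram construction, included to document the expectation behind the
    # loose >= 1 assertion above. Whether NgramTokenizer pads or operates at
    # the character level is an open assumption this sketch does not settle.
    def test_ngram_reference_construction(self):
        words = "one two three".split()
        n = 2
        # Sliding window of size n over the word list.
        ref_bigrams = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
        assert ref_bigrams == [("one", "two"), ("two", "three")]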
class TestVectorization:
def test_count_vectorizer_basic(self):
vectorizer = scirs2.CountVectorizer()
texts = ["hello world", "hello python", "world python"]
vectorizer.fit(texts)
vectors = vectorizer.transform(texts)
        assert vectors.shape[0] == 3
        assert vectors.shape[1] >= 2
def test_count_vectorizer_vocabulary(self):
vectorizer = scirs2.CountVectorizer()
texts = ["cat dog", "dog bird", "cat bird"]
vectorizer.fit(texts)
vocab = vectorizer.get_vocabulary()
assert len(vocab) >= 3
assert "cat" in vocab or "dog" in vocab or "bird" in vocab
def test_tfidf_vectorizer_basic(self):
vectorizer = scirs2.TfidfVectorizer()
texts = [
"the quick brown fox",
"the lazy dog",
"the quick dog"
]
vectorizer.fit(texts)
vectors = vectorizer.transform(texts)
assert vectors.shape[0] == 3
assert vectors.shape[1] >= 2
assert np.all(vectors >= 0)
def test_tfidf_vectorizer_idf(self):
vectorizer = scirs2.TfidfVectorizer()
texts = [
"common word",
"common word",
"rare word"
]
vectorizer.fit(texts)
vectors = vectorizer.transform(texts)
assert vectors.shape[0] == 3
def test_count_vectorizer_max_features(self):
vectorizer = scirs2.CountVectorizer(max_features=2)
texts = ["a b c d e f", "a b c", "a b"]
vectorizer.fit(texts)
vectors = vectorizer.transform(texts)
assert vectors.shape[1] <= 2
def test_count_vectorizer_ngrams(self):
vectorizer = scirs2.CountVectorizer(ngram_range=(1, 2))
texts = ["hello world", "hello python"]
vectorizer.fit(texts)
vectors = vectorizer.transform(texts)
assert vectors.shape[1] >= 2
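    # Reference sketch (pure Python): the classic smoothed IDF formula
    # idf(t) = ln((1 + N) / (1 + df(t))) + 1, shown to document why the
    # "rare word" corpus above should weight "rare" more heavily than
    # "common". The exact formula TfidfVectorizer uses is an assumption and
    # may differ, so this test asserts only on the reference values.
    def test_idf_reference_ordering(self):
        import math
        docs = [["common", "word"], ["common", "word"], ["rare", "word"]]
        n_docs = len(docs)

        def idf(term):
            # Document frequency: number of documents containing the term.
            df = sum(term in doc for doc in docs)
            return math.log((1 + n_docs) / (1 + df)) + 1

        # "rare" appears in one document, "common" in two, "word" in all
        # three, so the IDF ordering must be rare > common > word.
        assert idf("rare") > idf("common") > idf("word")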
class TestStemming:
def test_porter_stemmer(self):
stemmer = scirs2.PorterStemmer()
word = "running"
stem = stemmer.stem(word)
assert len(stem) < len(word)
assert stem in ["run", "runn"]
def test_porter_stemmer_batch(self):
stemmer = scirs2.PorterStemmer()
words = ["running", "runner", "runs", "ran"]
stems = stemmer.stem_batch(words)
assert len(stems) == 4
assert all(len(stem) <= len(word) for stem, word in zip(stems, words))
def test_snowball_stemmer(self):
stemmer = scirs2.SnowballStemmer(language="english")
word = "generously"
stem = stemmer.stem(word)
assert len(stem) <= len(word)
def test_lancaster_stemmer(self):
stemmer = scirs2.LancasterStemmer()
word = "maximum"
stem = stemmer.stem(word)
assert len(stem) <= len(word)
def test_stemmer_variants(self):
words = ["compute", "computing", "computer", "computation"]
porter = scirs2.PorterStemmer()
porter_stems = [porter.stem(w) for w in words]
assert len(set(porter_stems)) <= 2
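    # Hedged addition: for the words used in this suite, Porter stemming is
    # expected to be idempotent (stemming a stem changes nothing). That is a
    # property of the classic algorithm on these inputs, not a documented
    # scirs2 guarantee.
    def test_porter_stemmer_idempotent(self):
        stemmer = scirs2.PorterStemmer()
        for word in ["running", "runner", "runs"]:
            once = stemmer.stem(word)
            twice = stemmer.stem(once)
            assert once == twice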
class TestCleansing:
def test_remove_accents(self):
text = "café naïve résumé"
cleaned = scirs2.remove_accents_py(text)
assert "é" not in cleaned or "cafe" in cleaned.lower()
def test_normalize_whitespace(self):
text = "hello world \t test"
normalized = scirs2.normalize_whitespace_py(text)
assert " " not in normalized
assert "\t" not in normalized
def test_normalize_unicode(self):
text = "Hello\u00A0World" normalized = scirs2.normalize_unicode_py(text)
assert len(normalized) > 0
def test_strip_html_tags(self):
text = "<p>Hello <b>World</b></p>"
cleaned = scirs2.strip_html_tags_py(text)
assert "<p>" not in cleaned
assert "<b>" not in cleaned
assert "Hello" in cleaned
assert "World" in cleaned
def test_replace_urls(self):
text = "Check out https://example.com for more info"
cleaned = scirs2.replace_urls_py(text, replacement="URL")
assert "https://example.com" not in cleaned
assert "URL" in cleaned
def test_replace_emails(self):
text = "Contact us at test@example.com for help"
cleaned = scirs2.replace_emails_py(text, replacement="EMAIL")
assert "test@example.com" not in cleaned
assert "EMAIL" in cleaned
def test_expand_contractions(self):
text = "I'm can't won't"
expanded = scirs2.expand_contractions_py(text)
assert len(expanded) >= len(text)
class TestSentiment:
def test_lexicon_sentiment_positive(self):
analyzer = scirs2.LexiconSentimentAnalyzer()
text = "This is wonderful and amazing!"
sentiment = analyzer.analyze(text)
assert "score" in sentiment or "polarity" in sentiment
score = sentiment.get("score", sentiment.get("polarity", 0))
assert score > 0
def test_lexicon_sentiment_negative(self):
analyzer = scirs2.LexiconSentimentAnalyzer()
text = "This is terrible and awful!"
sentiment = analyzer.analyze(text)
score = sentiment.get("score", sentiment.get("polarity", 0))
assert score < 0
def test_lexicon_sentiment_neutral(self):
analyzer = scirs2.LexiconSentimentAnalyzer()
text = "The table is wooden."
sentiment = analyzer.analyze(text)
score = sentiment.get("score", sentiment.get("polarity", 0))
assert -0.5 <= score <= 0.5
def test_sentiment_batch(self):
analyzer = scirs2.LexiconSentimentAnalyzer()
texts = [
"Great product!",
"Terrible service.",
"It is okay."
]
sentiments = analyzer.analyze_batch(texts)
assert len(sentiments) == 3
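    # Hedged addition: whatever the score scale, a clearly positive text
    # should outrank a clearly negative one. Assumes analyze() returns a dict
    # with a "score" or "polarity" key, as the tests above already do.
    def test_sentiment_relative_ordering(self):
        analyzer = scirs2.LexiconSentimentAnalyzer()
        pos = analyzer.analyze("This is wonderful and amazing!")
        neg = analyzer.analyze("This is terrible and awful!")
        pos_score = pos.get("score", pos.get("polarity", 0))
        neg_score = neg.get("score", neg.get("polarity", 0))
        assert pos_score > neg_score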
class TestStringSimilarity:
def test_levenshtein_distance(self):
s1 = "kitten"
s2 = "sitting"
distance = scirs2.levenshtein_distance_py(s1, s2)
assert distance == 3
def test_levenshtein_identical(self):
s1 = "hello"
s2 = "hello"
distance = scirs2.levenshtein_distance_py(s1, s2)
assert distance == 0
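    # Cross-check sketch: a textbook dynamic-programming Levenshtein
    # implementation in pure Python, compared against the binding on a few
    # pairs. The reference itself is standard; treating it as equivalent to
    # scirs2's definition (unit costs, no transpositions) is an assumption.
    def test_levenshtein_reference_crosscheck(self):
        def reference(a, b):
            # prev[j] holds the distance between a[:i-1] and b[:j].
            prev = list(range(len(b) + 1))
            for i, ca in enumerate(a, start=1):
                curr = [i]
                for j, cb in enumerate(b, start=1):
                    cost = 0 if ca == cb else 1
                    curr.append(min(prev[j] + 1,          # deletion
                                    curr[j - 1] + 1,      # insertion
                                    prev[j - 1] + cost))  # substitution
                prev = curr
            return prev[-1]

        for s1, s2 in [("kitten", "sitting"), ("flaw", "lawn"), ("", "abc")]:
            assert scirs2.levenshtein_distance_py(s1, s2) == reference(s1, s2)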
def test_hamming_distance(self):
s1 = "karolin"
s2 = "kathrin"
distance = scirs2.hamming_distance_py(s1, s2)
assert distance == 3
def test_jaro_winkler_similarity(self):
s1 = "martha"
s2 = "marhta"
similarity = scirs2.jaro_winkler_similarity_py(s1, s2)
assert 0.9 <= similarity <= 1.0
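    # Hedged addition: by definition, Jaro-Winkler similarity of identical
    # strings is 1.0; asserted with a tolerance in case the binding returns a
    # float with rounding.
    def test_jaro_winkler_identical(self):
        similarity = scirs2.jaro_winkler_similarity_py("martha", "martha")
        assert abs(similarity - 1.0) < 1e-9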
def test_cosine_similarity_text(self):
s1 = "the quick brown fox"
s2 = "the fast brown fox"
similarity = scirs2.cosine_similarity_text_py(s1, s2)
assert 0.5 <= similarity <= 1.0
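    # Hedged addition: cosine similarity of a non-empty text with itself
    # should be 1.0 regardless of the underlying tokenization or weighting,
    # since the two vectors are identical.
    def test_cosine_similarity_self(self):
        text = "the quick brown fox"
        similarity = scirs2.cosine_similarity_text_py(text, text)
        assert abs(similarity - 1.0) < 1e-6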
class TestTextStatistics:
def test_word_count(self):
text = "The quick brown fox jumps over the lazy dog"
count = scirs2.word_count_py(text)
assert count == 9
def test_character_count(self):
text = "Hello World!"
count = scirs2.character_count_py(text, include_spaces=False)
assert count == 10
def test_unique_words(self):
text = "the cat and the dog and the bird"
unique = scirs2.unique_words_py(text)
assert len(unique) == 5
def test_lexical_diversity(self):
text = "the cat sat on the mat"
diversity = scirs2.lexical_diversity_py(text)
assert 0 < diversity <= 1
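    # Reference sketch: lexical diversity is conventionally the type-token
    # ratio, unique words / total words; for "the cat sat on the mat" that is
    # 5 / 6. Whether lexical_diversity_py uses exactly this definition is an
    # assumption, so the pure-Python value is asserted on its own rather than
    # compared against the binding.
    def test_lexical_diversity_reference(self):
        words = "the cat sat on the mat".split()
        ttr = len(set(words)) / len(words)
        assert abs(ttr - 5 / 6) < 1e-9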
class TestTextNormalization:
def test_lowercase_normalization(self):
text = "Hello WORLD"
normalized = scirs2.normalize_case_py(text, case="lower")
assert normalized == "hello world"
def test_uppercase_normalization(self):
text = "Hello World"
normalized = scirs2.normalize_case_py(text, case="upper")
assert normalized == "HELLO WORLD"
def test_remove_punctuation(self):
text = "Hello, World! How are you?"
cleaned = scirs2.remove_punctuation_py(text)
assert "," not in cleaned
assert "!" not in cleaned
assert "?" not in cleaned
def test_remove_numbers(self):
text = "There are 123 apples and 456 oranges"
cleaned = scirs2.remove_numbers_py(text)
assert "123" not in cleaned
assert "456" not in cleaned
class TestEdgeCases:
def test_empty_text_tokenization(self):
tokenizer = scirs2.WordTokenizer()
tokens = tokenizer.tokenize("")
assert len(tokens) == 0
def test_single_character_tokenization(self):
tokenizer = scirs2.WordTokenizer()
tokens = tokenizer.tokenize("a")
assert len(tokens) >= 1
def test_unicode_text(self):
tokenizer = scirs2.WordTokenizer()
text = "Hello 世界 مرحبا"
tokens = tokenizer.tokenize(text)
assert len(tokens) > 0
def test_very_long_text(self):
tokenizer = scirs2.WordTokenizer()
text = " ".join(["word"] * 10000)
tokens = tokenizer.tokenize(text)
assert len(tokens) == 10000
def test_special_characters(self):
text = "!@#$%^&*()_+-=[]{}|;':,.<>?/"
tokenizer = scirs2.WordTokenizer()
tokens = tokenizer.tokenize(text)
assert isinstance(tokens, list)
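    # Hedged addition: tokenization should be deterministic; calling tokenize
    # twice on the same input must give identical output. A safe property to
    # pin down regardless of the tokenizer's internals.
    def test_tokenization_deterministic(self):
        tokenizer = scirs2.WordTokenizer()
        text = "Repeatable input, repeatable output."
        assert tokenizer.tokenize(text) == tokenizer.tokenize(text)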
if __name__ == "__main__":
pytest.main([__file__, "-v"])