import pytest
class TestRecursiveCharacterTextSplitter:
    """Behavioural tests for ``vecstore.RecursiveCharacterTextSplitter``.

    The splitter is built from two positional integers, presumably
    ``(chunk_size, chunk_overlap)`` -- confirm against the vecstore API.
    Each test imports the class locally, so an import failure is reported
    per test rather than at collection time.
    """

    def test_create_splitter(self):
        """Constructing a splitter yields an object."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(500, 50)
        assert splitter is not None

    def test_splitter_repr(self):
        """``repr()`` of a splitter mentions the class name."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(500, 50)
        repr_str = repr(splitter)
        assert "RecursiveCharacterTextSplitter" in repr_str

    def test_split_short_text(self):
        """A short text comes back as exactly one, unchanged chunk."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(500, 50)
        text = "This is a short text."
        chunks = splitter.split_text(text)
        assert len(chunks) == 1
        assert chunks[0] == text

    def test_split_long_text(self):
        """250 unbroken characters are split into more than one chunk."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(100, 10)
        text = "A" * 250
        chunks = splitter.split_text(text)
        assert len(chunks) > 1
        # Deliberately loose bound: twice the first constructor argument.
        for chunk in chunks:
            assert len(chunk) <= 200

    def test_split_with_paragraphs(self):
        """Text containing blank-line paragraph breaks produces chunks."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(100, 20)
        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        chunks = splitter.split_text(text)
        assert len(chunks) >= 1

    def test_split_with_sentences(self):
        """Text made of several sentences produces chunks."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(50, 10)
        text = "First sentence. Second sentence. Third sentence. Fourth sentence."
        chunks = splitter.split_text(text)
        assert len(chunks) >= 1

    def test_different_chunk_sizes(self):
        """A 100/10 splitter yields more chunks than a 200/20 one."""
        from vecstore import RecursiveCharacterTextSplitter

        text = "A" * 500
        splitter_small = RecursiveCharacterTextSplitter(100, 10)
        chunks_small = splitter_small.split_text(text)
        splitter_large = RecursiveCharacterTextSplitter(200, 20)
        chunks_large = splitter_large.split_text(text)
        assert len(chunks_small) > len(chunks_large)

    def test_chunk_overlap(self):
        """A 260-character text with a 50/10 splitter yields several chunks."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(50, 10)
        text = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" * 10
        chunks = splitter.split_text(text)
        # Asserted unconditionally: guarding this check with
        # ``if len(chunks) > 1`` would make it impossible to fail.
        assert len(chunks) >= 2

    def test_empty_text(self):
        """Splitting the empty string returns a list."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(100, 10)
        chunks = splitter.split_text("")
        assert isinstance(chunks, list)

    def test_whitespace_text(self):
        """Splitting whitespace-only text returns a list."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(100, 10)
        text = " \n\n \t\t "
        chunks = splitter.split_text(text)
        assert isinstance(chunks, list)

    def test_unicode_text(self):
        """Non-ASCII text is split into ``str`` chunks."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(100, 10)
        text = "Hello 世界! " * 20
        chunks = splitter.split_text(text)
        assert len(chunks) >= 1
        for chunk in chunks:
            assert isinstance(chunk, str)

    def test_code_text(self):
        """A source-code-like sample produces at least one chunk."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(200, 20)
        # The literal is kept verbatim; its lines are flush-left on purpose.
        code = """
def function1():
print("Hello")
return 42
def function2():
print("World")
return 24
"""
        chunks = splitter.split_text(code)
        assert len(chunks) >= 1

    def test_markdown_text(self):
        """A Markdown sample with three header levels produces chunks."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(150, 20)
        markdown = """
# Header 1
Some content here.
## Header 2
More content.
### Header 3
Even more content.
"""
        chunks = splitter.split_text(markdown)
        assert len(chunks) >= 1

    def test_very_small_chunks(self):
        """A 10/2 splitter still handles a 24-character sentence."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(10, 2)
        text = "This is a test sentence."
        chunks = splitter.split_text(text)
        assert len(chunks) >= 1
        # Deliberately loose bound: three times the first constructor argument.
        for chunk in chunks:
            assert len(chunk) <= 30

    def test_very_large_chunks(self):
        """With a 10000/100 splitter a short text stays in one chunk."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(10000, 100)
        text = "Short text"
        chunks = splitter.split_text(text)
        assert len(chunks) == 1

    def test_zero_overlap(self):
        """A second constructor argument of 0 still splits a long text."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(100, 0)
        text = "A" * 250
        chunks = splitter.split_text(text)
        assert len(chunks) >= 2

    def test_realistic_document(self):
        """A multi-section prose document yields non-empty, bounded chunks."""
        from vecstore import RecursiveCharacterTextSplitter

        splitter = RecursiveCharacterTextSplitter(500, 50)
        document = """
The Rust Programming Language
Rust is a systems programming language that runs blazingly fast,
prevents segfaults, and guarantees thread safety.
Memory Safety
Rust's ownership system is its most distinctive feature. At any given time,
you can have either one mutable reference or any number of immutable references.
Zero-Cost Abstractions
Rust provides high-level abstractions like iterators, closures, and pattern
matching, but these abstractions compile down to code as efficient as if you
had written the low-level code by hand.
"""
        chunks = splitter.split_text(document)
        assert len(chunks) >= 1
        # Every chunk must be non-empty and stay under a loose 600-char cap
        # (the splitter itself is configured with 500).
        for chunk in chunks:
            assert 0 < len(chunk) <= 600
class TestTextSplitterEdgeCases:
    """Edge-case tests for ``vecstore.RecursiveCharacterTextSplitter``.

    The class under test is imported inside each test method.
    """

    def test_single_long_word(self):
        """200 characters without any separator still produce chunks."""
        from vecstore import RecursiveCharacterTextSplitter

        pieces = RecursiveCharacterTextSplitter(50, 5).split_text("A" * 200)
        assert len(pieces) >= 1

    def test_repeating_separators(self):
        """A run of six consecutive newlines between words is handled."""
        from vecstore import RecursiveCharacterTextSplitter

        sample = "Word\n\n\n\n\n\nAnother word"
        pieces = RecursiveCharacterTextSplitter(100, 10).split_text(sample)
        assert len(pieces) >= 1

    def test_mixed_separators(self):
        """Paragraph breaks mixed with ``.``, ``!`` and ``?`` are handled."""
        from vecstore import RecursiveCharacterTextSplitter

        sample = "Para1.\n\nPara2. Sentence! Question? Another sentence."
        pieces = RecursiveCharacterTextSplitter(100, 10).split_text(sample)
        assert len(pieces) >= 1

    def test_consistency(self):
        """Splitting the same text twice gives the same chunks both times."""
        from vecstore import RecursiveCharacterTextSplitter

        text_splitter = RecursiveCharacterTextSplitter(100, 10)
        sample = "Same text every time. " * 20
        first_run = text_splitter.split_text(sample)
        second_run = text_splitter.split_text(sample)
        # Same number of chunks, and each chunk matches its counterpart.
        assert len(first_run) == len(second_run)
        assert all(left == right for left, right in zip(first_run, second_run))