import pytest
import omniparse
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
class TestConcurrentExtraction:
    """Thread-pool tests for ``omniparse.extract_from_path``."""

    def test_concurrent_extraction_basic(self):
        """Extract four different files on a 4-worker pool and check result types."""
        paths = [
            "test_data/text/sample.txt",
            "test_data/text/sample.json",
            "test_data/text/sample.csv",
            "test_data/document/sample.pdf",
        ]

        extracted = []
        with ThreadPoolExecutor(max_workers=4) as pool:
            pending = [pool.submit(omniparse.extract_from_path, path) for path in paths]
            # Completion order is arbitrary; only count and types are checked.
            for done in as_completed(pending):
                extracted.append(done.result())

        assert len(extracted) == len(paths)
        for item in extracted:
            assert isinstance(item, omniparse.ExtractionResult)
            assert isinstance(item.mime_type, str)

    def test_concurrent_same_file(self):
        """Extract the same PDF ten times concurrently; every result must be a PDF."""
        pdf_path = "test_data/document/sample.pdf"
        repeat_count = 10

        extracted = []
        with ThreadPoolExecutor(max_workers=4) as pool:
            pending = []
            for _ in range(repeat_count):
                pending.append(pool.submit(omniparse.extract_from_path, pdf_path))
            for done in as_completed(pending):
                extracted.append(done.result())

        assert len(extracted) == repeat_count
        detected = [item.mime_type for item in extracted]
        assert all(mime == "application/pdf" for mime in detected)

    def test_concurrent_different_formats(self):
        """Extract five formats in parallel and compare MIME types in input order."""
        paths = [
            "test_data/text/sample.txt",
            "test_data/text/sample.json",
            "test_data/document/sample.pdf",
            "test_data/image/sample.png",
            "test_data/archive/sample.zip",
        ]
        # Listed in the same order as ``paths``; ``executor.map`` yields
        # results in input order, so the two sequences line up pairwise.
        wanted_mimes = [
            "text/plain",
            "application/json",
            "application/pdf",
            "image/png",
            "application/zip",
        ]

        with ThreadPoolExecutor(max_workers=5) as pool:
            extracted = list(pool.map(omniparse.extract_from_path, paths))

        assert len(extracted) == len(paths)
        for item, wanted in zip(extracted, wanted_mimes):
            assert item.mime_type == wanted
class TestGILRelease:
    """Timing and threading checks around ``omniparse.extract_from_path``.

    NOTE(review): the class name suggests these tests are meant to show that
    extraction releases the GIL; the tests themselves only observe elapsed
    time and thread completion -- confirm the intent against the library.
    """

    def test_parallel_speedup(self):
        """Assert 8 PDF extractions finish faster on 4 threads than sequentially."""
        files = ["test_data/document/sample.pdf"] * 8

        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump with wall-clock adjustments and is coarse on some platforms.
        start_seq = time.perf_counter()
        for file_path in files:
            omniparse.extract_from_path(file_path)
        sequential_time = time.perf_counter() - start_seq

        start_par = time.perf_counter()
        with ThreadPoolExecutor(max_workers=4) as executor:
            list(executor.map(omniparse.extract_from_path, files))
        parallel_time = time.perf_counter() - start_par

        # Same condition as ``sequential_time / parallel_time > 1.0`` for a
        # positive parallel time, but it cannot raise ZeroDivisionError.
        assert sequential_time > parallel_time, (
            f"No speedup: sequential={sequential_time:.4f}s, "
            f"parallel={parallel_time:.4f}s"
        )

    def test_threads_can_run_simultaneously(self):
        """Run four extractions on separate threads and require all to succeed.

        Each thread records ``(thread_id, duration, result)`` on success or
        ``(thread_id, exception)`` on failure. The test asserts only that no
        errors were recorded and that every thread produced a result; it does
        not itself measure overlap between threads.
        """
        results = []
        errors = []

        def extract_and_record(file_path, thread_id):
            # Exceptions are captured rather than raised: an exception that
            # escapes a thread target would not fail the test on its own.
            try:
                start = time.perf_counter()
                result = omniparse.extract_from_path(file_path)
                duration = time.perf_counter() - start
                results.append((thread_id, duration, result))
            except Exception as e:
                errors.append((thread_id, e))

        files = [
            "test_data/document/sample.pdf",
            "test_data/text/sample.json",
            "test_data/text/sample.csv",
            "test_data/image/sample.png",
        ]

        threads = []
        for i, file_path in enumerate(files):
            thread = threading.Thread(target=extract_and_record, args=(file_path, i))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

        assert not errors, f"Errors occurred: {errors}"
        assert len(results) == len(files)
class TestExtractionOverhead:
    """Upper-bound timing checks for sequential ``omniparse.extract_from_path`` calls.

    Durations are taken with ``time.perf_counter()`` because it is monotonic
    and high-resolution, unlike ``time.time()``.
    """

    def test_extraction_performance(self):
        """A single PDF extraction must complete in under one second."""
        file_path = "test_data/document/sample.pdf"

        start = time.perf_counter()
        result = omniparse.extract_from_path(file_path)
        duration = time.perf_counter() - start

        assert duration < 1.0, f"Extraction took {duration:.4f}s"
        assert isinstance(result, omniparse.ExtractionResult)

    def test_batch_extraction_performance(self):
        """Ten sequential extractions (five files, twice) must take under five seconds."""
        files = [
            "test_data/text/sample.txt",
            "test_data/text/sample.json",
            "test_data/text/sample.csv",
            "test_data/document/sample.pdf",
            "test_data/image/sample.png",
        ] * 2

        start = time.perf_counter()
        results = [omniparse.extract_from_path(f) for f in files]
        duration = time.perf_counter() - start

        assert len(results) == len(files)
        assert duration < 5.0, f"Batch extraction took {duration:.4f}s"

    def test_repeated_extraction_performance(self):
        """Twenty extractions of the same JSON file must average under 0.1 s each."""
        file_path = "test_data/text/sample.json"
        num_iterations = 20

        start = time.perf_counter()
        for _ in range(num_iterations):
            result = omniparse.extract_from_path(file_path)
            assert result.mime_type == "application/json"
        duration = time.perf_counter() - start

        avg_time = duration / num_iterations
        assert avg_time < 0.1, f"Average extraction time was {avg_time:.4f}s"
class TestMemoryEfficiency:
    """Repeated and large-input extraction tests.

    NOTE(review): despite the class name, none of these tests measure memory
    use; they only check that extraction keeps returning valid results.
    """

    def test_large_file_extraction(self):
        """Extract ``test_data/large_test.txt``; skip when the call raises ``OSError``."""
        # Only the extraction call is guarded, so the skip can be triggered
        # solely by that call and never by the assertions below.
        # OSError is the canonical Python 3 name; IOError is an alias of it.
        try:
            result = omniparse.extract_from_path("test_data/large_test.txt")
        except OSError:
            pytest.skip("Large test file not available")

        assert isinstance(result, omniparse.ExtractionResult)
        assert result.mime_type == "text/plain"
        assert isinstance(result.content, str)

    def test_multiple_large_extractions(self):
        """Extract the sample PDF fifty times in a row, checking each result's type."""
        file_path = "test_data/document/sample.pdf"
        num_iterations = 50

        for _ in range(num_iterations):
            result = omniparse.extract_from_path(file_path)
            assert isinstance(result, omniparse.ExtractionResult)

    def test_concurrent_large_extractions(self):
        """Extract the sample PDF twenty times on a 4-worker pool; all must be PDFs."""
        files = ["test_data/document/sample.pdf"] * 20

        with ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(omniparse.extract_from_path, files))

        assert len(results) == len(files)
        for result in results:
            assert result.mime_type == "application/pdf"
class TestExtractionFromBytes:
    """Tests for ``omniparse.extract_from_bytes`` on in-memory file contents."""

    def test_bytes_extraction_performance(self):
        """Extracting the sample PDF from bytes must take under one second."""
        with open("test_data/document/sample.pdf", "rb") as f:
            data = f.read()

        # Only the extraction is timed; reading the file happens beforehand.
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        result = omniparse.extract_from_bytes(data)
        duration = time.perf_counter() - start

        assert duration < 1.0, f"Extraction took {duration:.4f}s"
        assert result.mime_type == "application/pdf"

    def test_concurrent_bytes_extraction(self):
        """Extract the same JSON bytes ten times on a 4-worker pool."""
        with open("test_data/text/sample.json", "rb") as f:
            data = f.read()

        num_extractions = 10

        def extract_bytes():
            # Every task reads the same ``data`` object captured by this closure.
            return omniparse.extract_from_bytes(data)

        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(extract_bytes) for _ in range(num_extractions)]
            results = [future.result() for future in as_completed(futures)]

        assert len(results) == num_extractions
        for result in results:
            assert result.mime_type == "application/json"
class TestScalability:
    """Thread-pool extraction runs over varying worker counts and many small files."""

    def test_scaling_with_thread_count(self):
        """Time 16 JSON extractions with 1, 2 and 4 workers.

        Only checks that each run took a measurable, positive amount of time;
        no relationship between the worker counts is asserted.
        """
        files = ["test_data/text/sample.json"] * 16
        times = {}

        for workers in [1, 2, 4]:
            # perf_counter() has high resolution, so a fast run does not
            # register as zero the way it can with a coarse time.time() clock.
            start = time.perf_counter()
            with ThreadPoolExecutor(max_workers=workers) as executor:
                list(executor.map(omniparse.extract_from_path, files))
            times[workers] = time.perf_counter() - start

        # Check every measured configuration, including the 2-worker run.
        for workers, elapsed in times.items():
            assert elapsed > 0, f"Non-positive duration for {workers} worker(s)"

    def test_many_small_files(self):
        """Extract 20 small text files on a 4-worker pool in under three seconds."""
        files = [
            "test_data/text/sample.txt",
            "test_data/text/sample.json",
            "test_data/text/minimal.json",
            "test_data/text/minimal.csv",
        ] * 5

        start = time.perf_counter()
        with ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(omniparse.extract_from_path, files))
        duration = time.perf_counter() - start

        assert len(results) == len(files)
        assert duration < 3.0, f"Extraction of {len(files)} files took {duration:.4f}s"