import pytest
import omniparse
class TestExtractFromPath:
    """Checks for ``omniparse.extract_from_path`` on one sample file per format.

    Each test extracts a fixture under ``test_data/`` and verifies the
    reported MIME type together with the basic shape of the result.
    """

    @staticmethod
    def _check_payload(extracted):
        """Assert the confidence/content/metadata checks every test here shares."""
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.content, str)
        assert isinstance(extracted.metadata, dict)

    def test_extract_pdf(self):
        """PDF sample: result type, MIME type, shared checks, non-empty content."""
        extracted = omniparse.extract_from_path("test_data/document/sample.pdf")
        assert isinstance(extracted, omniparse.ExtractionResult)
        assert extracted.mime_type == "application/pdf"
        self._check_payload(extracted)
        assert len(extracted.content) > 0

    def test_extract_json(self):
        """JSON sample is reported as ``application/json``."""
        extracted = omniparse.extract_from_path("test_data/text/sample.json")
        assert extracted.mime_type == "application/json"
        self._check_payload(extracted)

    def test_extract_csv(self):
        """CSV sample is reported as ``text/csv``."""
        extracted = omniparse.extract_from_path("test_data/text/sample.csv")
        assert extracted.mime_type == "text/csv"
        self._check_payload(extracted)

    def test_extract_plain_text(self):
        """Plain-text sample is reported as ``text/plain``."""
        extracted = omniparse.extract_from_path("test_data/text/sample.txt")
        assert extracted.mime_type == "text/plain"
        self._check_payload(extracted)

    def test_extract_xml(self):
        """XML sample is accepted under either XML MIME type."""
        extracted = omniparse.extract_from_path("test_data/text/sample.xml")
        assert extracted.mime_type in ("application/xml", "text/xml")
        self._check_payload(extracted)

    def test_extract_docx(self):
        """DOCX sample is reported with the OOXML word-processing MIME type."""
        docx_mime = (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )
        extracted = omniparse.extract_from_path("test_data/document/sample.docx")
        assert extracted.mime_type == docx_mime
        self._check_payload(extracted)
class TestExtractFromBytes:
    """Checks for ``omniparse.extract_from_bytes`` fed with raw fixture bytes.

    Fixtures are opened with relative paths, so the tests rely on the
    working directory containing ``test_data/``.
    """

    @staticmethod
    def _slurp(fixture_path):
        """Return the complete binary contents of ``fixture_path``."""
        with open(fixture_path, "rb") as handle:
            return handle.read()

    def test_extract_from_bytes_json(self):
        """JSON bytes: result type, MIME type, confidence, content, metadata."""
        payload = self._slurp("test_data/text/sample.json")
        extracted = omniparse.extract_from_bytes(payload)
        assert isinstance(extracted, omniparse.ExtractionResult)
        assert extracted.mime_type == "application/json"
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.content, str)
        assert isinstance(extracted.metadata, dict)

    def test_extract_from_bytes_with_hint(self):
        """CSV bytes passed with ``mime_hint="text/csv"`` come back as ``text/csv``."""
        payload = self._slurp("test_data/text/sample.csv")
        extracted = omniparse.extract_from_bytes(payload, mime_hint="text/csv")
        assert extracted.mime_type == "text/csv"
        assert isinstance(extracted.content, str)
        assert isinstance(extracted.metadata, dict)

    def test_extract_from_bytes_pdf(self):
        """PDF bytes: MIME type, confidence, and string content."""
        payload = self._slurp("test_data/document/sample.pdf")
        extracted = omniparse.extract_from_bytes(payload)
        assert extracted.mime_type == "application/pdf"
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.content, str)

    def test_extract_from_bytes_plain_text(self):
        """Plain-text bytes: MIME type and string content."""
        payload = self._slurp("test_data/text/sample.txt")
        extracted = omniparse.extract_from_bytes(payload)
        assert extracted.mime_type == "text/plain"
        assert isinstance(extracted.content, str)
class TestResultStructure:
    """Checks on the attributes exposed by the object ``extract_from_path`` returns."""

    def test_result_has_all_properties(self):
        """The result exposes the four attributes the other tests read."""
        extracted = omniparse.extract_from_path("test_data/text/sample.json")
        # Checked in a fixed order so the first missing attribute is the one reported.
        for attribute in ("mime_type", "content", "metadata", "detection_confidence"):
            assert hasattr(extracted, attribute)

    def test_mime_type_is_string(self):
        """``mime_type`` is a non-empty ``str``."""
        mime = omniparse.extract_from_path("test_data/text/sample.txt").mime_type
        assert isinstance(mime, str)
        assert len(mime) > 0

    def test_content_types(self):
        """``content`` is ``None``, ``str``, or ``bytes``."""
        body = omniparse.extract_from_path("test_data/text/sample.txt").content
        assert body is None or isinstance(body, (str, bytes))

    def test_metadata_is_dict(self):
        """``metadata`` is a ``dict``."""
        extracted = omniparse.extract_from_path("test_data/text/sample.json")
        assert isinstance(extracted.metadata, dict)

    def test_detection_confidence_range(self):
        """``detection_confidence`` is a ``float`` within the closed range [0.0, 1.0]."""
        extracted = omniparse.extract_from_path("test_data/text/sample.txt")
        confidence = extracted.detection_confidence
        assert isinstance(confidence, float)
        assert 0.0 <= confidence <= 1.0

    def test_result_repr(self):
        """``repr`` yields a string naming the class and the detected MIME type."""
        extracted = omniparse.extract_from_path("test_data/text/sample.txt")
        rendered = repr(extracted)
        assert isinstance(rendered, str)
        assert "ExtractionResult" in rendered
        assert extracted.mime_type in rendered
class TestMultipleFormats:
    """Parametrized sweep of ``extract_from_path`` over several fixture formats."""

    @pytest.mark.parametrize(
        "file_path,expected_mime",
        [
            ("test_data/text/sample.txt", "text/plain"),
            ("test_data/text/sample.json", "application/json"),
            ("test_data/text/sample.csv", "text/csv"),
            ("test_data/text/sample.xml", "application/xml"),
            ("test_data/document/sample.pdf", "application/pdf"),
        ],
    )
    def test_multiple_text_formats(self, file_path, expected_mime):
        """Check the MIME type, a positive confidence, and the content/metadata types.

        Args:
            file_path: Fixture path handed to ``omniparse.extract_from_path``.
            expected_mime: MIME type the fixture should be detected as; for
                ``"application/xml"`` the ``"text/xml"`` spelling also passes.
        """
        extracted = omniparse.extract_from_path(file_path)
        if expected_mime != "application/xml":
            assert extracted.mime_type == expected_mime
        else:
            # The XML fixture is accepted under either XML MIME type.
            assert extracted.mime_type in ("application/xml", "text/xml")
        assert extracted.detection_confidence > 0.0
        assert isinstance(extracted.content, str)
        assert isinstance(extracted.metadata, dict)