import pytest
import omniparse
class TestTextFormats:
    """Extraction checks for the fixtures under ``test_data/text``."""

    def test_plain_text_extraction(self):
        """sample.txt is ``text/plain`` with non-empty string content."""
        extracted = omniparse.extract_from_path("test_data/text/sample.txt")
        assert extracted.mime_type == "text/plain"
        body = extracted.content
        assert isinstance(body, str)
        assert len(body) > 0
        assert extracted.detection_confidence > 0.5

    def test_json_extraction(self):
        """sample.json is ``application/json`` with non-empty string content."""
        extracted = omniparse.extract_from_path("test_data/text/sample.json")
        assert extracted.mime_type == "application/json"
        body = extracted.content
        assert isinstance(body, str)
        assert extracted.detection_confidence > 0.5
        assert len(body) > 0

    def test_csv_extraction(self):
        """sample.csv is ``text/csv`` with string content and dict metadata."""
        extracted = omniparse.extract_from_path("test_data/text/sample.csv")
        assert extracted.mime_type == "text/csv"
        assert isinstance(extracted.content, str)
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_xml_extraction(self):
        """sample.xml is reported under either accepted XML MIME type."""
        extracted = omniparse.extract_from_path("test_data/text/sample.xml")
        # Both spellings are accepted by this test.
        assert extracted.mime_type in ("application/xml", "text/xml")
        body = extracted.content
        assert isinstance(body, str)
        assert extracted.detection_confidence > 0.5
        assert len(body) > 0

    def test_minimal_json(self):
        """minimal.json is ``application/json`` with string content."""
        extracted = omniparse.extract_from_path("test_data/text/minimal.json")
        assert extracted.mime_type == "application/json"
        assert isinstance(extracted.content, str)

    def test_minimal_csv(self):
        """minimal.csv is ``text/csv`` with string content."""
        extracted = omniparse.extract_from_path("test_data/text/minimal.csv")
        assert extracted.mime_type == "text/csv"
        assert isinstance(extracted.content, str)
class TestDocumentFormats:
    """Extraction checks for the fixtures under ``test_data/document``."""

    def test_pdf_extraction(self):
        """sample.pdf is ``application/pdf`` with string content and dict metadata."""
        extracted = omniparse.extract_from_path("test_data/document/sample.pdf")
        assert extracted.mime_type == "application/pdf"
        assert isinstance(extracted.content, str)
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_docx_extraction(self):
        """sample.docx carries the OOXML word-processing MIME type."""
        extracted = omniparse.extract_from_path("test_data/document/sample.docx")
        assert (
            extracted.mime_type
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )
        assert isinstance(extracted.content, str)
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_odt_extraction(self):
        """sample.odt carries the OpenDocument text MIME type."""
        extracted = omniparse.extract_from_path("test_data/document/sample.odt")
        assert extracted.mime_type == "application/vnd.oasis.opendocument.text"
        assert isinstance(extracted.content, str)
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_empty_pdf(self):
        """empty.pdf is still detected as PDF; content may be ``None`` or a string."""
        extracted = omniparse.extract_from_path("test_data/document/empty.pdf")
        assert extracted.mime_type == "application/pdf"
        assert extracted.detection_confidence > 0.5
        body = extracted.content
        assert body is None or isinstance(body, str)
class TestImageFormats:
    """Extraction checks for the fixtures under ``test_data/image``."""

    def test_png_extraction(self):
        """sample.png is ``image/png``; content may be ``None``, text or bytes."""
        extracted = omniparse.extract_from_path("test_data/image/sample.png")
        assert extracted.mime_type == "image/png"
        assert extracted.detection_confidence > 0.5
        payload = extracted.content
        assert payload is None or isinstance(payload, (str, bytes))
        assert isinstance(extracted.metadata, dict)

    def test_jpeg_extraction(self):
        """sample.jpg is ``image/jpeg`` with dict metadata."""
        extracted = omniparse.extract_from_path("test_data/image/sample.jpg")
        assert extracted.mime_type == "image/jpeg"
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_tiff_extraction(self):
        """sample.tiff is ``image/tiff`` with dict metadata."""
        extracted = omniparse.extract_from_path("test_data/image/sample.tiff")
        assert extracted.mime_type == "image/tiff"
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_empty_png(self):
        """empty.png is still detected as ``image/png`` with confidence > 0.5."""
        extracted = omniparse.extract_from_path("test_data/image/empty.png")
        assert extracted.mime_type == "image/png"
        assert extracted.detection_confidence > 0.5
class TestArchiveFormats:
    """Extraction checks for the fixtures under ``test_data/archive``."""

    def test_zip_extraction(self):
        """sample.zip is ``application/zip`` with dict metadata."""
        extracted = omniparse.extract_from_path("test_data/archive/sample.zip")
        assert extracted.mime_type == "application/zip"
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_tar_extraction(self):
        """sample.tar is reported under either accepted tar MIME type."""
        extracted = omniparse.extract_from_path("test_data/archive/sample.tar")
        # Both spellings are accepted by this test.
        assert extracted.mime_type in ("application/x-tar", "application/tar")
        assert extracted.detection_confidence > 0.5
        assert isinstance(extracted.metadata, dict)

    def test_empty_zip(self):
        """empty.zip is still detected as ``application/zip`` with confidence > 0.5."""
        extracted = omniparse.extract_from_path("test_data/archive/empty.zip")
        assert extracted.mime_type == "application/zip"
        assert extracted.detection_confidence > 0.5
class TestMetadataExtraction:
    """Checks on the shape of the ``metadata`` mapping returned by extraction."""

    # Types permitted for the items of a list-valued metadata entry.
    _ITEM_TYPES = (str, int, float, bool, type(None))
    # Types permitted for a top-level metadata value.
    _VALUE_TYPES = (str, int, float, bool, list, type(None))

    def test_metadata_types(self):
        """Every PDF metadata key is a string and every value an allowed type."""
        extracted = omniparse.extract_from_path("test_data/document/sample.pdf")
        for name, entry in extracted.metadata.items():
            assert isinstance(name, str)
            assert isinstance(entry, self._VALUE_TYPES)

    def test_metadata_list_values(self):
        """Items inside list-valued JSON metadata entries are scalars or ``None``."""
        extracted = omniparse.extract_from_path("test_data/text/sample.json")
        for entry in extracted.metadata.values():
            if not isinstance(entry, list):
                continue  # only list-valued entries are inspected
            for element in entry:
                assert isinstance(element, self._ITEM_TYPES)

    def test_empty_metadata(self):
        """Extracting empty.txt still yields a dict for ``metadata``."""
        extracted = omniparse.extract_from_path("test_data/text/empty.txt")
        assert isinstance(extracted.metadata, dict)
class TestFormatCategories:
    """Generic result-shape checks run over one fixture list spanning categories."""

    # (fixture path, category label) pairs fed to the parametrized test below.
    _CATEGORY_CASES = [
        ("test_data/text/sample.txt", "text"),
        ("test_data/text/sample.json", "text"),
        ("test_data/text/sample.csv", "text"),
        ("test_data/document/sample.pdf", "document"),
        ("test_data/document/sample.docx", "document"),
        ("test_data/image/sample.png", "image"),
        ("test_data/image/sample.jpg", "image"),
        ("test_data/archive/sample.zip", "archive"),
    ]

    @pytest.mark.parametrize("file_path,category", _CATEGORY_CASES)
    def test_category_extraction(self, file_path, category):
        """Each fixture yields an ``ExtractionResult`` with basic fields populated.

        NOTE(review): ``category`` is not asserted on; it only labels the
        parametrized case. Confirm whether a category check was intended.
        """
        extracted = omniparse.extract_from_path(file_path)
        assert isinstance(extracted, omniparse.ExtractionResult)
        detected_type = extracted.mime_type
        assert isinstance(detected_type, str)
        assert len(detected_type) > 0
        assert extracted.detection_confidence > 0.0
        assert isinstance(extracted.metadata, dict)