import meta_oxide
import pytest
class TestJSONLDExtraction:
    """Single-object extraction through ``meta_oxide.extract_jsonld``.

    Each test embeds exactly one ``application/ld+json`` script in the
    document head and checks the top-level fields of the one object returned.
    """

    def test_extract_article(self):
        """An Article block yields one object carrying its scalar fields."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"description": "This is a test article",
"datePublished": "2024-01-15",
"author": {
"@type": "Person",
"name": "John Doe"
}
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        article = found[0]
        assert article["@type"] == "Article"
        assert article["headline"] == "Test Article"
        assert article["description"] == "This is a test article"
        assert article["datePublished"] == "2024-01-15"

    def test_extract_product(self):
        """A Product block yields one object with its name and SKU."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Product",
"name": "Wireless Headphones",
"description": "Noise-cancelling wireless headphones",
"sku": "WH-1000XM5",
"brand": {
"@type": "Brand",
"name": "Sony"
}
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        product = found[0]
        assert product["@type"] == "Product"
        assert product["name"] == "Wireless Headphones"
        assert product["sku"] == "WH-1000XM5"

    def test_extract_person(self):
        """A Person block yields one object with name, email and job title."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Person",
"name": "Jane Smith",
"email": "jane@example.com",
"jobTitle": "Software Engineer",
"url": "https://janesmith.com"
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        person = found[0]
        assert person["@type"] == "Person"
        assert person["name"] == "Jane Smith"
        assert person["email"] == "jane@example.com"
        assert person["jobTitle"] == "Software Engineer"

    def test_extract_organization(self):
        """An Organization block yields one object with its name and URL."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Organization",
"name": "Acme Corp",
"url": "https://acme.com",
"logo": "https://acme.com/logo.png",
"sameAs": [
"https://twitter.com/acme",
"https://facebook.com/acme"
]
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        organization = found[0]
        assert organization["@type"] == "Organization"
        assert organization["name"] == "Acme Corp"
        assert organization["url"] == "https://acme.com"
class TestJSONLDMultipleObjects:
    """Documents that carry more than one JSON-LD object or type."""

    def test_multiple_script_tags(self):
        """Two script tags produce two objects, in document order."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "First Article"
}
</script>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Person",
"name": "John Doe"
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 2
        # Checked one position at a time so the first mismatch is reported.
        for position, expected_type in enumerate(("Article", "Person")):
            assert found[position]["@type"] == expected_type

    def test_graph_array(self):
        """Each member of an ``@graph`` array is returned as its own object."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@graph": [
{
"@type": "Article",
"headline": "Article in Graph"
},
{
"@type": "Person",
"name": "Author Name"
},
{
"@type": "Organization",
"name": "Publisher Name"
}
]
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 3
        graph_order = ("Article", "Person", "Organization")
        for position, expected_type in enumerate(graph_order):
            assert found[position]["@type"] == expected_type

    def test_multiple_types(self):
        """An array-valued ``@type`` comes back as a list of type names."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": ["Article", "BlogPosting"],
"headline": "Blog Post"
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        declared_types = found[0]["@type"]
        assert declared_types == ["Article", "BlogPosting"]
class TestJSONLDEdgeCases:
    """Boundary inputs: no JSON-LD, empty or malformed blocks, non-ASCII text."""

    def test_empty_html(self):
        """A page without any JSON-LD script yields no objects."""
        page = "<html><body>No JSON-LD here</body></html>"
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 0

    def test_empty_script_tag(self):
        """A JSON-LD script tag with an empty body yields no objects."""
        page = """
<html>
<head>
<script type="application/ld+json"></script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 0

    def test_invalid_json(self):
        """A malformed block does not stop a later valid block being returned."""
        page = """
<html>
<head>
<script type="application/ld+json">
{invalid json}
</script>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Valid Article"
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        # Only a lower bound is required: the valid Article must be present.
        assert len(found) >= 1
        assert any(item.get("@type") == "Article" for item in found)

    def test_minimal_object(self):
        """An object holding only ``@context`` is still returned."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org"
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        bare_object = found[0]
        assert "@context" in bare_object

    def test_unicode_content(self):
        """Japanese and Cyrillic string values are returned unchanged."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "日本語のタイトル",
"description": "Описание на русском",
"author": {
"@type": "Person",
"name": "François Müller"
}
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        article = found[0]
        assert article["headline"] == "日本語のタイトル"
        assert article["description"] == "Описание на русском"
class TestJSONLDIntegration:
    """JSON-LD as one section of the ``meta_oxide.extract_all`` result."""

    def test_extract_all_includes_jsonld(self):
        """``extract_all`` reports meta, opengraph and jsonld sections."""
        page = """
<html>
<head>
<title>Test Page</title>
<meta property="og:title" content="OG Title">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Article Title"
}
</script>
</head>
<body></body>
</html>
"""
        result = meta_oxide.extract_all(page)
        for section in ("meta", "opengraph", "jsonld"):
            assert section in result
        jsonld_items = result["jsonld"]
        assert len(jsonld_items) == 1
        assert jsonld_items[0]["@type"] == "Article"
        assert jsonld_items[0]["headline"] == "Article Title"

    def test_extract_all_without_jsonld(self):
        """Without JSON-LD the ``jsonld`` section is either absent or empty."""
        page = """
<html>
<head>
<title>Test Page</title>
<meta property="og:title" content="OG Title">
</head>
<body></body>
</html>
"""
        result = meta_oxide.extract_all(page)
        for section in ("meta", "opengraph"):
            assert section in result
        # Both shapes are accepted: key missing, or key present but empty.
        if "jsonld" in result:
            assert len(result.get("jsonld", [])) == 0

    def test_extract_all_multiple_jsonld(self):
        """Two script blocks give two entries in the ``jsonld`` section."""
        page = """
<html>
<head>
<title>Test Page</title>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Article"
}
</script>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Person",
"name": "Author"
}
</script>
</head>
<body></body>
</html>
"""
        result = meta_oxide.extract_all(page)
        assert "jsonld" in result
        jsonld_items = result["jsonld"]
        assert len(jsonld_items) == 2
class TestJSONLDRealWorld:
    """Larger payloads with nested objects, modelled on typical site markup."""

    def test_news_article(self):
        """A NewsArticle with nested author and publisher is extracted."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "NewsArticle",
"headline": "Breaking: Major Discovery Announced",
"description": "Scientists announce breakthrough",
"datePublished": "2024-01-15T10:30:00Z",
"dateModified": "2024-01-15T12:00:00Z",
"author": {
"@type": "Person",
"name": "Jane Reporter"
},
"publisher": {
"@type": "Organization",
"name": "News Corp",
"logo": {
"@type": "ImageObject",
"url": "https://news.com/logo.png"
}
},
"image": "https://news.com/article-image.jpg"
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        news = found[0]
        assert news["@type"] == "NewsArticle"
        assert news["headline"] == "Breaking: Major Discovery Announced"

    def test_ecommerce_product(self):
        """A Product with nested offer and rating keeps its top-level fields."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Product",
"name": "Laptop Computer",
"description": "High-performance laptop",
"sku": "LAPTOP-123",
"brand": {
"@type": "Brand",
"name": "TechBrand"
},
"offers": {
"@type": "Offer",
"price": "999.99",
"priceCurrency": "USD",
"availability": "https://schema.org/InStock"
},
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "4.5",
"reviewCount": "150"
}
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        product = found[0]
        assert product["@type"] == "Product"
        assert product["name"] == "Laptop Computer"
        assert product["sku"] == "LAPTOP-123"

    def test_blog_posting(self):
        """A BlogPosting keeps its integer ``wordCount`` as a number."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "BlogPosting",
"headline": "How to Build a Web Scraper",
"description": "A comprehensive guide",
"datePublished": "2024-01-10",
"author": {
"@type": "Person",
"name": "Tech Blogger",
"url": "https://blog.com/author/tech-blogger"
},
"wordCount": 2500,
"keywords": ["web scraping", "python", "tutorial"]
}
</script>
</head>
<body></body>
</html>
"""
        found = meta_oxide.extract_jsonld(page)
        assert len(found) == 1
        post = found[0]
        assert post["@type"] == "BlogPosting"
        assert post["wordCount"] == 2500
class TestJSONLDDataTypes:
    """How JSON value types surface in the objects from ``extract_jsonld``.

    Every test first asserts that exactly one object was extracted, matching
    the other test classes in this module, so that an empty result fails as
    an assertion rather than as an ``IndexError`` on ``objects[0]``.
    """

    def test_string_values(self):
        """JSON strings are returned as ``str`` instances."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "String Value",
"description": "Another string"
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert isinstance(objects[0]["headline"], str)
        assert isinstance(objects[0]["description"], str)

    def test_numeric_values(self):
        """JSON integers and decimals compare equal to Python numbers."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"wordCount": 1500,
"rating": 4.5
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert objects[0]["wordCount"] == 1500
        assert objects[0]["rating"] == 4.5

    def test_boolean_values(self):
        """JSON ``true``/``false`` are returned as the ``bool`` singletons."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Product",
"inStock": true,
"discontinued": false
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert objects[0]["inStock"] is True
        assert objects[0]["discontinued"] is False

    def test_array_values(self):
        """A JSON array of strings keeps its elements and their order."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Organization",
"sameAs": [
"https://twitter.com/org",
"https://facebook.com/org"
]
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert "sameAs" in objects[0]
        # Key presence alone would pass for a dropped or reordered array, so
        # the contents are compared too (same list comparison that
        # test_multiple_types applies to an array-valued "@type").
        assert objects[0]["sameAs"] == [
            "https://twitter.com/org",
            "https://facebook.com/org",
        ]

    def test_null_values(self):
        """A key whose JSON value is ``null`` is kept in the object."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test",
"description": null
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert objects[0]["headline"] == "Test"
        # Only the key's presence is pinned; the Python value that JSON null
        # maps to is deliberately not asserted here.
        assert "description" in objects[0]
if __name__ == "__main__":
    # Running this file as a script hands it to pytest in verbose mode.
    pytest.main(args=[__file__, "-v"])