import meta_oxide
import pytest
class TestBookBasic:
    """Basic Book extraction: name, author, ISBN, description, URL and image."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_book_minimal(self):
        """A Book with only a name yields one object with its type and name."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Great Gatsby"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["@type"] == "Book"
        assert book["name"] == "The Great Gatsby"

    def test_book_with_author_and_isbn(self):
        """A Book with a nested Person author and an ISBN keeps both fields."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "1984",
"author": {
"@type": "Person",
"name": "George Orwell"
},
"isbn": "978-0-452-28423-4"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["@type"] == "Book"
        assert book["name"] == "1984"
        assert book["isbn"] == "978-0-452-28423-4"
        # Only the presence of the nested author is checked, not its shape.
        assert "author" in book

    def test_book_with_description_and_url(self):
        """Description and URL strings are returned unchanged."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "To Kill a Mockingbird",
"description": "A gripping tale of racial injustice and childhood innocence",
"url": "https://example.com/books/mockingbird"
}
</script>
</head>
<body></body>
</html>
"""
        expected_description = (
            "A gripping tale of racial injustice and childhood innocence"
        )
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["name"] == "To Kill a Mockingbird"
        assert book["description"] == expected_description
        assert book["url"] == "https://example.com/books/mockingbird"

    def test_book_with_single_image(self):
        """A single image URL is returned as a plain string."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Pride and Prejudice",
"image": "https://example.com/covers/pride-prejudice.jpg"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["name"] == "Pride and Prejudice"
        assert book["image"] == "https://example.com/covers/pride-prejudice.jpg"
class TestBookDetails:
    """Book detail properties: format, page count, edition, publisher, date."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_book_with_book_format(self):
        """The bookFormat string is returned unchanged."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Hobbit",
"bookFormat": "Hardcover"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["bookFormat"] == "Hardcover"

    def test_book_with_number_of_pages(self):
        """A numeric numberOfPages compares equal to the integer 223."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Harry Potter and the Philosopher's Stone",
"numberOfPages": 223
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["numberOfPages"] == 223

    def test_book_with_book_edition(self):
        """The bookEdition string is returned unchanged."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Lord of the Rings",
"bookEdition": "50th Anniversary Edition"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["bookEdition"] == "50th Anniversary Edition"

    def test_book_with_publisher(self):
        """A nested Organization publisher is present on the extracted Book."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Dune",
"publisher": {
"@type": "Organization",
"name": "Chilton Books"
}
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["name"] == "Dune"
        # Only the presence of the nested publisher is checked.
        assert "publisher" in book

    def test_book_with_date_published(self):
        """The datePublished value is returned as the original date string."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Brave New World",
"datePublished": "1932-01-01"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["datePublished"] == "1932-01-01"
class TestBookLanguageAndGenre:
    """Book language and genre properties."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_book_with_in_language(self):
        """The inLanguage code is returned unchanged."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Les Misérables",
"inLanguage": "fr"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["inLanguage"] == "fr"

    def test_book_with_single_genre(self):
        """A single genre given as a string is returned as that string."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Foundation",
"genre": "Science Fiction"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["genre"] == "Science Fiction"

    def test_book_with_multiple_genres(self):
        """A genre given as an array is present on the extracted Book."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Martian",
"genre": ["Science Fiction", "Thriller", "Adventure"]
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        # Only the presence of the key is checked, not the list contents.
        assert "genre" in book
class TestBookContributors:
    """Book contributor properties: author (string or Person), illustrator."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_book_with_string_author(self):
        """An author given as a plain string is returned as that string."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Catcher in the Rye",
"author": "J.D. Salinger"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert book["author"] == "J.D. Salinger"

    def test_book_with_person_author(self):
        """An author given as a nested Person is present on the Book."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "A Brief History of Time",
"author": {
"@type": "Person",
"name": "Stephen Hawking"
}
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        # Only the presence of the nested author is checked.
        assert "author" in book

    def test_book_with_illustrator(self):
        """An illustrator given alongside an author is present on the Book."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Very Hungry Caterpillar",
"author": {
"@type": "Person",
"name": "Eric Carle"
},
"illustrator": {
"@type": "Person",
"name": "Eric Carle"
}
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        assert "illustrator" in book
class TestBookRatingsAndReviews:
    """Book rating, review, offer and abridged-flag properties."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_book_with_aggregate_rating(self):
        """A nested AggregateRating is present on the extracted Book."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Handmaid's Tale",
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "4.6",
"reviewCount": "15234"
}
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert "aggregateRating" in objects[0]

    def test_book_with_review(self):
        """A nested Review (with rating, author and body) is present."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Sapiens",
"review": {
"@type": "Review",
"reviewRating": {
"@type": "Rating",
"ratingValue": "5"
},
"author": {
"@type": "Person",
"name": "Book Reviewer"
},
"reviewBody": "An outstanding exploration of human history."
}
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert "review" in objects[0]

    def test_book_with_offers(self):
        """A nested Offer is present on the extracted Book."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Educated",
"offers": {
"@type": "Offer",
"price": "14.99",
"priceCurrency": "USD",
"availability": "https://schema.org/InStock"
}
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        assert "offers" in objects[0]

    def test_book_with_abridged(self):
        """A JSON ``false`` for ``abridged`` comes back as the boolean False."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "War and Peace",
"abridged": false
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        # Identity check: `== False` would also accept 0 or 0.0, hiding a
        # boolean that was converted to a number.
        assert objects[0]["abridged"] is False
class TestBookRealWorldExamples:
    """Fuller Book documents: a novel, a multi-author textbook, an audiobook."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_fiction_book_complete(self):
        """A novel with many properties keeps scalars and nested objects."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Name of the Wind",
"description": "The riveting first-person narrative of a young man who grows to be the most notorious magician his world has ever seen.",
"author": {
"@type": "Person",
"name": "Patrick Rothfuss"
},
"isbn": "978-0-7564-0474-1",
"bookFormat": "Hardcover",
"numberOfPages": 662,
"publisher": {
"@type": "Organization",
"name": "DAW Books"
},
"datePublished": "2007-03-27",
"inLanguage": "en",
"genre": ["Fantasy", "Adventure"],
"aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "4.5",
"reviewCount": "89234"
},
"offers": {
"@type": "Offer",
"price": "27.95",
"priceCurrency": "USD"
}
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        obj = objects[0]
        # Scalar properties are compared by value.
        assert obj["@type"] == "Book"
        assert obj["name"] == "The Name of the Wind"
        assert obj["isbn"] == "978-0-7564-0474-1"
        assert obj["bookFormat"] == "Hardcover"
        assert obj["numberOfPages"] == 662
        assert obj["datePublished"] == "2007-03-27"
        assert obj["inLanguage"] == "en"
        # Nested / array properties are only checked for presence.
        assert "author" in obj
        assert "publisher" in obj
        assert "genre" in obj
        assert "aggregateRating" in obj
        assert "offers" in obj

    def test_textbook_complete(self):
        """A textbook with an array of authors and an edition is extracted."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Introduction to Algorithms",
"description": "A comprehensive textbook on computer algorithms",
"author": [
{
"@type": "Person",
"name": "Thomas H. Cormen"
},
{
"@type": "Person",
"name": "Charles E. Leiserson"
}
],
"isbn": "978-0-262-03384-8",
"bookFormat": "Hardcover",
"numberOfPages": 1312,
"bookEdition": "Third Edition",
"publisher": {
"@type": "Organization",
"name": "MIT Press"
},
"datePublished": "2009-07-31",
"inLanguage": "en",
"genre": "Textbook"
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        obj = objects[0]
        assert obj["@type"] == "Book"
        assert obj["name"] == "Introduction to Algorithms"
        assert obj["isbn"] == "978-0-262-03384-8"
        assert obj["bookEdition"] == "Third Edition"
        assert obj["numberOfPages"] == 1312
        assert "author" in obj

    def test_audiobook(self):
        """An audiobook keeps its format and a boolean ``abridged`` flag."""
        html = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Becoming",
"author": {
"@type": "Person",
"name": "Michelle Obama"
},
"bookFormat": "AudioBook",
"isbn": "978-1-5247-6313-8",
"publisher": {
"@type": "Organization",
"name": "Random House Audio"
},
"datePublished": "2018-11-13",
"inLanguage": "en",
"abridged": false
}
</script>
</head>
<body></body>
</html>
"""
        objects = meta_oxide.extract_jsonld(html)
        assert len(objects) == 1
        obj = objects[0]
        assert obj["@type"] == "Book"
        assert obj["name"] == "Becoming"
        assert obj["bookFormat"] == "AudioBook"
        # Identity check: `== False` would also accept 0 or 0.0, hiding a
        # boolean that was converted to a number.
        assert obj["abridged"] is False
class TestBookEdgeCases:
    """Edge cases: null/empty fields, several scripts, and ``@graph``."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_book_empty_fields(self):
        """A null description and empty genre array do not block extraction."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Untitled Book",
"description": null,
"genre": []
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 1
        book = extracted[0]
        # Only the name is checked; the null/empty fields are not inspected.
        assert book["name"] == "Untitled Book"

    def test_multiple_books(self):
        """Two JSON-LD scripts yield two Books in document order."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Book One"
}
</script>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "Book Two"
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 2
        first_book = extracted[0]
        assert first_book["name"] == "Book One"
        second_book = extracted[1]
        assert second_book["name"] == "Book Two"

    def test_book_in_graph(self):
        """Both ``@graph`` members are returned and the Book can be found."""
        page = """
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@graph": [
{
"@type": "Book",
"name": "The Chronicles of Narnia"
},
{
"@type": "Person",
"name": "C.S. Lewis"
}
]
}
</script>
</head>
<body></body>
</html>
"""
        extracted = meta_oxide.extract_jsonld(page)
        assert len(extracted) == 2
        # Locate the Book by type rather than position within the results.
        book_entries = (item for item in extracted if item.get("@type") == "Book")
        book = next(book_entries)
        assert book["name"] == "The Chronicles of Narnia"
class TestBookIntegration:
    """Book JSON-LD as seen through the combined ``extract_all`` entry point."""

    # Fixture markup is kept flush-left so the literal handed to the
    # extractor stays exactly as it was.

    def test_extract_all_includes_book(self):
        """``extract_all`` exposes the Book under the ``jsonld`` key."""
        page = """
<html>
<head>
<title>Book Page</title>
<meta property="og:title" content="Amazing Book">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Book",
"name": "The Great Novel",
"author": {
"@type": "Person",
"name": "Famous Author"
},
"isbn": "978-1-234-56789-0"
}
</script>
</head>
<body></body>
</html>
"""
        result = meta_oxide.extract_all(page)
        # Confirm the key exists before indexing into it.
        assert "jsonld" in result
        assert len(result["jsonld"]) == 1
        book = result["jsonld"][0]
        assert book["@type"] == "Book"
        assert book["name"] == "The Great Novel"
        assert book["isbn"] == "978-1-234-56789-0"
if __name__ == "__main__":
    # Run this file's tests verbosely when executed as a script, and pass
    # pytest's exit code on so a failing run does not exit with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))