import os
import sys
# Put the directory one level above this file at the front of the module
# search path before attempting to import meta_oxide.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(__file__), ".."),
)

# meta_oxide is the module under test; when it cannot be imported, report
# that on stdout and terminate with exit status 1.
try:
    import meta_oxide
except ImportError:
    print("ERROR: meta_oxide module not found")
    raise SystemExit(1)
class TestMalformedHTML:
    """Return-type checks for the extractors when fed malformed markup.

    Every test only verifies the container type the extractor hands back
    (``dict`` for ``extract_all``/``extract_meta``, ``list`` for
    ``extract_jsonld``); a raised exception fails the test.
    """

    def test_unclosed_tags(self):
        """``extract_all`` returns a dict when ``<title>`` is never closed."""
        markup = """
<html>
<head>
<title>Test Page
</head>
</html>
"""
        extracted = meta_oxide.extract_all(markup)
        assert isinstance(extracted, dict)

    def test_deeply_nested_tags(self):
        """``extract_all`` returns a dict for 100 nested ``<div>`` levels."""
        depth = 100
        markup = "".join(["<div>" * depth, "Content", "</div>" * depth])
        assert isinstance(meta_oxide.extract_all(markup), dict)

    def test_malformed_meta_tags(self):
        """``extract_meta`` returns a dict for ``<meta>`` tags without content."""
        markup = """
<html>
<head>
<meta name="description">
<meta property="og:title">
<meta>
</head>
</html>
"""
        extracted = meta_oxide.extract_meta(markup)
        assert isinstance(extracted, dict)

    def test_broken_json_ld(self):
        """``extract_jsonld`` returns a list for JSON-LD missing a comma."""
        markup = """
<html>
<head>
<script type="application/ld+json">
{
"name": "Test"
"missing_comma": true
}
</script>
</head>
</html>
"""
        extracted = meta_oxide.extract_jsonld(markup)
        assert isinstance(extracted, list)

    def test_invalid_json_ld_syntax(self):
        """``extract_jsonld`` returns a list when the script body is not JSON."""
        markup = """
<html>
<head>
<script type="application/ld+json">
This is not JSON at all
</script>
</head>
</html>
"""
        extracted = meta_oxide.extract_jsonld(markup)
        assert isinstance(extracted, list)

    def test_script_injection_in_content(self):
        """``extract_meta`` returns a dict when a script string holds a meta tag."""
        markup = """
<html>
<head>
<title>Test</title>
<script>
var title = '<meta property="og:title" content="hacked">';
</script>
</head>
</html>
"""
        extracted = meta_oxide.extract_meta(markup)
        assert isinstance(extracted, dict)

    def test_mixed_quotes_in_attributes(self):
        """``extract_meta`` returns a dict when values embed the other quote."""
        markup = """
<html>
<head>
<meta name="description" content="It's a test's description">
<meta property="og:title" content='He said "hello"'>
</head>
</html>
"""
        extracted = meta_oxide.extract_meta(markup)
        assert isinstance(extracted, dict)
class TestEdgeCases:
    """Edge-case inputs: empty, whitespace-only, very large, and non-ASCII.

    Unless noted otherwise each test only checks the container type that the
    extractor returns; a raised exception fails the test.
    """

    def test_empty_html(self):
        """``extract_all`` returns a dict for the empty string."""
        result = meta_oxide.extract_all("")
        assert isinstance(result, dict)

    def test_none_like_empty_strings(self):
        """``extract_all`` returns a dict for empty and single-whitespace input."""
        for html in ["", " ", "\n", "\t"]:
            result = meta_oxide.extract_all(html)
            assert isinstance(result, dict)

    def test_html_with_only_whitespace(self):
        """``extract_all`` returns a dict for a mixed-whitespace document."""
        html = " \n\n\t \n "
        result = meta_oxide.extract_all(html)
        assert isinstance(result, dict)

    def test_very_long_attribute_values(self):
        """``extract_meta`` returns a dict for a 10,000-character content value."""
        long_content = "x" * 10000
        html = f'<meta name="description" content="{long_content}">'
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)

    def test_very_long_html_document(self):
        """``extract_meta`` returns a dict for a head holding 1000 meta tags."""
        # Build the tag run with one join instead of repeated concatenation.
        meta_tags = "".join(
            f'<meta name="test{i}" content="value{i}">' for i in range(1000)
        )
        html = f"<html><head>{meta_tags}</head></html>"
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)

    def test_html_with_unicode_content(self):
        """``extract_meta`` returns a dict with a ``title`` key for CJK text."""
        html = """
<html>
<head>
<title>测试页面 - テスト - 검사</title>
<meta name="description" content="日本語、中文、한글の説明">
</head>
</html>
"""
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)
        assert "title" in result

    def test_html_with_emoji(self):
        """``extract_meta`` returns a dict when title and description hold emoji."""
        html = """
<html>
<head>
<title>Test 🎉 Page</title>
<meta name="description" content="We love 🚀 and 🎨">
</head>
</html>
"""
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)

    def test_html_with_html_entities(self):
        """``extract_meta`` returns a dict for markup using character references."""
        # The special characters are written as character references on
        # purpose: raw ``&``, ``<``, ``>`` and quotes here would produce
        # malformed markup rather than exercising entity handling.
        html = """
<html>
<head>
<title>Test &amp; Demo &lt;Page&gt;</title>
<meta name="description" content="&quot;Quoted&quot; &#39;content&#39;">
</head>
</html>
"""
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)

    def test_html_with_cdata_sections(self):
        """``extract_jsonld`` returns a list when JSON-LD is wrapped in CDATA."""
        html = """
<html>
<head>
<script type="application/ld+json">
<![CDATA[
{"@type": "Article"}
]]>
</script>
</head>
</html>
"""
        result = meta_oxide.extract_jsonld(html)
        assert isinstance(result, list)

    def test_html_with_comments(self):
        """``extract_meta`` returns a dict when meta tags sit inside comments."""
        html = """
<html>
<head>
<!-- <meta name="fake" content="content"> -->
<title>Real Title</title>
<!-- <meta property="og:title" content="fake"> -->
</head>
</html>
"""
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)
class TestInvalidInput:
    """Return-type checks for input that is not HTML at all.

    Each test passes a non-HTML payload to an extractor and verifies only
    that a dict comes back; a raised exception fails the test.
    """

    def test_non_html_text(self):
        """``extract_all`` returns a dict for a plain English sentence."""
        plain = "This is just plain text, not HTML at all."
        assert isinstance(meta_oxide.extract_all(plain), dict)

    def test_xml_input(self):
        """``extract_all`` returns a dict for a small XML document."""
        xml_doc = """<?xml version="1.0"?>
<root>
<item>Test</item>
</root>
"""
        extracted = meta_oxide.extract_all(xml_doc)
        assert isinstance(extracted, dict)

    def test_json_input(self):
        """``extract_all`` returns a dict for a bare JSON object string."""
        payload = '{"title": "Test", "description": "Test"}'
        assert isinstance(meta_oxide.extract_all(payload), dict)

    def test_binary_like_strings(self):
        """``extract_meta`` returns a dict when the title embeds a NUL character."""
        markup = "<html><head><title>Test\x00Title</title></head></html>"
        extracted = meta_oxide.extract_meta(markup)
        assert isinstance(extracted, dict)
class TestInvalidURLs:
    """Checks for ``extract_meta`` when ``base_url`` is unusual or invalid.

    For the two clearly invalid base URLs the extractor may either raise or
    return a dict. The call alone sits inside ``try`` and the type assertion
    sits outside it, so a wrong return type fails the test instead of being
    swallowed, and ``KeyboardInterrupt``/``SystemExit`` are not caught.
    """

    def test_invalid_base_url_format(self):
        """A non-URL ``base_url`` either raises or yields a dict."""
        html = '<link rel="canonical" href="/page">'
        try:
            result = meta_oxide.extract_meta(html, base_url="not a url")
        except Exception:
            # Rejecting the base URL with an error is an accepted outcome.
            return
        assert isinstance(result, dict)

    def test_relative_base_url(self):
        """A relative-path ``base_url`` either raises or yields a dict."""
        html = '<link rel="canonical" href="/page">'
        try:
            result = meta_oxide.extract_meta(html, base_url="/relative/path")
        except Exception:
            # Rejecting the base URL with an error is an accepted outcome.
            return
        assert isinstance(result, dict)

    def test_empty_base_url(self):
        """An empty ``base_url`` must not raise and must yield a dict."""
        html = '<link rel="canonical" href="/page">'
        result = meta_oxide.extract_meta(html, base_url="")
        assert isinstance(result, dict)

    def test_special_characters_in_base_url(self):
        """A ``base_url`` with a query string must yield a dict."""
        html = '<link rel="canonical" href="/page">'
        result = meta_oxide.extract_meta(
            html, base_url="https://example.com/path?query=value&other=123"
        )
        assert isinstance(result, dict)
class TestExtractAllRobustness:
    """Checks that one broken format does not prevent extraction of the rest."""

    def test_extract_all_with_broken_formats_mixed(self):
        """``extract_all`` still reports ``meta`` when the JSON-LD is broken."""
        html = """
<html>
<head>
<title>Valid Title</title>
<meta property="og:title" content="Valid OG">
<script type="application/ld+json">
{BROKEN JSON HERE}
</script>
</head>
<body>
<div class="h-card">
<span class="p-name">John</span>
</div>
</body>
</html>
"""
        result = meta_oxide.extract_all(html)
        # Check the type first so a wrong return type is reported as such
        # rather than as a confusing membership failure.
        assert isinstance(result, dict)
        assert "meta" in result

    def test_extract_all_with_multiple_formats_empty(self):
        """``extract_all`` returns a dict for a document with empty head and body."""
        html = """
<html>
<head></head>
<body></body>
</html>
"""
        result = meta_oxide.extract_all(html)
        assert isinstance(result, dict)

    def test_extract_all_graceful_degradation(self):
        """Valid title/Open Graph data survives an unparsable JSON-LD block."""
        html = """
<html>
<head>
<title>Title Works</title>
<meta property="og:title" content="OG Works">
<script type="application/ld+json">BROKEN</script>
</head>
</html>
"""
        result = meta_oxide.extract_all(html)
        assert isinstance(result, dict)
        # The previous assertion ended in ``or len(result) >= 0``, which is
        # always true and made the whole check vacuous. At least one of the
        # valid formats must actually be present in the result.
        assert "meta" in result or "opengraph" in result

    def test_microformats_with_missing_required_properties(self):
        """``extract_hcard`` returns a list for an h-card without ``p-name``."""
        html = """
<div class="h-card">
<!-- missing p-name which is typically required -->
<a class="u-url" href="https://example.com">Link</a>
</div>
"""
        result = meta_oxide.extract_hcard(html)
        assert isinstance(result, list)

    def test_microdata_with_incomplete_schema(self):
        """``extract_microdata`` returns a list for a Person with only a name."""
        html = """
<div itemscope itemtype="https://schema.org/Person">
<!-- missing many properties -->
<span itemprop="name">John</span>
</div>
"""
        result = meta_oxide.extract_microdata(html)
        assert isinstance(result, list)
class TestMemoryAndPerformance:
    """Large-input checks; each verifies only the returned container type."""

    def test_extract_large_attribute_value(self):
        """``extract_meta`` returns a dict for a 100,000-character content value."""
        # These were two assignments fused onto one line (a SyntaxError).
        large_value = "x" * 100000
        html = f'<meta name="test" content="{large_value}">'
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)

    def test_extract_many_meta_tags(self):
        """``extract_meta`` returns a dict for a head holding 5000 meta tags."""
        # Build the tag run with one join instead of repeated concatenation.
        meta_tags = "".join(
            f'<meta name="tag{i}" content="value{i}">' for i in range(5000)
        )
        html = f"<html><head>{meta_tags}</head></html>"
        result = meta_oxide.extract_meta(html)
        assert isinstance(result, dict)

    def test_extract_many_microformats(self):
        """``extract_hcard`` returns a list for a body holding 500 h-cards."""
        cards = "".join(
            f'<div class="h-card"><span class="p-name">Person {i}</span></div>'
            for i in range(500)
        )
        html = f"<html><body>{cards}</body></html>"
        result = meta_oxide.extract_hcard(html)
        assert isinstance(result, list)
if __name__ == "__main__":
    # Smoke-run a handful of the tests above without a test runner.
    print("Running error handling tests...\n")

    # (test class, method name, label, success wording). Each check gets its
    # own fresh instance so no check depends on a variable left behind by an
    # earlier one.
    smoke_checks = [
        (TestMalformedHTML, "test_unclosed_tags", "Unclosed tags", "handled gracefully"),
        (TestEdgeCases, "test_empty_html", "Empty HTML", "handled gracefully"),
        (
            TestEdgeCases,
            "test_html_with_unicode_content",
            "Unicode content",
            "handled correctly",
        ),
        (TestInvalidInput, "test_non_html_text", "Non-HTML text", "handled gracefully"),
    ]

    failures = 0
    for test_class, method_name, label, outcome in smoke_checks:
        try:
            getattr(test_class(), method_name)()
        except Exception as e:
            failures += 1
            print(f"✗ {label}: {e}")
        else:
            print(f"✓ {label} {outcome}")

    print("\nError handling tests completed!")

    # Signal failure to the caller (shell, CI) instead of always exiting 0.
    if failures:
        sys.exit(1)