import json
import random
import string
import mmh3
def generate_test_strings(count=100, seed=None):
    """Build a list of strings for exercising hash implementations.

    The list always begins with a fixed set of edge cases: the empty string,
    each lowercase ASCII letter and digit as a one-character string, and a
    handful of hand-picked phrases. It is then padded with random strings
    (length 1 to 100, drawn from ASCII letters, digits, punctuation and the
    space character) until it holds ``count`` entries.

    Args:
        count: Desired total number of strings. The fixed cases are never
            dropped, so when ``count`` is smaller than their number the
            result contains just the fixed cases.
        seed: Optional seed for the random padding. With ``None`` (the
            default) the module-level ``random`` generator is used; any
            other value drives a private ``random.Random(seed)`` so the
            padding is reproducible and the global generator is untouched.

    Returns:
        A list of ``str``.
    """
    test_strings = [""]
    # Extending a list with a string adds each character as its own entry.
    test_strings.extend("abcdefghijklmnopqrstuvwxyz0123456789")
    test_strings.extend([
        "hello",
        "hello world",
        "Hello World",
        "aaaa",
        "0123456789",
        "abcdefghijklmnopqrstuvwxyz",
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
        "!@#$%^&*()_+-=[]{}|;:,.<>?/",
    ])

    # The ``random`` module and a ``random.Random`` instance expose the same
    # ``randint``/``choice`` API, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)
    # Loop-invariant: build the alphabet once rather than per string.
    alphabet = string.ascii_letters + string.digits + string.punctuation + " "

    # ``range`` of a non-positive number is empty, so no padding is added
    # when the fixed cases already meet or exceed ``count``.
    for _ in range(count - len(test_strings)):
        length = rng.randint(1, 100)
        test_strings.append(
            "".join(rng.choice(alphabet) for _ in range(length))
        )
    return test_strings
def calculate_mmh3_hashes(test_strings):
    """Compute MurmurHash3 reference values for each input string.

    For every string, the 32-bit hash (``mmh3.hash``) and the 128-bit hash
    (``mmh3.hash128`` with ``signed=False``) are computed with seeds 0 and 42.

    Args:
        test_strings: Iterable of ``str`` values to hash.

    Returns:
        A list with one dict per input, in input order, holding the original
        string (``input``), its UTF-8 bytes as a list of ints
        (``input_bytes``), and the four hash values. The 32-bit values are
        masked with ``0xFFFFFFFF`` so they are stored as non-negative ints.
    """
    mask32 = 0xFFFFFFFF
    records = []
    for text in test_strings:
        encoded = text.encode('utf-8')

        h32_seed0 = mmh3.hash(text, 0)
        h32_seed42 = mmh3.hash(text, 42)
        h128_seed0 = mmh3.hash128(text, 0, signed=False)
        h128_seed42 = mmh3.hash128(text, 42, signed=False)

        # Keys are inserted in the order they should appear in the output.
        record = {"input": text}
        record["input_bytes"] = list(encoded)
        record["murmur3_32_seed0"] = h32_seed0 & mask32
        record["murmur3_32_seed42"] = h32_seed42 & mask32
        record["murmur3_128_seed0"] = h128_seed0
        record["murmur3_128_seed42"] = h128_seed42
        records.append(record)
    return records
def main():
    """Generate a 200-entry test corpus and write it to disk as JSON.

    The corpus is written to ``data/mmh3_test_corpus.json``, relative to the
    current working directory. The ``data`` directory is created first if it
    does not already exist, so the script also works from a fresh checkout.
    """
    # Local import keeps this block self-contained; pathlib is stdlib.
    from pathlib import Path

    output_path = Path("data/mmh3_test_corpus.json")

    test_strings = generate_test_strings(200)
    results = calculate_mmh3_hashes(test_strings)

    # open(..., "w") does not create missing parent directories.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"Generated test corpus with {len(results)} entries.")
if __name__ == "__main__":
    # Run the generator only when executed as a script, not when imported.
    main()