import argparse
import random
import string
import sys
from datetime import datetime, timedelta
TABLE_DEFINITIONS = [
{
"name": "users",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("email", "varchar(255) NOT NULL"),
("name", "varchar(100) DEFAULT NULL"),
("password_hash", "varchar(255) NOT NULL"),
("avatar_url", "varchar(500) DEFAULT NULL"),
("bio", "text"),
("created_at", "datetime DEFAULT NULL"),
("updated_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 50, },
{
"name": "posts",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("user_id", "bigint unsigned NOT NULL"),
("title", "varchar(255) NOT NULL"),
("slug", "varchar(300) NOT NULL"),
("content", "longtext"),
("excerpt", "text"),
("status", "enum('draft','published','archived') DEFAULT 'draft'"),
("published_at", "datetime DEFAULT NULL"),
("created_at", "datetime DEFAULT NULL"),
("updated_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 20,
},
{
"name": "comments",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("post_id", "bigint unsigned NOT NULL"),
("user_id", "bigint unsigned DEFAULT NULL"),
("parent_id", "bigint unsigned DEFAULT NULL"),
("body", "text NOT NULL"),
("author_name", "varchar(100) DEFAULT NULL"),
("author_email", "varchar(255) DEFAULT NULL"),
("is_approved", "tinyint(1) DEFAULT '0'"),
("created_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 100,
},
{
"name": "products",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("sku", "varchar(50) NOT NULL"),
("name", "varchar(255) NOT NULL"),
("description", "text"),
("price", "decimal(10,2) NOT NULL"),
("cost", "decimal(10,2) DEFAULT NULL"),
("quantity", "int DEFAULT '0'"),
("weight", "decimal(8,3) DEFAULT NULL"),
("is_active", "tinyint(1) DEFAULT '1'"),
("metadata", "json DEFAULT NULL"),
("created_at", "datetime DEFAULT NULL"),
("updated_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 30,
},
{
"name": "orders",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("user_id", "bigint unsigned NOT NULL"),
("order_number", "varchar(50) NOT NULL"),
("status", "enum('pending','processing','shipped','delivered','cancelled') DEFAULT 'pending'"),
("subtotal", "decimal(12,2) NOT NULL"),
("tax", "decimal(10,2) DEFAULT '0.00'"),
("shipping", "decimal(10,2) DEFAULT '0.00'"),
("total", "decimal(12,2) NOT NULL"),
("currency", "char(3) DEFAULT 'USD'"),
("notes", "text"),
("shipped_at", "datetime DEFAULT NULL"),
("created_at", "datetime DEFAULT NULL"),
("updated_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 40,
},
{
"name": "order_items",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("order_id", "bigint unsigned NOT NULL"),
("product_id", "bigint unsigned NOT NULL"),
("quantity", "int NOT NULL"),
("unit_price", "decimal(10,2) NOT NULL"),
("total_price", "decimal(12,2) NOT NULL"),
("created_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 100,
},
{
"name": "sessions",
"columns": [
("id", "varchar(255) NOT NULL"),
("user_id", "bigint unsigned DEFAULT NULL"),
("ip_address", "varchar(45) DEFAULT NULL"),
("user_agent", "text"),
("payload", "longtext NOT NULL"),
("last_activity", "int NOT NULL"),
],
"primary_key": "id",
"insert_batch_size": 50,
},
{
"name": "activity_logs",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("log_name", "varchar(100) DEFAULT NULL"),
("description", "text NOT NULL"),
("subject_type", "varchar(255) DEFAULT NULL"),
("subject_id", "bigint unsigned DEFAULT NULL"),
("causer_type", "varchar(255) DEFAULT NULL"),
("causer_id", "bigint unsigned DEFAULT NULL"),
("properties", "json DEFAULT NULL"),
("created_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 80,
},
{
"name": "cache",
"columns": [
("key", "varchar(255) NOT NULL"),
("value", "mediumtext NOT NULL"),
("expiration", "int NOT NULL"),
],
"primary_key": "key",
"insert_batch_size": 100,
},
{
"name": "media",
"columns": [
("id", "bigint unsigned NOT NULL AUTO_INCREMENT"),
("model_type", "varchar(255) NOT NULL"),
("model_id", "bigint unsigned NOT NULL"),
("collection_name", "varchar(255) NOT NULL"),
("name", "varchar(255) NOT NULL"),
("file_name", "varchar(255) NOT NULL"),
("mime_type", "varchar(100) DEFAULT NULL"),
("disk", "varchar(50) DEFAULT 'public'"),
("size", "bigint unsigned DEFAULT NULL"),
("manipulations", "json DEFAULT NULL"),
("custom_properties", "json DEFAULT NULL"),
("responsive_images", "json DEFAULT NULL"),
("created_at", "datetime DEFAULT NULL"),
("updated_at", "datetime DEFAULT NULL"),
],
"primary_key": "id",
"insert_batch_size": 30,
},
]
FIRST_NAMES = ["James", "Mary", "John", "Patricia", "Robert", "Jennifer", "Michael", "Linda", "William", "Elizabeth",
"David", "Barbara", "Richard", "Susan", "Joseph", "Jessica", "Thomas", "Sarah", "Charles", "Karen"]
LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
"Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin"]
WORDS = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit", "sed", "do", "eiusmod",
"tempor", "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua", "enim", "ad", "minim", "veniam"]
DOMAINS = ["gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "example.com", "test.org", "company.io"]
def random_string(length):
return ''.join(random.choices(string.ascii_lowercase, k=length))
def random_email():
name = f"{random.choice(FIRST_NAMES).lower()}.{random.choice(LAST_NAMES).lower()}{random.randint(1, 999)}"
return f"{name}@{random.choice(DOMAINS)}"
def random_name():
return f"{random.choice(FIRST_NAMES)} {random.choice(LAST_NAMES)}"
def random_text(min_words=10, max_words=50):
count = random.randint(min_words, max_words)
return ' '.join(random.choices(WORDS, k=count)).capitalize() + '.'
def random_datetime(start_year=2020):
start = datetime(start_year, 1, 1)
end = datetime.now()
delta = end - start
random_days = random.randint(0, delta.days)
return (start + timedelta(days=random_days)).strftime('%Y-%m-%d %H:%M:%S')
def random_ip():
return f"{random.randint(1, 255)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
def escape_sql(s):
if s is None:
return "NULL"
s = str(s)
return s.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n").replace("\r", "\\r")
def generate_row_value(table_name, col_name, col_type, row_id):
if "DEFAULT NULL" in col_type and random.random() < 0.1:
return None
col_lower = col_name.lower()
type_lower = col_type.lower()
if col_name == "id" and "auto_increment" in type_lower:
return row_id
if col_name.endswith("_id"):
return random.randint(1, max(1, row_id // 2))
if "email" in col_lower:
return random_email()
if col_lower in ("name", "author_name"):
return random_name()
if col_lower == "first_name":
return random.choice(FIRST_NAMES)
if col_lower == "last_name":
return random.choice(LAST_NAMES)
if "url" in col_lower:
return f"https://example.com/{random_string(10)}"
if "ip" in col_lower:
return random_ip()
if "datetime" in type_lower or col_lower.endswith("_at"):
return random_datetime()
if "tinyint" in type_lower:
return random.randint(0, 1)
if "json" in type_lower:
return '{"key": "value", "count": ' + str(random.randint(1, 100)) + '}'
if "decimal" in type_lower:
return round(random.uniform(1.0, 999.99), 2)
if "int" in type_lower and "bigint" not in type_lower:
return random.randint(1, 10000)
if "bigint" in type_lower:
return random.randint(1, 1000000)
if "enum" in type_lower:
import re
match = re.search(r"enum\(([^)]+)\)", type_lower)
if match:
values = [v.strip("'\"") for v in match.group(1).split(",")]
return random.choice(values)
if "longtext" in type_lower:
return random_text(100, 500)
if "mediumtext" in type_lower:
return random_text(50, 200)
if "text" in type_lower:
return random_text(20, 100)
if "varchar" in type_lower:
match = __import__('re').search(r"varchar\((\d+)\)", type_lower)
max_len = int(match.group(1)) if match else 100
if col_lower in ("title", "slug"):
return ' '.join(random.choices(WORDS, k=random.randint(3, 8))).title()[:max_len]
if col_lower == "sku":
return f"SKU-{random_string(6).upper()}-{random.randint(1000, 9999)}"
if col_lower == "order_number":
return f"ORD-{random.randint(100000, 999999)}"
if col_lower == "password_hash":
return f"$2y$10${random_string(50)}"
if col_lower == "user_agent":
return f"Mozilla/5.0 (compatible; Bot/{random.randint(1, 10)}.0)"
return random_string(min(max_len, random.randint(10, 50)))
if "char" in type_lower:
match = __import__('re').search(r"char\((\d+)\)", type_lower)
length = int(match.group(1)) if match else 3
return random_string(length).upper()
return random_string(20)
def write_header(f):
f.write(f"""-- MySQL dump 10.13 Distrib 8.0.40, for Linux (x86_64)
--
-- Host: localhost Database: benchmark
-- ------------------------------------------------------
-- Server version 8.0.40
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!50503 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
""")
def write_footer(f):
f.write("""
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on """ + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "\n")
def write_table_structure(f, table_def):
name = table_def["name"]
columns = table_def["columns"]
pk = table_def["primary_key"]
f.write(f"""--
-- Table structure for table `{name}`
--
DROP TABLE IF EXISTS `{name}`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `{name}` (
""")
col_defs = []
for col_name, col_type in columns:
col_defs.append(f" `{col_name}` {col_type}")
col_defs.append(f" PRIMARY KEY (`{pk}`)")
f.write(",\n".join(col_defs))
f.write("\n) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;\n")
f.write("/*!40101 SET character_set_client = @saved_cs_client */;\n\n")
def write_table_data(f, table_def, num_rows, progress_callback=None):
name = table_def["name"]
columns = table_def["columns"]
batch_size = table_def["insert_batch_size"]
f.write(f"""--
-- Dumping data for table `{name}`
--
LOCK TABLES `{name}` WRITE;
/*!40000 ALTER TABLE `{name}` DISABLE KEYS */;
""")
col_names = [c[0] for c in columns]
rows_written = 0
while rows_written < num_rows:
remaining = num_rows - rows_written
current_batch = min(batch_size, remaining)
f.write(f"INSERT INTO `{name}` (`{'`,`'.join(col_names)}`) VALUES\n")
row_strs = []
for i in range(current_batch):
row_id = rows_written + i + 1
values = []
for col_name, col_type in columns:
val = generate_row_value(name, col_name, col_type, row_id)
if val is None:
values.append("NULL")
elif isinstance(val, (int, float)):
values.append(str(val))
else:
values.append(f"'{escape_sql(val)}'")
row_strs.append(f"({','.join(values)})")
f.write(",\n".join(row_strs))
f.write(";\n")
rows_written += current_batch
if progress_callback:
progress_callback(rows_written)
f.write(f"/*!40000 ALTER TABLE `{name}` ENABLE KEYS */;\n")
f.write("UNLOCK TABLES;\n\n")
def generate_dump(output_file, target_size_mb, verbose=True):
target_bytes = target_size_mb * 1024 * 1024
rows_per_table = max(100, (target_bytes // len(TABLE_DEFINITIONS)) // 750)
if verbose:
print(f"Generating {target_size_mb}MB dump file: {output_file}")
print(f" Tables: {len(TABLE_DEFINITIONS)}")
print(f" Rows per table: ~{rows_per_table:,}")
with open(output_file, 'w', encoding='utf-8') as f:
write_header(f)
for table_def in TABLE_DEFINITIONS:
write_table_structure(f, table_def)
for i, table_def in enumerate(TABLE_DEFINITIONS):
if verbose:
print(f" [{i+1}/{len(TABLE_DEFINITIONS)}] Generating {table_def['name']}...", end=' ', flush=True)
write_table_data(f, table_def, rows_per_table)
if verbose:
print("done")
write_footer(f)
actual_size = __import__('os').path.getsize(output_file)
if verbose:
print(f"Generated: {actual_size / 1024 / 1024:.1f}MB")
return actual_size
def main():
parser = argparse.ArgumentParser(
description='Generate realistic mysqldump-format SQL files for benchmarking'
)
parser.add_argument('size', type=int, help='Target size in MB')
parser.add_argument('-o', '--output', help='Output file path')
parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
args = parser.parse_args()
output = args.output or f"/tmp/benchmark_{args.size}mb.sql"
generate_dump(output, args.size, verbose=not args.quiet)
print(f"Output: {output}")
if __name__ == '__main__':
main()