import json
import sys
import time
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Directory holding the CSV fixtures: ``tests/fixtures`` under the directory
# two levels above the one that contains this file (after resolving symlinks).
FIXTURES_DIR = Path(__file__).resolve().parent.parent.parent / "tests" / "fixtures"
def _load_csv_features(name: str) -> np.ndarray:
    """Load a numeric feature matrix from a CSV fixture.

    The first row of the file is treated as a header and discarded; every
    remaining non-empty row is converted to a list of floats.

    Args:
        name: File name of the fixture, resolved relative to ``FIXTURES_DIR``.

    Returns:
        An array built from the parsed rows, one entry per CSV record. A file
        with no data rows yields an empty array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a cell cannot be parsed as a float.
    """
    import csv

    path = FIXTURES_DIR / name
    # newline="" is what the csv module asks for so that it can handle line
    # endings (and newlines embedded in quoted fields) itself.
    with open(path, newline="") as f:
        reader = csv.reader(f)
        # Skip the header row; the default keeps an empty file from leaking
        # a bare StopIteration to the caller.
        next(reader, None)
        # csv.reader yields [] for blank lines; dropping them keeps the
        # result rectangular instead of ragged.
        rows = [[float(v) for v in row] for row in reader if row]
    return np.array(rows)
def _load_csv_target(name: str) -> np.ndarray:
    """Load a numeric target vector from a CSV fixture.

    The first row of the file is treated as a header and discarded; for every
    remaining non-empty row only the first column is read and converted to
    float.

    Args:
        name: File name of the fixture, resolved relative to ``FIXTURES_DIR``.

    Returns:
        A one-dimensional array with one value per CSV record. A file with no
        data rows yields an empty array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a first-column cell cannot be parsed as a float.
    """
    import csv

    path = FIXTURES_DIR / name
    # newline="" is what the csv module asks for so that it can handle line
    # endings (and newlines embedded in quoted fields) itself.
    with open(path, newline="") as f:
        reader = csv.reader(f)
        # Skip the header row; the default keeps an empty file from leaking
        # a bare StopIteration to the caller.
        next(reader, None)
        # csv.reader yields [] for blank lines, on which row[0] would raise
        # IndexError, so those rows are skipped.
        values = [float(row[0]) for row in reader if row]
    return np.array(values)
def load_california():
    """Load the California Housing fixtures and return a scaled 80/20 split.

    Features and target are read from the CSV fixtures, split with
    ``test_size=0.2`` and ``random_state=42``, and then standardised. The
    scaler is fitted on the training features only and applied to both
    feature sets; the targets are returned unscaled.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = _load_csv_features("california_features.csv")
    target = _load_csv_target("california_target.csv")

    split = train_test_split(features, target, test_size=0.2, random_state=42)
    train_features, test_features, y_train, y_test = split

    # Fit on the training portion only, then reuse those statistics for test.
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(train_features)
    test_scaled = standardizer.transform(test_features)

    return train_scaled, test_scaled, y_train, y_test
def get_regressors():
    """Build the set of regressors exercised by the benchmark.

    Returns:
        A dict mapping a short model name to a newly constructed, unfitted
        estimator. Insertion order is the order in which the benchmark
        sections iterate and report the models.
    """
    regressors = {}
    regressors["linear_regression"] = LinearRegression()
    regressors["lasso"] = Lasso(alpha=0.01, max_iter=1000, random_state=42)
    regressors["elastic_net"] = ElasticNet(
        alpha=0.01, l1_ratio=0.5, max_iter=1000, random_state=42
    )
    regressors["knn_regressor"] = KNeighborsRegressor(n_neighbors=5)
    regressors["gradient_boosting"] = GradientBoostingRegressor(
        n_estimators=50, max_depth=5, learning_rate=0.1, random_state=42
    )
    regressors["ridge"] = Ridge(alpha=1.0)
    return regressors
def time_fn(fn, n_runs=5, warmup=2):
    """Return the median wall-clock duration of ``fn()`` in microseconds.

    ``fn`` is first called ``warmup`` times without being timed, then
    ``n_runs`` times with each call measured individually using
    ``time.perf_counter_ns``.

    Args:
        fn: Zero-argument callable to measure.
        n_runs: Number of timed calls; must be at least 1.
        warmup: Number of untimed calls made before measuring.

    Returns:
        The median per-call duration in microseconds. For an even ``n_runs``
        this is the mean of the two middle samples.

    Raises:
        ValueError: If ``n_runs`` is less than 1.
    """
    import statistics

    # Fail fast with a clear message instead of an IndexError on an empty
    # sample list after the warm-up work has already been done.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    for _ in range(warmup):
        fn()

    times = []
    for _ in range(n_runs):
        start = time.perf_counter_ns()
        fn()
        # perf_counter_ns reports nanoseconds; convert to microseconds.
        times.append((time.perf_counter_ns() - start) / 1000)

    # statistics.median averages the two middle values for even sample
    # counts rather than silently picking the upper one.
    return statistics.median(times)
def prediction_latency(model, X, n_iters=10_000):
    """Measure single-row ``model.predict`` latency percentiles.

    ``n_iters`` row indices are drawn from a generator seeded with 42, and
    for each one a one-row slice of ``X`` is passed to ``model.predict``
    while the call is timed with ``time.perf_counter_ns``.

    Args:
        model: Object exposing a ``predict`` method.
        X: Row-sliceable collection of samples supporting ``len``.
        n_iters: Number of timed predictions.

    Returns:
        A dict with keys ``p50_us``, ``p95_us`` and ``p99_us`` holding the
        corresponding latencies in microseconds.
    """
    rng = np.random.default_rng(42)
    indices = rng.integers(0, len(X), size=n_iters)

    def timed_predict(idx):
        # Slice outside the timed region so only predict() is measured.
        row = X[idx : idx + 1]
        start = time.perf_counter_ns()
        model.predict(row)
        return (time.perf_counter_ns() - start) / 1000

    ordered = sorted(timed_predict(idx) for idx in indices)
    n = len(ordered)

    latency = {}
    latency["p50_us"] = ordered[n // 2]
    latency["p95_us"] = ordered[int(n * 0.95)]
    latency["p99_us"] = ordered[int(n * 0.99)]
    return latency
def fmt_time(us: float) -> str:
    """Render a duration given in microseconds with a human-friendly unit.

    Values below 1 are shown in nanoseconds, below 1000 in microseconds,
    below 1,000,000 in milliseconds, and everything else in seconds.

    Args:
        us: Duration in microseconds.

    Returns:
        The formatted duration followed by its unit.
    """
    if us < 1:
        value, digits, unit = us * 1000, 1, "ns"
    elif us < 1000:
        value, digits, unit = us, 1, "µs"
    elif us < 1_000_000:
        value, digits, unit = us / 1000, 2, "ms"
    else:
        value, digits, unit = us / 1_000_000, 2, "s"
    return f"{value:.{digits}f} {unit}"
def run_regression_metrics():
    """Fit every regressor and report R², RMSE and MAE on the test split.

    A table is printed to stdout, one row per model. A model that raises at
    any step is reported as ``FAILED`` and its error message is stored in
    place of its metrics.

    Returns:
        A dict keyed by model name. Each value is either a dict with ``r2``,
        ``rmse`` and ``mae`` rounded to six decimals, or ``{"error": <msg>}``.
    """
    rule = "=" * 72
    print("\n" + rule)
    print(" SECTION 1: Regression Metrics — California Housing (80/20 split)")
    print(rule)

    X_train, X_test, y_train, y_test = load_california()

    print(f"\n {'Model':<25} {'R²':>8} {'RMSE':>8} {'MAE':>8}")
    print(" " + "-" * 52)

    results = {}
    for name, model in get_regressors().items():
        try:
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            r2 = r2_score(y_test, preds)
            rmse = root_mean_squared_error(y_test, preds)
            mae = mean_absolute_error(y_test, preds)
            print(f" {name:<25} {r2:>8.4f} {rmse:>8.4f} {mae:>8.4f}")

            # Cast to plain floats so the values serialise cleanly to JSON.
            scores = {"r2": r2, "rmse": rmse, "mae": mae}
            results[name] = {
                metric: round(float(score), 6) for metric, score in scores.items()
            }
        except Exception as e:
            # Best effort: one failing model must not abort the whole table.
            print(f" {name:<25} {'FAILED':>8} {e}")
            results[name] = {"error": str(e)}
    return results
def run_training_benchmarks():
    """Time how long each regressor takes to fit on the training split.

    For every model a fresh estimator with the same parameters is constructed
    and fitted inside the timed callable, and the median duration reported by
    ``time_fn`` is printed and recorded. A model that raises is reported as
    ``FAILED`` and its error message is stored in the results.

    Returns:
        A dict keyed by model name. Each value is either a dict with
        ``median_us`` (rounded to two decimals) and ``n_samples``, or
        ``{"error": <msg>}``.
    """
    print("\n" + "=" * 72)
    print(" SECTION 2: Training Time (California Housing, full train set)")
    print("=" * 72)

    X_train, _, y_train, _ = load_california()
    results = {}

    print(f"\n {'Model':<25} {'Median Time':>12}")
    print(" " + "-" * 40)

    for name, model in get_regressors().items():
        try:
            model_cls = model.__class__
            params = model.get_params()

            # Bind the class and params as defaults so the callable does not
            # depend on the loop variables it was defined next to.
            def train(model_cls=model_cls, params=params):
                # Rebuild the estimator on every call so each timed run
                # includes construction and a fit from scratch.
                m = model_cls(**params)
                m.fit(X_train, y_train)

            median_us = time_fn(train)
            print(f" {name:<25} {fmt_time(median_us):>12}")
            results[name] = {
                "median_us": round(median_us, 2),
                "n_samples": len(X_train),
            }
        except Exception as e:
            # Best effort: one failing model must not abort the section.
            print(f" {name:<25} {'FAILED':>12} {e}")
            # Record the failure, as run_regression_metrics does, so the
            # model does not silently disappear from the saved results.
            results[name] = {"error": str(e)}
    return results
def run_prediction_benchmarks():
    """Measure single-row prediction latency for each fitted regressor.

    Every model is fitted on the training split and then passed to
    ``prediction_latency`` with the test features and ``n_iters=5_000``. The
    p50/p95/p99 values are printed and recorded. A model that raises is
    reported as ``FAILED`` and its error message is stored in the results.

    Returns:
        A dict keyed by model name. Each value is either the dict returned by
        ``prediction_latency`` or ``{"error": <msg>}``.
    """
    print("\n" + "=" * 72)
    print(" SECTION 3: Single-Row Prediction Latency")
    print("=" * 72)

    X_train, X_test, y_train, _ = load_california()
    results = {}

    print(f"\n {'Model':<25} {'p50':>10} {'p95':>10} {'p99':>10}")
    print(" " + "-" * 58)

    for name, model in get_regressors().items():
        try:
            model.fit(X_train, y_train)
            latency = prediction_latency(model, X_test, n_iters=5_000)
            print(f" {name:<25} {fmt_time(latency['p50_us']):>10} "
                  f"{fmt_time(latency['p95_us']):>10} {fmt_time(latency['p99_us']):>10}")
            results[name] = latency
        except Exception as e:
            # Best effort: one failing model must not abort the section.
            print(f" {name:<25} {'FAILED':>10} {e}")
            # Record the failure, as run_regression_metrics does, so the
            # model does not silently disappear from the saved results.
            results[name] = {"error": str(e)}
    return results
def main():
    """Run all three benchmark sections and save the results as JSON.

    Prints a header with the scikit-learn and NumPy versions, runs the
    metrics, training-time and prediction-latency sections in that order, and
    writes the combined results to ``sklearn_regression_results.json`` in the
    directory given by this file's path.
    """
    import sklearn

    rule = "=" * 72
    print(rule)
    print(f" scikit-learn Regression Benchmark — v{sklearn.__version__}")
    print(f" NumPy {np.__version__}")
    print(" Dataset: California Housing (20640 samples, 8 features)")
    print(rule)

    # Dict values are evaluated top to bottom, so the sections run in order.
    all_results = {
        "regression_metrics": run_regression_metrics(),
        "training_time": run_training_benchmarks(),
        "prediction_latency": run_prediction_benchmarks(),
    }

    out_path = Path(__file__).parent / "sklearn_regression_results.json"
    with open(out_path, "w") as f:
        json.dump(all_results, f, indent=2)
    print(f"\n✓ Results saved to {out_path}")
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()