import json
import time
from pathlib import Path
import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_digits
from sklearn.model_selection import cross_val_score, StratifiedKFold
def gen_classification(n: int, n_features: int, seed: int = 42):
    """Build a synthetic, clearly separated two-class dataset.

    The first ``n // 2`` rows belong to class 0 and have every feature drawn
    uniformly from [0, 2). The remaining rows belong to class 1; their
    feature ``j`` is drawn uniformly from [0, 2) and then shifted upward by
    ``3.0 + 0.5 * j``.

    Args:
        n: Total number of rows to generate.
        n_features: Number of feature columns.
        seed: Seed for the NumPy random generator.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float array of shape
        ``(n, n_features)`` and ``y`` is an integer label array of length ``n``.
    """
    generator = np.random.default_rng(seed)
    split = n // 2
    n_upper = n - split

    features = np.zeros((n, n_features))
    labels = np.zeros(n, dtype=int)
    labels[split:] = 1

    # The draw order (lower half, then upper half, column by column) fixes
    # the generated values for a given seed, so it must not be rearranged.
    for col in range(n_features):
        features[:split, col] = 2.0 * generator.random(split)
        features[split:, col] = 2.0 * generator.random(n_upper) + (3.0 + 0.5 * col)
    return features, labels
# Benchmark datasets: display name -> scikit-learn loader callable.
DATASETS = dict(
    iris=load_iris,
    wine=load_wine,
    breast_cancer=load_breast_cancer,
    digits=load_digits,
)
def fmt_time(us: float) -> str:
    """Format a duration given in microseconds with a human-friendly unit.

    Values below 1 µs are shown in nanoseconds, below 1000 µs in
    microseconds, below 1,000,000 µs in milliseconds, and everything else
    in seconds.
    """
    if us < 1:
        value, digits, unit = us * 1000, 1, "ns"
    elif us < 1000:
        value, digits, unit = us, 1, "µs"
    elif us < 1_000_000:
        value, digits, unit = us / 1000, 2, "ms"
    else:
        value, digits, unit = us / 1_000_000, 2, "s"
    return f"{value:.{digits}f} {unit}"
def time_fn(fn, n_runs=5, warmup=2):
    """Time a zero-argument callable and return its median run time.

    Args:
        fn: Callable invoked with no arguments.
        n_runs: Number of timed invocations.
        warmup: Number of untimed invocations performed first.

    Returns:
        The median elapsed time in microseconds (the upper-middle sample
        when ``n_runs`` is even).
    """
    for _ in range(warmup):
        fn()

    def _timed_call_us():
        # Only the call to fn() sits between the two clock reads.
        began = time.perf_counter_ns()
        fn()
        return (time.perf_counter_ns() - began) / 1000

    samples = sorted(_timed_call_us() for _ in range(n_runs))
    return samples[len(samples) // 2]
def prediction_latency(model, X, n_iters=5_000):
    """Measure single-row ``model.predict`` latency percentiles.

    Rows are sampled from ``X`` (with replacement) using a generator seeded
    with 42, and each one-row slice is passed to ``model.predict`` while the
    call is timed.

    Args:
        model: Object exposing a ``predict`` method.
        X: Row-sliceable data supporting ``len`` and ``X[i : i + 1]``.
        n_iters: Number of timed predictions.

    Returns:
        Dict with keys ``p50_us``, ``p95_us`` and ``p99_us`` holding the
        sorted samples at positions ``n // 2``, ``int(n * 0.95)`` and
        ``int(n * 0.99)``, all in microseconds.
    """
    sampler = np.random.default_rng(42)
    row_ids = sampler.integers(0, len(X), size=n_iters)

    samples_us = []
    for i in row_ids:
        single_row = X[i : i + 1]  # slice outside the timed region
        began = time.perf_counter_ns()
        model.predict(single_row)
        finished = time.perf_counter_ns()
        samples_us.append((finished - began) / 1000)

    samples_us.sort()
    count = len(samples_us)
    positions = {
        "p50_us": count // 2,
        "p95_us": int(count * 0.95),
        "p99_us": int(count * 0.99),
    }
    return {label: samples_us[pos] for label, pos in positions.items()}
def make_lgb(n_classes):
    """Create the LightGBM classifier used by the accuracy benchmark.

    The model uses 100 estimators, max depth 6, learning rate 0.1, a fixed
    random state of 42, a single job and silenced logging. When there are
    more than two classes, the multiclass objective and the class count are
    passed explicitly.

    Args:
        n_classes: Number of distinct target classes.

    Returns:
        An unfitted ``lgb.LGBMClassifier``.
    """
    multiclass_extras = (
        {"objective": "multiclass", "num_class": n_classes} if n_classes > 2 else {}
    )
    return lgb.LGBMClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        n_jobs=1,
        verbose=-1,
        **multiclass_extras,
    )
def run_accuracy_benchmarks():
    """Run 5-fold stratified cross-validation on every entry in DATASETS.

    For each dataset the mean, standard deviation and per-fold accuracy are
    printed and recorded. If cross-validation raises, the failure is printed
    and stored under an ``"error"`` key so the remaining datasets still run.

    Returns:
        Dict mapping dataset name to either accuracy statistics
        (``mean_accuracy``, ``std_accuracy``, ``fold_scores``) or
        ``{"error": message}``.
    """
    banner = "=" * 72
    print("\n" + banner)
    print(" SECTION 1: LightGBM 5-Fold Stratified CV Accuracy")
    print(banner)

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    results = {}
    for ds_name, loader in DATASETS.items():
        bunch = loader()
        X = bunch.data
        y = bunch.target.astype(float)
        n_classes = len(np.unique(y))
        print(f"\n Dataset: {ds_name} ({X.shape[0]}×{X.shape[1]}, {n_classes} classes)")

        classifier = make_lgb(n_classes)
        try:
            scores = cross_val_score(classifier, X, y, cv=splitter, scoring="accuracy")
            mean_acc, std_acc = scores.mean(), scores.std()
            per_fold = ", ".join(f"{s:.3f}" for s in scores)
            print(f" LightGBM: {mean_acc:.4f} ± {std_acc:.4f} [{per_fold}]")
            entry = {
                "mean_accuracy": round(float(mean_acc), 6),
                "std_accuracy": round(float(std_acc), 6),
                "fold_scores": [round(float(s), 6) for s in scores],
            }
        except Exception as e:
            # Best-effort: one failing dataset must not abort the whole suite.
            print(f" LightGBM: FAILED — {e}")
            entry = {"error": str(e)}
        results[ds_name] = entry
    return results
def run_training_benchmarks():
    """Time LightGBM training on synthetic data of increasing size.

    For 1,000, 10,000 and 100,000 rows (10 features each) a classifier is
    fitted repeatedly via ``time_fn``, and the median fit time and derived
    throughput are printed and recorded.

    Returns:
        Dict keyed by the row count (as a string) with ``median_us`` and
        ``rows_per_sec`` for each size.
    """
    banner = "=" * 72
    print("\n" + banner)
    print(" SECTION 2: LightGBM Training Throughput")
    print(banner)

    results = {}
    for n in (1_000, 10_000, 100_000):
        X, y = gen_classification(n, 10)
        model = lgb.LGBMClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=1,
            verbose=-1,
        )

        def fit_once(model=model, X=X, y=y):
            return model.fit(X, y)

        median_us = time_fn(fit_once)
        if median_us > 0:
            rows_per_sec = n / (median_us / 1e6)
        else:
            rows_per_sec = 0
        print(f" {n:>7,} × 10: {fmt_time(median_us):>12} ({rows_per_sec:,.0f} rows/s)")
        results[str(n)] = {
            "median_us": round(median_us, 2),
            "rows_per_sec": round(rows_per_sec, 0),
        }
    return results
def run_prediction_benchmarks():
    """Measure single-row prediction latency of a fitted LightGBM model.

    A classifier is trained on a 1000×10 synthetic dataset and then timed
    on individual rows via ``prediction_latency``; the p50/p95/p99 latencies
    are printed.

    Returns:
        The dict produced by ``prediction_latency`` with keys ``p50_us``,
        ``p95_us`` and ``p99_us`` (microseconds).
    """
    print("\n" + "=" * 72)
    print(" SECTION 3: LightGBM Single-Row Prediction Latency")
    print("=" * 72)
    X, y = gen_classification(1000, 10)
    model = lgb.LGBMClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        # Pin to one thread like the other benchmark sections, so the
        # latency figures are measured under the same threading settings.
        n_jobs=1,
        verbose=-1,
    )
    model.fit(X, y)
    latency = prediction_latency(model, X)
    print(f" p50: {fmt_time(latency['p50_us']):>10} "
          f"p95: {fmt_time(latency['p95_us']):>10} "
          f"p99: {fmt_time(latency['p99_us']):>10}")
    return latency
def main():
    """Run all benchmark sections and save the results as JSON.

    Prints a header with the LightGBM and NumPy versions, runs the
    accuracy, training and prediction-latency sections in that order, and
    writes the combined results to ``lightgbm_results.json`` next to this
    file.
    """
    rule = "=" * 72
    print(rule)
    print(f" LightGBM Industry Benchmark — v{lgb.__version__}")
    print(f" NumPy {np.__version__}")
    print(rule)

    # Sections run in this order; the keys are written in the same order.
    all_results = {}
    all_results["accuracy_cv"] = run_accuracy_benchmarks()
    all_results["training"] = run_training_benchmarks()
    all_results["prediction_latency"] = run_prediction_benchmarks()

    out_path = Path(__file__).parent / "lightgbm_results.json"
    with open(out_path, "w") as f:
        json.dump(all_results, f, indent=2)
    print(f"\n✓ Results saved to {out_path}")
# Run the benchmark suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()