{
"cells": [
{
"cell_type": "markdown",
"id": "55b13622",
"metadata": {},
"source": [
"# Linear Regression Validation\n",
"\n",
"This notebook uses scikit-learn's `LinearRegression` as a reference implementation to validate the linear regression model of the `rust-ml` library. The reference model is trained on the same dataset as the `rust-ml` model so that the results can be compared."
]
},
{
"cell_type": "code",
"id": "34f2131a",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import sklearn\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "46053fd2",
"metadata": {
"jupyter": {
"is_executing": true
}
},
"source": [
"import os\n",
"\n",
"# Read the advertising CSV into a DataFrame and show summary statistics.\n",
"DATASET_PATH = '../datasets/advertising.csv'\n",
"df = pd.read_csv(DATASET_PATH)\n",
"df.describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"execution_count": 140,
"id": "5d71b5cd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>TV</th>\n",
" <th>Radio</th>\n",
" <th>Newspaper</th>\n",
" <th>Sales</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2.000000e+02</td>\n",
" <td>2.000000e+02</td>\n",
" <td>2.000000e+02</td>\n",
" <td>2.000000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.287859e-16</td>\n",
" <td>-4.263256e-16</td>\n",
" <td>2.309264e-16</td>\n",
" <td>-2.842171e-16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-1.704546e+00</td>\n",
" <td>-1.566936e+00</td>\n",
" <td>-1.389161e+00</td>\n",
" <td>-2.560707e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>-8.464055e-01</td>\n",
" <td>-8.950745e-01</td>\n",
" <td>-8.174990e-01</td>\n",
" <td>-7.817154e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>3.153601e-02</td>\n",
" <td>-2.451705e-02</td>\n",
" <td>-2.205833e-01</td>\n",
" <td>1.645567e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>8.360974e-01</td>\n",
" <td>8.931886e-01</td>\n",
" <td>6.679027e-01</td>\n",
" <td>7.417827e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.739664e+00</td>\n",
" <td>1.773849e+00</td>\n",
" <td>3.831556e+00</td>\n",
" <td>2.246355e+00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" TV Radio Newspaper Sales\n",
"count 2.000000e+02 2.000000e+02 2.000000e+02 2.000000e+02\n",
"mean 1.287859e-16 -4.263256e-16 2.309264e-16 -2.842171e-16\n",
"std 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00\n",
"min -1.704546e+00 -1.566936e+00 -1.389161e+00 -2.560707e+00\n",
"25% -8.464055e-01 -8.950745e-01 -8.174990e-01 -7.817154e-01\n",
"50% 3.153601e-02 -2.451705e-02 -2.205833e-01 1.645567e-01\n",
"75% 8.360974e-01 8.931886e-01 6.679027e-01 7.417827e-01\n",
"max 1.739664e+00 1.773849e+00 3.831556e+00 2.246355e+00"
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Standardize every column: subtract the column mean and divide by the\n",
"# column standard deviation, then display the resulting summary statistics.\n",
"column_means = df.mean(axis=0)\n",
"column_stds = df.std(axis=0)\n",
"df = (df - column_means) / column_stds\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 141,
"id": "02a07902",
"metadata": {},
"outputs": [],
"source": [
"# Split the dataset into training (80%) and testing (20%) sets.\n",
"# A fixed random_state makes the shuffle deterministic, so the fitted\n",
"# coefficients and the metrics reported below are the same on every\n",
"# Restart & Run All and can be compared from run to run.\n",
"X = df[[\"TV\", \"Radio\", \"Newspaper\"]].values\n",
"y = df[\"Sales\"].values\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, train_size=0.8, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 148,
"id": "64ab0fce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time: 0.0005171298980712891\n"
]
}
],
"source": [
"# Fit a linear regression model and measure how long training takes.\n",
"import time\n",
"\n",
"model = LinearRegression()\n",
"# perf_counter() is a monotonic, high-resolution clock meant for measuring\n",
"# short durations; time.time() follows the system wall clock, which can be\n",
"# adjusted mid-run and may be too coarse for a sub-millisecond fit.\n",
"start = time.perf_counter()\n",
"model.fit(X_train, y_train)\n",
"elapsed = time.perf_counter() - start\n",
"print(\"Training time:\", elapsed)"
]
},
{
"cell_type": "code",
"execution_count": 144,
"id": "aa58d88e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model coefficients: [0.87288215 0.32052642 0.0207583 ]\n",
"Model intercept: -0.0067498162449043525\n",
"Mean Squared Error: 0.8370477252121606\n"
]
}
],
"source": [
"\n",
"mse = model.score(X_test, y_test)\n",
"\n",
"print(\"Model coefficients:\", model.coef_)\n",
"print(\"Model intercept:\", model.intercept_)\n",
"print(\"Mean Squared Error:\", mse)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}