from pandas.api.types import CategoricalDtype
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error
import argparse
import numpy as np
import pandas as pd
import json
# Command-line interface: the caller must say which backend trains the model.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--library',
    required=True,
    choices=['sklearn', 'pytorch'],
)
args = parser.parse_args()

# CSV files read for the train/test splits, and the name of the label column.
path_train, path_test = 'data/higgs_train.csv', 'data/higgs_test.csv'
target_column_name = 'signal'
# Explicit column dtypes: the 'signal' column is boolean, every other listed
# column is a 64-bit float.  Key order matches the listing order below.
dtype = {
    'signal': bool,
    **dict.fromkeys(
        (
            'lepton_pt',
            'lepton_eta',
            'lepton_phi',
            'missing_energy_magnitude',
            'missing_energy_phi',
            'jet_1_pt',
            'jet_1_eta',
            'jet_1_phi',
            'jet_1_b_tag',
            'jet_2_pt',
            'jet_2_eta',
            'jet_2_phi',
            'jet_2_b_tag',
            'jet_3_pt',
            'jet_3_eta',
            'jet_3_phi',
            'jet_3_b_tag',
            'jet_4_pt',
            'jet_4_eta',
            'jet_4_phi',
            'jet_4_b_tag',
            'm_jj',
            'm_jjj',
            'm_lv',
            'm_jlv',
            'm_bb',
            'm_wbb',
            'm_wwbb',
        ),
        np.float64,
    ),
}
# Read both CSV splits using the explicit per-column dtypes.
data_train = pd.read_csv(path_train, dtype=dtype)
data_test = pd.read_csv(path_test, dtype=dtype)

# The label is the single target column of each split ...
labels_train = data_train[target_column_name]
labels_test = data_test[target_column_name]

# ... and the features are every column whose name differs from the target.
features_train = data_train.loc[
    :, data_train.columns != target_column_name
]
features_test = data_test.loc[
    :, data_test.columns != target_column_name
]
# Both supported backends are fed the same preprocessed feature matrices.
if args.library in ('pytorch', 'sklearn'):
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import SGDRegressor
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    # Only float64/int64 columns are routed through the numeric pipeline.
    numeric_features = features_train.select_dtypes(
        include=[np.float64, np.int64],
    ).columns

    # Median-impute first, then apply StandardScaler.
    numeric_transformer = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ],
    )
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features)],
    )

    # Fit on the training split only; the test split is transformed with the
    # already-fitted preprocessor.
    features_train = preprocessor.fit_transform(features_train)
    features_test = preprocessor.transform(features_test)
# Train a linear binary classifier with the backend chosen on the command line.
if args.library == 'pytorch':
    from pytorch_linear import LinearBinaryClassifier

    model = LinearBinaryClassifier(
        batch_size=1000,
        n_epochs=1,
        learning_rate=0.01,
    )
    model.fit(features_train, labels_train)
elif args.library == 'sklearn':
    import re

    import sklearn
    from sklearn.linear_model import SGDClassifier

    # scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
    # 1.3 removed the old spelling, so a hard-coded 'log' fails on current
    # releases.  Pick whichever name the installed version accepts; if the
    # version string cannot be parsed, keep the historical 'log'.
    _version_match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    _sklearn_version = (
        tuple(int(part) for part in _version_match.groups())
        if _version_match
        else (0, 0)
    )
    _logistic_loss = 'log_loss' if _sklearn_version >= (1, 1) else 'log'

    # max_iter=1 with a constant eta0 of 0.01 — presumably chosen to mirror
    # the PyTorch settings above (n_epochs=1, learning_rate=0.01).
    model = SGDClassifier(
        max_iter=1,
        eta0=0.01,
        learning_rate='constant',
        tol=None,
        loss=_logistic_loss,
    )
    model.fit(features_train, labels_train)
# Score the test split.  The PyTorch model's predict_proba output is used
# as-is; for every other backend, column 1 of the result is selected.
predictions_proba = model.predict_proba(features_test)
if args.library != 'pytorch':
    predictions_proba = predictions_proba[:, 1]

# Report ROC AUC on the test labels as a one-key JSON object on stdout.
auc_roc = roc_auc_score(labels_test, predictions_proba)
print(json.dumps({'auc_roc': auc_roc}))