Module src.train
Source code
import sys
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
import eli5
import numpy as np
from copy import deepcopy
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import KFold
import pickle as pkl
sys.path.append('lib')
from sklearn.neighbors import KNeighborsClassifier as KNN
scorers = {'balanced_accuracy': metrics.balanced_accuracy_score, 'accuracy': metrics.accuracy_score,
'precision': metrics.precision_score, 'recall': metrics.recall_score, 'f1': metrics.f1_score,
'roc_auc': metrics.roc_auc_score,
'precision_recall_curve': metrics.precision_recall_curve, 'roc_curve': metrics.roc_curve}
def get_feature_importance(model, model_type, X_val, Y_val):
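    '''Return per-feature importances for a fitted model: built-in
    feature_importances_ for tree ensembles, coefficients for logistic
    regression, and eli5 permutation importances otherwise (including
    calibrated models).
    '''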
if 'Calibrated' in str(type(model)):
perm = eli5.sklearn.permutation_importance.PermutationImportance(model).fit(X_val, Y_val)
imps = perm.feature_importances_
elif model_type in ['dt']:
imps = model.feature_importances_
elif model_type in ['rf', 'irf']:
# imps, _ = feature_importance(model, np.array(X_val), np.transpose(np.vstack((Y_val, 1-Y_val))))
imps = model.feature_importances_
elif model_type == 'logistic':
imps = model.coef_
else:
perm = eli5.sklearn.permutation_importance.PermutationImportance(model).fit(X_val, Y_val)
imps = perm.feature_importances_
return imps.squeeze()
def balance(X, y, balancing='ros', balancing_ratio=1):
    '''Balance the classes in y using the strategy specified by `balancing`.
    Params
    ------
    balancing: str
        one of 'none', 'ros' (random oversampling), or 'smote'
    balancing_ratio: float
        desired ratio of positive to negative samples
    '''
class0 = np.sum(y == 0)
class1 = np.sum(y == 1)
class_max = max(class0, class1)
if balancing_ratio >= 1:
sample_nums = {0: int(class_max), 1: int(class_max * balancing_ratio)}
else:
sample_nums = {0: int(class_max / balancing_ratio), 1: int(class_max)}
if balancing == 'none':
return X, y
if balancing == 'ros':
sampler = RandomOverSampler(sampling_strategy=sample_nums, random_state=42)
elif balancing == 'smote':
sampler = SMOTE(sampling_strategy=sample_nums, random_state=42)
X_r, Y_r = sampler.fit_resample(X, y)
return X_r, Y_r
def train(df, feat_names,
cell_nums_feature_selection, cell_nums_train,
model_type='rf', outcome_def='y_thresh',
balancing='ros', balancing_ratio=1, out_name='results/classify/test.pkl',
calibrated=False, feature_selection=None,
feature_selection_num=3, hyperparam=0, seed=42):
    '''Run cross-validated training and fit models.
    Features are always normalized before fitting, and each training fold
    is rebalanced with `balance` before fitting.
    Params
    ------
    cell_nums_feature_selection: list[str]
        cell names to use for feature selection
    cell_nums_train: list[str]
        cell names to use for cross-validated training (one fold per cell)
    out_name: str
        path of the pickle file the results dict is written to
    '''
np.random.seed(seed)
X = df[feat_names]
X = (X - X.mean()) / X.std() # normalize the data
y = df[outcome_def].values
if model_type == 'rf':
m = RandomForestClassifier(n_estimators=100)
elif model_type == 'dt':
m = DecisionTreeClassifier()
elif model_type == 'logistic':
m = LogisticRegression(solver='lbfgs')
elif model_type == 'svm':
h = {
-1: 0.5,
0: 1,
1: 5
}[hyperparam]
m = SVC(C=h, gamma='scale')
elif model_type == 'mlp2':
h = {
-1: (50,),
0: (100,),
1: (50, 50,)
}[hyperparam]
m = MLPClassifier(hidden_layer_sizes=h)
elif model_type == 'gb':
m = GradientBoostingClassifier()
elif model_type == 'qda':
m = QDA()
elif model_type == 'KNN':
m = KNN()
elif model_type == 'irf':
import irf
m = irf.ensemble.wrf()
elif model_type == 'voting_mlp+svm+rf':
models_list = [('mlp', MLPClassifier()),
('svm', SVC(gamma='scale')),
('rf', RandomForestClassifier(n_estimators=100))]
m = VotingClassifier(estimators=models_list, voting='hard')
if calibrated:
m = CalibratedClassifierCV(m)
scores_cv = {s: [] for s in scorers.keys()}
imps = {'model': [], 'imps': []}
kf = KFold(n_splits=len(cell_nums_train))
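    # kf yields one validation cell per fold, i.e. leave-one-cell-out cross-validation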
    # feature selection using the cells in cell_nums_feature_selection
feature_selector = None
if feature_selection is not None:
if feature_selection == 'select_lasso':
feature_selector_model = Lasso()
elif feature_selection == 'select_rf':
feature_selector_model = RandomForestClassifier()
# select only feature_selection_num features
feature_selector = SelectFromModel(feature_selector_model, threshold=-np.inf,
max_features=feature_selection_num)
idxs = df.cell_num.isin(cell_nums_feature_selection)
feature_selector.fit(X[idxs], y[idxs].reshape(-1, 1))
X = feature_selector.transform(X)
support = np.array(feature_selector.get_support())
else:
        support = np.ones(len(feat_names)).astype(bool)
num_pts_by_fold_cv = []
# loops over cv, where validation set order is cell_nums_train[0], ..., cell_nums_train[-1]
for cv_idx, cv_val_idx in kf.split(cell_nums_train):
# get sample indices
idxs_cv = df.cell_num.isin(cell_nums_train[np.array(cv_idx)])
idxs_val_cv = df.cell_num.isin(cell_nums_train[np.array(cv_val_idx)])
X_train_cv, Y_train_cv = X[idxs_cv], y[idxs_cv]
X_val_cv, Y_val_cv = X[idxs_val_cv], y[idxs_val_cv]
num_pts_by_fold_cv.append(X_val_cv.shape[0])
# resample training data
X_train_r_cv, Y_train_r_cv = balance(X_train_cv, Y_train_cv, balancing, balancing_ratio)
# fit
m.fit(X_train_r_cv, Y_train_r_cv)
# get preds
preds = m.predict(X_val_cv)
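        # SVC is constructed without probability=True, so it exposes no predict_proba;
        # fall back to the hard predictions for the probability-based metrics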
if 'svm' in model_type:
preds_proba = preds
else:
preds_proba = m.predict_proba(X_val_cv)[:, 1]
# add scores
for s in scorers.keys():
scorer = scorers[s]
if 'roc' in s or 'curve' in s:
scores_cv[s].append(scorer(Y_val_cv, preds_proba))
else:
scores_cv[s].append(scorer(Y_val_cv, preds))
imps['model'].append(deepcopy(m))
imps['imps'].append(get_feature_importance(m, model_type, X_val_cv, Y_val_cv))
# save results
# os.makedirs(out_dir, exist_ok=True)
results = {'metrics': list(scorers.keys()),
'num_pts_by_fold_cv': np.array(num_pts_by_fold_cv),
'cv': scores_cv,
'imps': imps, # note this contains the model
'feat_names': feat_names,
'feature_selector': feature_selector,
'feature_selection_num': feature_selection_num,
'model_type': model_type,
'balancing': balancing,
'feat_names_selected': np.array(feat_names)[support],
'calibrated': calibrated
}
pkl.dump(results, open(out_name, 'wb'))
Functions
def balance(X, y, balancing='ros', balancing_ratio=1)
    Balance the classes in y using the strategy specified by `balancing`.

    Params
    ------
    balancing: str
        one of 'none', 'ros' (random oversampling), or 'smote'
    balancing_ratio: float
        desired ratio of positive to negative samples
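    A minimal usage sketch (the toy arrays below are illustrative, not part of the module):

    import numpy as np
    from src.train import balance

    X_toy = np.random.randn(100, 5)
    y_toy = np.array([0] * 90 + [1] * 10)  # 90 negatives, 10 positives
    X_bal, y_bal = balance(X_toy, y_toy, balancing='ros', balancing_ratio=1)
    # with balancing_ratio=1 both classes are oversampled to the majority count
    print(np.bincount(y_bal))  # [90 90]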
def get_feature_importance(model, model_type, X_val, Y_val)
    Return per-feature importances for a fitted model: built-in feature_importances_ for tree ensembles, coefficients for logistic regression, and eli5 permutation importances otherwise (including calibrated models).
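    A minimal usage sketch for the random-forest case (toy data, illustrative only):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from src.train import get_feature_importance

    X_val = np.random.randn(50, 4)
    Y_val = np.random.randint(0, 2, size=50)
    m = RandomForestClassifier(n_estimators=100).fit(X_val, Y_val)
    imps = get_feature_importance(m, 'rf', X_val, Y_val)  # array of 4 importances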
def train(df, feat_names, cell_nums_feature_selection, cell_nums_train, model_type='rf', outcome_def='y_thresh', balancing='ros', balancing_ratio=1, out_name='results/classify/test.pkl', calibrated=False, feature_selection=None, feature_selection_num=3, hyperparam=0, seed=42)
    Run cross-validated training and fit models. Features are always normalized before fitting, and each training fold is rebalanced with `balance` before fitting.

    Params
    ------
    cell_nums_feature_selection: list[str]
        cell names to use for feature selection
    cell_nums_train: list[str]
        cell names to use for cross-validated training (one fold per cell)
    out_name: str
        path of the pickle file the results dict is written to
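    A minimal end-to-end sketch, assuming a DataFrame with a `cell_num` column, feature columns, and a binary `y_thresh` outcome (all names besides the function's defaults are illustrative):

    import numpy as np
    import pandas as pd
    from src.train import train

    feat_names = ['feat_a', 'feat_b', 'feat_c']
    cells = np.array(['cell_1', 'cell_2', 'cell_3', 'cell_4'])
    df = pd.DataFrame(np.random.randn(400, 3), columns=feat_names)
    df['cell_num'] = np.repeat(cells, 100)            # 100 rows per cell
    df['y_thresh'] = (np.random.rand(400) > 0.7).astype(int)

    # note: the output directory (results/classify/ here) must already exist
    train(df, feat_names,
          cell_nums_feature_selection=cells[:1],
          cell_nums_train=cells[1:],
          model_type='rf',
          out_name='results/classify/test.pkl')       # writes the results dict as a pickle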