Module src.train_reg

Expand source code
import os
import pickle as pkl
from os.path import join as oj

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from statsmodels import robust
import features
import data
import config
from tqdm import tqdm
from scipy.stats import pearsonr, kendalltau
from neural_networks import neural_net_sklearn

#cell_nums_train = np.array([1, 2, 3, 4, 5])
#cell_nums_test = np.array([6])


def add_robust_features(df):
    df['X_95_quantile'] = np.array([np.quantile(df.iloc[i].X, 0.95) for i in range(len(df))])
    df['X_mad'] = np.array([robust.mad(df.iloc[i].X) for i in range(len(df))])
    return df


def log_transforms(df):
    
    df = add_robust_features(df)
    df['X_max_log'] = np.log(df['X_max'])
    df['X_95_quantile_log'] = np.log(df['X_95_quantile'] + 1)
    df['Y_max_log'] = np.log(df['Y_max'])
    df['X_mad_log'] = np.log(df['X_mad'])

    def calc_rise_log(x):
        idx_max = np.argmax(x)
        val_max = x[idx_max]
        rise = np.log(val_max) - np.log(abs(np.min(x[:idx_max + 1])) + 1)  # max change before peak
        return rise

    def calc_fall_log(x):
        idx_max = np.argmax(x)
        val_max = x[idx_max]
        fall = np.log(val_max) - np.log(abs(np.min(x[idx_max:])) + 1)  # drop after peak
        return fall

    df['rise_log'] = np.array([calc_rise_log(df.iloc[i].X) for i in range(len(df))])
    df['fall_log'] = np.array([calc_fall_log(df.iloc[i].X) for i in range(len(df))])
    num = 3
    df['rise_local_3_log'] = df.apply(lambda row:
                                      calc_rise_log(np.array(row['X'][max(0, row['X_peak_idx'] - num):
                                                                      row['X_peak_idx'] + num + 1])),
                                      axis=1)
    df['fall_local_3_log'] = df.apply(lambda row:
                                      calc_fall_log(np.array(row['X'][max(0, row['X_peak_idx'] - num):
                                                                      row['X_peak_idx'] + num + 1])),
                                      axis=1)

    num2 = 11
    df['rise_local_11_log'] = df.apply(lambda row:
                                       calc_rise_log(np.array(row['X'][max(0, row['X_peak_idx'] - num2):
                                                                       row['X_peak_idx'] + num2 + 1])),
                                       axis=1)
    df['fall_local_11_log'] = df.apply(lambda row:
                                       calc_fall_log(np.array(row['X'][max(0, row['X_peak_idx'] - num2):
                                                                       row['X_peak_idx'] + num2 + 1])),
                                       axis=1)
    return df


def train_reg(df, 
              feat_names, 
              model_type='rf', 
              outcome_def='Y_max_log',
              out_name='results/regression/test.pkl', 
              seed=42,
              **kwargs):
    '''
    Train a regression model with leave-one-cell-out cross-validation
    (folds split by cell_num), refit it on the full data, and pickle the results to out_name.

    Model hyperparameters can be passed via **kwargs
    (e.g. fcnn_hidden_neurons, fcnn_epochs, fcnn_batch_size, track_name for the neural nets).
    '''
    np.random.seed(seed)
    X = df[feat_names]
    # X = (X - X.mean()) / X.std() # normalize the data
    y = df[outcome_def].values

    if model_type == 'rf':
        m = RandomForestRegressor(n_estimators=100)
    elif model_type == 'dt':
        m = DecisionTreeRegressor()
    elif model_type == 'linear':
        m = LinearRegression()
    elif model_type == 'ridge':
        m = RidgeCV()
    elif model_type == 'svm':
        m = SVR(gamma='scale')
    elif model_type == 'gb':
        m = GradientBoostingRegressor()
    elif model_type == 'irf':
        import irf  # iterative random forest (optional dependency, only needed for this branch)
        m = irf.ensemble.wrf()
    elif 'nn' in model_type:  # neural nets (fully connected and other architectures)
        H = kwargs.get('fcnn_hidden_neurons', 40)
        epochs = kwargs.get('fcnn_epochs', 1000)
        batch_size = kwargs.get('fcnn_batch_size', 1000)
        track_name = kwargs.get('track_name', 'X_same_length')
        D_in = len(df[track_name].iloc[0])  # input dimension = length of the track
        
        m = neural_net_sklearn(D_in=D_in, 
                             H=H, 
                             p=len(feat_names)-1,
                             epochs=epochs,
                             batch_size=batch_size,
                             track_name=track_name,
                             arch=model_type)

    # scores_cv = {s: [] for s in scorers.keys()}
    # scores_test = {s: [] for s in scorers.keys()}
    imps = {'model': [], 'imps': []}

    cell_nums_train = np.unique(df.cell_num.values)  # one CV fold per cell, deterministic order
    kf = KFold(n_splits=len(cell_nums_train))

    # split testing data based on cell num
    #idxs_test = df.cell_num.isin(cell_nums_test)
    #idxs_train = df.cell_num.isin(cell_nums_train)
    #X_test, Y_test = X[idxs_test], y[idxs_test]
    num_pts_by_fold_cv = []
    y_preds = {}
    cv_score = []
    cv_pearsonr = []
    
    print("Looping over cv...")
    # loops over cv, where test set order is cell_nums_train[0], ..., cell_nums_train[-1]
    for cv_idx, cv_val_idx in tqdm(kf.split(cell_nums_train)):
        # get sample indices for this fold (train on all cells except the held-out cell)
        idxs_cv = df.cell_num.isin(cell_nums_train[np.array(cv_idx)])
        idxs_val_cv = df.cell_num.isin(cell_nums_train[np.array(cv_val_idx)])
        X_train_cv, Y_train_cv = X[idxs_cv], y[idxs_cv]
        X_val_cv, Y_val_cv = X[idxs_val_cv], y[idxs_val_cv]
        num_pts_by_fold_cv.append(X_val_cv.shape[0])

        # resample training data

        # fit
        m.fit(X_train_cv, Y_train_cv)

        # get preds
        preds = m.predict(X_val_cv)
        y_preds[cell_nums_train[np.array(cv_val_idx)][0]] = preds
        if 'log' in outcome_def:
            cv_score.append(r2_score(np.exp(Y_val_cv), np.exp(preds)))
            cv_pearsonr.append(pearsonr(np.exp(Y_val_cv), np.exp(preds))[0])
        else:
            print(r2_score(Y_val_cv, preds))
            cv_score.append(r2_score(Y_val_cv, preds))
            cv_pearsonr.append(pearsonr(Y_val_cv, preds)[0])
    
    print("Training with full data...")
    # cv_score = cv_score/len(cell_nums_train)
    m.fit(X, y)
    #print(cv_score)
    #test_preds = m.predict(X_test)
    results = {'y_preds': y_preds,
               'y': y,
               'cv': {'r2': cv_score, 'pearsonr': cv_pearsonr},
               'model_type': model_type,
               'num_pts_by_fold_cv': np.array(num_pts_by_fold_cv),
               }
    if model_type in ['rf', 'linear', 'ridge', 'gb', 'svm', 'irf']:
        # sklearn-style models can be pickled directly
        results['model'] = m
    elif 'nn' in model_type:
        # for neural nets, save only the underlying model's state dict
        results['model_state_dict'] = m.model.state_dict()

    # save results
    with open(out_name, 'wb') as f:
        pkl.dump(results, f)


def load_results(out_dir, by_cell=True):
    r = []
    for fname in os.listdir(out_dir):
        if os.path.isdir(oj(out_dir, fname)): 
            continue
        d = pkl.load(open(oj(out_dir, fname), 'rb'))
        metrics = {k: d['cv'][k] for k in d['cv'].keys() if not 'curve' in k}
        num_pts_by_fold_cv = d['num_pts_by_fold_cv']
        print(metrics)
        out = {k: np.average(metrics[k], weights=num_pts_by_fold_cv) for k in metrics}
        if by_cell:
            out.update({'cv_accuracy_by_cell': metrics['r2']})
        out.update({k + '_std': np.std(metrics[k]) for k in metrics})
        out['model_type'] = fname.replace('.pkl', '')  # d['model_type']
        print(d['cv'].keys())
        # imp_mat = np.array(d['imps']['imps'])
        # imp_mu = imp_mat.mean(axis=0)
        # imp_sd = imp_mat.std(axis=0)

        # feat_names = d['feat_names_selected']
        # out.update({feat_names[i] + '_f': imp_mu[i] for i in range(len(feat_names))})
        # out.update({feat_names[i]+'_std_f': imp_sd[i] for i in range(len(feat_names))})
        r.append(pd.Series(out))
    r = pd.concat(r, axis=1, sort=False).T.infer_objects()
    r = r.reindex(sorted(r.columns), axis=1)  # sort the column names
    r = r.round(3)
    r = r.set_index('model_type')
    return r

def load_and_train(dset, outcome_def, out_dir, feat_names=None, use_processed=True):
    
    df = pd.read_pickle(f'../data/tracks/tracks_{dset}.pkl')
    if dset == 'clath_aux_dynamin':
        df = df[df.catIdx.isin([1, 2])]
        df = df[df.lifetime > 15]
    else:
        df = df[df['valid'] == 1] 
    df = features.add_basic_features(df)
    df = log_transforms(df)
    df = add_sig_mean(df)
    df_train = df[df.cell_num.isin(config.DSETS[dset]['train'])] 
    df_test = df[df.cell_num.isin(config.DSETS[dset]['test'])] 
    df_train = df_train.dropna()
    
    #outcome_def = 'Z_sig_mean'
    #out_dir = 'results/regression/Sep15'
    os.makedirs(out_dir, exist_ok=True)
    if not feat_names:
        feat_names = data.get_feature_names(df_train)
        feat_names = [x for x in feat_names 
                      if not x.startswith('sc_') 
                      and not x.startswith('nmf_')
                      and not x in ['center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                                   'X_max_around_Y_peak', 'X_max_after_Y_peak']
                      and not x.startswith('pc_')
                      and not 'log' in x
                      and not 'binary' in x
        #               and not 'slope' in x
                     ]
    for model_type in tqdm(['linear', 'gb', 'rf', 'svm', 'ridge']):
        out_name = f'{model_type}'
        if use_processed and os.path.exists(f'{out_dir}/{out_name}.pkl'):
            continue
        train_reg(df_train, feat_names=feat_names, model_type=model_type,
                  outcome_def=outcome_def,
                  out_name=f'{out_dir}/{out_name}.pkl')
        
def test_reg(df, 
             model, 
             feat_names=None, 
             outcome_def='Y_max_log',
             out_name='results/regression/test.pkl', 
             seed=42):

    np.random.seed(seed)
    if not feat_names:
        feat_names = data.get_feature_names(df)
        feat_names = [x for x in feat_names 
                      if not x.startswith('sc_') 
                      and not x.startswith('nmf_')
                      and not x in ['center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                                   'X_max_around_Y_peak', 'X_max_after_Y_peak']
                      and not x.startswith('pc_')
                      and not 'log' in x
                      and not 'binary' in x
        #               and not 'slope' in x
                     ]
    X = df[feat_names]
    # X = (X - X.mean()) / X.std() # normalize the data
    test_preds = model.predict(X)
    results = {'preds': test_preds}
    if outcome_def in df.keys():
        y = df[outcome_def].values
        results['r2'] = r2_score(y, test_preds)
        results['pearsonr'] = pearsonr(y, test_preds)
        results['kendalltau'] = kendalltau(y, test_preds)
        
    return results

Functions

def add_robust_features(df)
Expand source code
def add_robust_features(df):
    df['X_95_quantile'] = np.array([np.quantile(df.iloc[i].X, 0.95) for i in range(len(df))])
    df['X_mad'] = np.array([robust.mad(df.iloc[i].X) for i in range(len(df))])
    return df
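
A minimal usage sketch, assuming the module is importable as src.train_reg; the toy DataFrame is purely illustrative (in the real pipeline each row's X holds one fluorescence trace from the tracks pickle):

import numpy as np
import pandas as pd
from src.train_reg import add_robust_features

# hypothetical toy tracks: each row's X is a 1-D array of intensities
df_toy = pd.DataFrame({'X': [np.array([1.0, 5.0, 2.0]),
                             np.array([0.5, 3.0, 4.0, 1.0])]})
df_toy = add_robust_features(df_toy)
print(df_toy[['X_95_quantile', 'X_mad']])  # 95th-percentile and MAD of each trace
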
def load_and_train(dset, outcome_def, out_dir, feat_names=None, use_processed=True)
Expand source code
def load_and_train(dset, outcome_def, out_dir, feat_names=None, use_processed=True):
    
    df = pd.read_pickle(f'../data/tracks/tracks_{dset}.pkl')
    if dset == 'clath_aux_dynamin':
        df = df[df.catIdx.isin([1, 2])]
        df = df[df.lifetime > 15]
    else:
        df = df[df['valid'] == 1] 
    df = features.add_basic_features(df)
    df = log_transforms(df)
    df = add_sig_mean(df)
    df_train = df[df.cell_num.isin(config.DSETS[dset]['train'])] 
    df_test = df[df.cell_num.isin(config.DSETS[dset]['test'])] 
    df_train = df_train.dropna()
    
    #outcome_def = 'Z_sig_mean'
    #out_dir = 'results/regression/Sep15'
    os.makedirs(out_dir, exist_ok=True)
    if not feat_names:
        feat_names = data.get_feature_names(df_train)
        feat_names = [x for x in feat_names 
                      if not x.startswith('sc_') 
                      and not x.startswith('nmf_')
                      and not x in ['center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                                   'X_max_around_Y_peak', 'X_max_after_Y_peak']
                      and not x.startswith('pc_')
                      and not 'log' in x
                      and not 'binary' in x
        #               and not 'slope' in x
                     ]
    for model_type in tqdm(['linear', 'gb', 'rf', 'svm', 'ridge']):
        out_name = f'{model_type}'
        if use_processed and os.path.exists(f'{out_dir}/{out_name}.pkl'):
            continue
        train_reg(df_train, feat_names=feat_names, model_type=model_type,
                  outcome_def=outcome_def,
                  out_name=f'{out_dir}/{out_name}.pkl')
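
A sketch of how this might be invoked; 'some_dset' and the output directory are placeholders (valid dataset keys are whatever config.DSETS defines, and the corresponding tracks pickle must exist under ../data/tracks/):

from src.train_reg import load_and_train

# hypothetical call: trains linear/gb/rf/svm/ridge models and pickles one result file per model
load_and_train(dset='some_dset',
               outcome_def='Y_max_log',
               out_dir='results/regression/example_run')
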
def load_results(out_dir, by_cell=True)
Expand source code
def load_results(out_dir, by_cell=True):
    r = []
    for fname in os.listdir(out_dir):
        if os.path.isdir(oj(out_dir, fname)): 
            continue
        d = pkl.load(open(oj(out_dir, fname), 'rb'))
        metrics = {k: d['cv'][k] for k in d['cv'].keys() if not 'curve' in k}
        num_pts_by_fold_cv = d['num_pts_by_fold_cv']
        print(metrics)
        out = {k: np.average(metrics[k], weights=num_pts_by_fold_cv) for k in metrics}
        if by_cell:
            out.update({'cv_accuracy_by_cell': metrics['r2']})
        out.update({k + '_std': np.std(metrics[k]) for k in metrics})
        out['model_type'] = fname.replace('.pkl', '')  # d['model_type']
        print(d['cv'].keys())
        # imp_mat = np.array(d['imps']['imps'])
        # imp_mu = imp_mat.mean(axis=0)
        # imp_sd = imp_mat.std(axis=0)

        # feat_names = d['feat_names_selected']
        # out.update({feat_names[i] + '_f': imp_mu[i] for i in range(len(feat_names))})
        # out.update({feat_names[i]+'_std_f': imp_sd[i] for i in range(len(feat_names))})
        r.append(pd.Series(out))
    r = pd.concat(r, axis=1, sort=False).T.infer_objects()
    r = r.reindex(sorted(r.columns), axis=1)  # sort the column names
    r = r.round(3)
    r = r.set_index('model_type')
    return r
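
Typical usage, assuming the directory contains the per-model pickles written by train_reg (the path is a placeholder):

from src.train_reg import load_results

results_df = load_results('results/regression/example_run')
# rows are indexed by model type (derived from the file name); columns include the
# fold-size-weighted CV 'r2' and 'pearsonr' plus their '_std' counterparts
print(results_df[['r2', 'pearsonr']])
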
def log_transforms(df)
Expand source code
def log_transforms(df):
    
    df = add_robust_features(df)
    df['X_max_log'] = np.log(df['X_max'])
    df['X_95_quantile_log'] = np.log(df['X_95_quantile'] + 1)
    df['Y_max_log'] = np.log(df['Y_max'])
    df['X_mad_log'] = np.log(df['X_mad'])

    def calc_rise_log(x):
        idx_max = np.argmax(x)
        val_max = x[idx_max]
        rise = np.log(val_max) - np.log(abs(np.min(x[:idx_max + 1])) + 1)  # max change before peak
        return rise

    def calc_fall_log(x):
        idx_max = np.argmax(x)
        val_max = x[idx_max]
        fall = np.log(val_max) - np.log(abs(np.min(x[idx_max:])) + 1)  # drop after peak
        return fall

    df['rise_log'] = np.array([calc_rise_log(df.iloc[i].X) for i in range(len(df))])
    df['fall_log'] = np.array([calc_fall_log(df.iloc[i].X) for i in range(len(df))])
    num = 3
    df['rise_local_3_log'] = df.apply(lambda row:
                                      calc_rise_log(np.array(row['X'][max(0, row['X_peak_idx'] - num):
                                                                      row['X_peak_idx'] + num + 1])),
                                      axis=1)
    df['fall_local_3_log'] = df.apply(lambda row:
                                      calc_fall_log(np.array(row['X'][max(0, row['X_peak_idx'] - num):
                                                                      row['X_peak_idx'] + num + 1])),
                                      axis=1)

    num2 = 11
    df['rise_local_11_log'] = df.apply(lambda row:
                                       calc_rise_log(np.array(row['X'][max(0, row['X_peak_idx'] - num2):
                                                                       row['X_peak_idx'] + num2 + 1])),
                                       axis=1)
    df['fall_local_11_log'] = df.apply(lambda row:
                                       calc_fall_log(np.array(row['X'][max(0, row['X_peak_idx'] - num2):
                                                                       row['X_peak_idx'] + num2 + 1])),
                                       axis=1)
    return df
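
A minimal sketch of the inputs this expects: besides X, each row must already carry X_max, Y_max and X_peak_idx (normally produced by features.add_basic_features); the toy values below are illustrative only:

import numpy as np
import pandas as pd
from src.train_reg import log_transforms

x = np.array([1.0, 2.0, 8.0, 3.0, 1.0])  # hypothetical single track
df_toy = pd.DataFrame({'X': [x],
                       'X_max': [x.max()],
                       'Y_max': [4.0],
                       'X_peak_idx': [int(np.argmax(x))]})
df_toy = log_transforms(df_toy)
print(df_toy[['X_max_log', 'rise_log', 'fall_log', 'rise_local_3_log']])
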
def test_reg(df, model, feat_names=None, outcome_def='Y_max_log', out_name='results/regression/test.pkl', seed=42)
Expand source code
def test_reg(df, 
             model, 
             feat_names=None, 
             outcome_def='Y_max_log',
             out_name='results/regression/test.pkl', 
             seed=42):

    np.random.seed(seed)
    if not feat_names:
        feat_names = data.get_feature_names(df)
        feat_names = [x for x in feat_names 
                      if not x.startswith('sc_') 
                      and not x.startswith('nmf_')
                      and not x in ['center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                                   'X_max_around_Y_peak', 'X_max_after_Y_peak']
                      and not x.startswith('pc_')
                      and not 'log' in x
                      and not 'binary' in x
        #               and not 'slope' in x
                     ]
    X = df[feat_names]
    # X = (X - X.mean()) / X.std() # normalize the data
    test_preds = model.predict(X)
    results = {'preds': test_preds}
    if outcome_def in df.keys():
        y = df[outcome_def].values
        results['r2'] = r2_score(y, test_preds)
        results['pearsonr'] = pearsonr(y, test_preds)
        results['kendalltau'] = kendalltau(y, test_preds)
        
    return results
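
A sketch of evaluating a previously pickled sklearn-style model on held-out data; the path, df_test and feat_names are placeholders, and feat_names should match the features the model was trained on:

import pickle as pkl
from src.train_reg import test_reg

# hypothetical result file written by train_reg for a non-neural-net model
with open('results/regression/example_run/rf.pkl', 'rb') as f:
    saved = pkl.load(f)

res = test_reg(df_test, model=saved['model'],
               feat_names=feat_names, outcome_def='Y_max_log')
print(res['r2'], res['pearsonr'][0])
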
def train_reg(df, feat_names, model_type='rf', outcome_def='Y_max_log', out_name='results/regression/test.pkl', seed=42, **kwargs)

Train a regression model with leave-one-cell-out cross-validation (folds split by cell_num), refit it on the full data, and pickle the results to out_name.

Model hyperparameters can be passed via **kwargs (e.g. fcnn_hidden_neurons, fcnn_epochs, fcnn_batch_size, track_name for the neural nets).
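
A minimal sketch (df_train and feat_names are assumed to be prepared as in load_and_train):

from src.train_reg import train_reg

# gradient-boosted regressor, one CV fold per cell, results pickled to out_name;
# for 'nn' model types, hyperparameters such as fcnn_hidden_neurons / fcnn_epochs go in **kwargs
train_reg(df_train,
          feat_names=feat_names,
          model_type='gb',
          outcome_def='Y_max_log',
          out_name='results/regression/example_run/gb.pkl')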

Expand source code
def train_reg(df, 
              feat_names, 
              model_type='rf', 
              outcome_def='Y_max_log',
              out_name='results/regression/test.pkl', 
              seed=42,
              **kwargs):
    '''
    Train a regression model with leave-one-cell-out cross-validation
    (folds split by cell_num), refit it on the full data, and pickle the results to out_name.

    Model hyperparameters can be passed via **kwargs
    (e.g. fcnn_hidden_neurons, fcnn_epochs, fcnn_batch_size, track_name for the neural nets).
    '''
    np.random.seed(seed)
    X = df[feat_names]
    # X = (X - X.mean()) / X.std() # normalize the data
    y = df[outcome_def].values

    if model_type == 'rf':
        m = RandomForestRegressor(n_estimators=100)
    elif model_type == 'dt':
        m = DecisionTreeRegressor()
    elif model_type == 'linear':
        m = LinearRegression()
    elif model_type == 'ridge':
        m = RidgeCV()
    elif model_type == 'svm':
        m = SVR(gamma='scale')
    elif model_type == 'gb':
        m = GradientBoostingRegressor()
    elif model_type == 'irf':
        import irf  # iterative random forest (optional dependency, only needed for this branch)
        m = irf.ensemble.wrf()
    elif 'nn' in model_type:  # neural nets (fully connected and other architectures)
        H = kwargs.get('fcnn_hidden_neurons', 40)
        epochs = kwargs.get('fcnn_epochs', 1000)
        batch_size = kwargs.get('fcnn_batch_size', 1000)
        track_name = kwargs.get('track_name', 'X_same_length')
        D_in = len(df[track_name].iloc[0])  # input dimension = length of the track
        
        m = neural_net_sklearn(D_in=D_in, 
                             H=H, 
                             p=len(feat_names)-1,
                             epochs=epochs,
                             batch_size=batch_size,
                             track_name=track_name,
                             arch=model_type)

    # scores_cv = {s: [] for s in scorers.keys()}
    # scores_test = {s: [] for s in scorers.keys()}
    imps = {'model': [], 'imps': []}

    cell_nums_train = np.unique(df.cell_num.values)  # one CV fold per cell, deterministic order
    kf = KFold(n_splits=len(cell_nums_train))

    # split testing data based on cell num
    #idxs_test = df.cell_num.isin(cell_nums_test)
    #idxs_train = df.cell_num.isin(cell_nums_train)
    #X_test, Y_test = X[idxs_test], y[idxs_test]
    num_pts_by_fold_cv = []
    y_preds = {}
    cv_score = []
    cv_pearsonr = []
    
    print("Looping over cv...")
    # loops over cv, where test set order is cell_nums_train[0], ..., cell_nums_train[-1]
    for cv_idx, cv_val_idx in tqdm(kf.split(cell_nums_train)):
        # get sample indices for this fold (train on all cells except the held-out cell)
        idxs_cv = df.cell_num.isin(cell_nums_train[np.array(cv_idx)])
        idxs_val_cv = df.cell_num.isin(cell_nums_train[np.array(cv_val_idx)])
        X_train_cv, Y_train_cv = X[idxs_cv], y[idxs_cv]
        X_val_cv, Y_val_cv = X[idxs_val_cv], y[idxs_val_cv]
        num_pts_by_fold_cv.append(X_val_cv.shape[0])

        # resample training data

        # fit
        m.fit(X_train_cv, Y_train_cv)

        # get preds
        preds = m.predict(X_val_cv)
        y_preds[cell_nums_train[np.array(cv_val_idx)][0]] = preds
        if 'log' in outcome_def:
            cv_score.append(r2_score(np.exp(Y_val_cv), np.exp(preds)))
            cv_pearsonr.append(pearsonr(np.exp(Y_val_cv), np.exp(preds))[0])
        else:
            print(r2_score(Y_val_cv, preds))
            cv_score.append(r2_score(Y_val_cv, preds))
            cv_pearsonr.append(pearsonr(Y_val_cv, preds)[0])
    
    print("Training with full data...")
    # cv_score = cv_score/len(cell_nums_train)
    m.fit(X, y)
    #print(cv_score)
    #test_preds = m.predict(X_test)
    results = {'y_preds': y_preds,
               'y': y,
               'cv': {'r2': cv_score, 'pearsonr': cv_pearsonr},
               'model_type': model_type,
               'num_pts_by_fold_cv': np.array(num_pts_by_fold_cv),
               }
    if model_type in ['rf', 'linear', 'ridge', 'gb', 'svm', 'irf']:
        # sklearn-style models can be pickled directly
        results['model'] = m
    elif 'nn' in model_type:
        # for neural nets, save only the underlying model's state dict
        results['model_state_dict'] = m.model.state_dict()

    # save results
    with open(out_name, 'wb') as f:
        pkl.dump(results, f)