Module src.ref.rf_neighbors

import collections
import sys

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold

sys.path.append('lib')

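# cells used for feature selection, for training, and for the held-out test set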
cell_nums_feature_selection = np.array([1])
cell_nums_train = np.array([1, 2, 3, 4, 5])
cell_nums_test = np.array([6])


def get_rf_neighbors(df, feat_names, outcome_def='y_thresh',
                     balancing='ros', balancing_ratio=1, out_name='results/classify/test.pkl',
                     feature_selection=None, feature_selection_num=3, seed=42):
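    """Fit a random forest on the training cells and, for each sample from the test
    cell, record its predicted probability and its 10 most similar training samples,
    where similarity is the number of trees in which the two samples share a leaf.

    Returns (df_train, df_test); df_test gains 'preds_proba', 'nearest_neighbors',
    and 'similarity' columns.
    """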
    # pre-processing same as train.train

    np.random.seed(seed)
    X = df[feat_names]
    y = df[outcome_def].values

    m = RandomForestClassifier(n_estimators=100)

    kf = KFold(n_splits=len(cell_nums_train))

    # feature selection on cell num 1    
    feature_selector = None
    if feature_selection is not None:
        if feature_selection == 'select_lasso':
            feature_selector_model = Lasso()
        elif feature_selection == 'select_rf':
            feature_selector_model = RandomForestClassifier()
        else:
            raise ValueError(f'unrecognized feature_selection: {feature_selection}')
        # select only feature_selection_num features
        feature_selector = SelectFromModel(feature_selector_model, threshold=-np.inf,
                                           max_features=feature_selection_num)
        idxs = df.cell_num.isin(cell_nums_feature_selection)
        feature_selector.fit(X[idxs], y[idxs])
        X = feature_selector.transform(X)
        support = np.array(feature_selector.get_support())
    else:
        support = np.ones(len(feat_names)).astype(bool)

    # split testing data based on cell num
    idxs_test = df.cell_num.isin(cell_nums_test)
    X_test, Y_test = X[idxs_test], y[idxs_test]
    idxs_train = df.cell_num.isin(cell_nums_train)
    X_train, Y_train = X[idxs_train], y[idxs_train]
    # num_pts_by_fold_cv = []

    # build dictionary, key is leaf node, value is list of training samples in the node

    m.fit(X_train, Y_train)
    node_indices = m.apply(X_train)
    node_indices_test = m.apply(X_test)
    similarity_matrix = np.zeros((len(X_test), len(X_train)))
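    # similarity_matrix[i, j] counts the trees in which test sample i and
    # training sample j land in the same leaf node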
    for tree in range(m.n_estimators):
        node_samples = collections.defaultdict(list)
        for i in range(len(X_train)):
            node_samples[node_indices[i, tree]].append(i)
        for i in range(len(X_test)):
            node = node_indices_test[i, tree]
            for j in node_samples[node]:
                similarity_matrix[i, j] += 1
    preds_proba = m.predict_proba(X_test)[:, 1]

    # nearest neighbors and similarity
    nearest_neighbors = [np.argsort(similarity_matrix[i, :])[::-1][:10] for i in range(len(X_test))]
    similarity = [np.sort(similarity_matrix[i, :])[::-1][:10] for i in range(len(X_test))]
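    # note: these are positional indices into X_train / df_train, not dataframe index labels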

    idxs_test = np.where(idxs_test)[0]
    idxs_train = np.where(idxs_train)[0]
    df_train = df.iloc[idxs_train].copy()
    df_test = df.iloc[idxs_test].copy()

    df_test['preds_proba'] = preds_proba
    df_test['nearest_neighbors'] = nearest_neighbors
    df_test['similarity'] = similarity

    return df_train, df_test

Functions

def get_rf_neighbors(df, feat_names, outcome_def='y_thresh', balancing='ros', balancing_ratio=1, out_name='results/classify/test.pkl', feature_selection=None, feature_selection_num=3, seed=42)
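A minimal usage sketch (not from the repo): it assumes a dataframe with a cell_num column covering cells 1-6, a binary y_thresh outcome, and a few hypothetical numeric feature columns; the import path depends on how the repository root sits on sys.path.

import numpy as np
import pandas as pd

from src.ref.rf_neighbors import get_rf_neighbors

rng = np.random.default_rng(0)
n = 600
feat_names = ['feat_a', 'feat_b', 'feat_c']  # hypothetical feature names
df = pd.DataFrame(rng.normal(size=(n, len(feat_names))), columns=feat_names)
df['cell_num'] = rng.integers(1, 7, size=n)  # cells 1-6
df['y_thresh'] = (df['feat_a'] + rng.normal(size=n) > 0).astype(int)

df_train, df_test = get_rf_neighbors(df, feat_names)

# each test-cell row now carries its predicted probability and the positional
# indices (into df_train) of its 10 most similar training samples
print(df_test[['preds_proba', 'nearest_neighbors', 'similarity']].head())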