Module src.outcomes
Expand source code
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'; caution: this silences pandas' SettingWithCopyWarning
from viz import *
def add_rule_based_label(df):
    df['Y_peak_time_frac'] = df['Y_peak_idx'].values / df['lifetime'].values
    df['y_z_score'] = (df['Y_max'].values - df['Y_mean'].values) / df['Y_std'].values
    X_max_around_Y_peak = []
    X_max_after_Y_peak = []
    for i in range(len(df)):
        pt = df['Y_peak_idx'].values[i]
        lt = df['lifetime'].values[i]
        left_bf = int(0.2 * lt) + 1  # look at a window spanning roughly 30% of the lifetime
        right_bf = int(0.1 * lt) + 1
        arr_around = df['X'].iloc[i][max(0, pt - left_bf): min(pt + right_bf, lt)]
        arr_after = df['X'].iloc[i][min(pt + right_bf, lt - 1):]
        X_max_around_Y_peak.append(max(arr_around))
        if len(arr_after) > 0:
            X_max_after_Y_peak.append(max(arr_after))
        else:
            X_max_after_Y_peak.append(max(arr_around))
    df['X_max_around_Y_peak'] = X_max_around_Y_peak
    df['X_max_after_Y_peak'] = X_max_after_Y_peak
    df['X_max_diff'] = df['X_max_around_Y_peak'] - df['X_max_after_Y_peak']

    def rule_based_model(track):
        # three rules:
        # 1. if aux peaks too early -- negative
        # 2. elif y_consec_sig or y_conservative_thresh or (clathrin drops around the aux peak
        #    and the aux max is greater than mean + 2.6*std) -- positive
        # 3. else -- negative
        if track['Y_peak_time_frac'] < 0.2:
            return 0
        if track['y_consec_sig'] or track['y_conservative_thresh']:
            return 1
        # if track['X_max_diff'] > 260 and track['y_z_score'] > 2.6:
        #     return 1
        if track['X_max_diff'] > 260 and track['Y_max'] > 560:
            return 1
        return 0

    df['y_rule_based'] = np.array([rule_based_model(df.iloc[i]) for i in range(len(df))])
    return df

def add_outcomes(df, LABELS=None, thresh=3.25, p_thresh=0.05, aux_peak=642.375, aux_thresh=973):
    '''Add binary outcome of whether a spike happened and info on whether events were questionable
    '''
    df['y_score'] = df['Y_max'].values - (df['Y_mean'].values + thresh * df['Y_std'].values)
    df['y_thresh'] = (df['y_score'].values > 0).astype(int)  # Y_max was big
    df['y'] = df['Y_max'] > aux_peak

    # outcomes based on significant p-values
    num_sigs = [np.array(df['Y_pvals'].iloc[i]) < p_thresh for i in range(df.shape[0])]
    df['y_num_sig'] = np.array([num_sigs[i].sum() for i in range(df.shape[0])]).astype(int)
    df['y_single_sig'] = np.array([num_sigs[i].sum() > 0 for i in range(df.shape[0])]).astype(int)
    df['y_double_sig'] = np.array([num_sigs[i].sum() > 1 for i in range(df.shape[0])]).astype(int)
    df['y_conservative_thresh'] = (df['Y_max'].values > aux_thresh).astype(int)
    y_consec_sig = []
    y_sig_min_diff = []
    for i in range(df.shape[0]):
        idxs_sig = np.where(num_sigs[i] == 1)[0]  # indices of significance
        if len(idxs_sig) > 1:
            y_sig_min_diff.append(np.min(np.diff(idxs_sig)))
        else:
            y_sig_min_diff.append(np.nan)
        # find whether there were consecutive significant indices
        if len(idxs_sig) > 1 and np.min(np.diff(idxs_sig)) == 1:
            y_consec_sig.append(1)
        else:
            y_consec_sig.append(0)
    df['y_consec_sig'] = y_consec_sig
    df['y_sig_min_diff'] = y_sig_min_diff
    df['y_consec_thresh'] = np.logical_or(df['y_consec_sig'], df['y_conservative_thresh'])

    def add_hotspots(df, num_sigs, outcome_def='consec_sig'):
        '''Identify hotspots as any track which over its time course has multiple events.
        Events must meet the event definition, then for a time not meet it, then meet it again.
        Example: two consecutive significant p-values, then a non-significant p-value, then 2 more consecutive significant p-values
        '''
        if outcome_def == 'consec_sig':
            hotspots = np.zeros(df.shape[0]).astype(int)
            for i in range(df.shape[0]):
                idxs_sig = np.where(num_sigs[i] == 1)[0]  # indices of significance
                if idxs_sig.size < 5:
                    hotspots[i] = 0
                else:
                    diffs = np.diff(idxs_sig)
                    consecs = np.where(diffs == 1)[0]  # diffs == 1 means there were consecutive sigs
                    consec_diffs = np.diff(consecs)
                    if consec_diffs.shape[0] > 0 and np.max(consec_diffs) > 2:
                        # there were greater than 2 non-consecutive sigs between the consecutive sigs
                        hotspots[i] = 1
                    else:
                        hotspots[i] = 0
            df['sig_idxs'] = num_sigs
            df['hotspots'] = hotspots == 1
            return df

    df = add_hotspots(df, num_sigs)
    if LABELS is not None:
        # use .loc to avoid chained assignment when applying manual labels
        df.loc[df.pid.isin(LABELS['pos']), 'y_consec_thresh'] = 1  # add manual pos labels
        df.loc[df.pid.isin(LABELS['neg']), 'y_consec_thresh'] = 0  # add manual neg labels
        df.loc[df.pid.isin(LABELS['hotspots']), 'hotspots'] = True  # add manual hotspot labels
    df = add_rule_based_label(df)
    return df

def add_sig_mean(df, resp_tracks=['Y']):
    """Add the regression response: mean auxilin strength among significant observations
    """
    for track in resp_tracks:
        sig_mean = []
        for i in range(len(df)):
            r = df.iloc[i]
            sigs = np.array(r[f'{track}_pvals']) < 0.05
            if sigs.sum() > 0:
                sig_mean.append(np.mean(np.array(r[track])[sigs]))
            else:
                sig_mean.append(0)
        df[f'{track}_sig_mean'] = sig_mean
        df[f'{track}_sig_mean_normalized'] = sig_mean
        # z-score the response within each cell
        for cell in set(df['cell_num']):
            cell_idx = np.where(df['cell_num'].values == cell)[0]
            y = df[f'{track}_sig_mean'].values[cell_idx]
            df.loc[df.index[cell_idx], f'{track}_sig_mean_normalized'] = (y - np.mean(y)) / np.std(y)
    return df

def add_aux_dyn_outcome(df, p_thresh=0.05, clath_thresh=1500, dyn_thresh=2000,
                        dyn_cons_thresh=5, clath_sig_frac=0.5, clath_consec_thresh_frac=0.15):
    """Add outcomes based on clathrin significance and on whether dynamin peaks before auxilin
    """
    # look for clathrin significance
    num_sigs = [np.array(df['X_pvals'].iloc[i]) < p_thresh for i in range(df.shape[0])]
    x_consec_sig = []
    x_frac_sig = []
    lifetime_steps = np.array([len(df['X'].iloc[i]) for i in range(df.shape[0])])  # get lifetimes
    for i in range(df.shape[0]):
        l = lifetime_steps[i]
        sigs = num_sigs[i]
        x_frac_sig.append(np.mean(sigs) >= clath_sig_frac)
        cons = 0
        consec_flag = False
        for j in range(len(sigs)):
            if sigs[j] == 1:
                cons += 1
            else:
                cons = 0
            if cons >= max(l * clath_consec_thresh_frac, 5):
                consec_flag = True
                break
        if consec_flag:
            x_consec_sig.append(1)
        else:
            x_consec_sig.append(0)

    # outcomes based on significant p-values
    df['clath_conservative_thresh'] = (df['X_max'].values > clath_thresh).astype(int)
    df['clath_sig'] = np.logical_and(x_consec_sig, x_frac_sig)
    df['successful'] = np.logical_and(df['y_consec_thresh'], df['clath_conservative_thresh'])
    df['successful_dynamin'] = df['successful']
    df['successful_full'] = np.logical_and(df['clath_sig'], df['successful_dynamin'])

    # look for dynamin peak
    if 'Z' in df.keys():
        num_sigs = [np.array(df['Z_pvals'].iloc[i]) < p_thresh for i in range(df.shape[0])]
        z_consec_sig = []
        for i in range(df.shape[0]):
            sigs = num_sigs[i]
            cons = 0
            consec_flag = False
            for j in range(len(sigs)):
                if sigs[j] == 1:
                    cons += 1
                else:
                    cons = 0
                if cons >= dyn_cons_thresh:
                    consec_flag = True
                    break
            if consec_flag:
                z_consec_sig.append(1)
            else:
                z_consec_sig.append(0)
        df['z_consec_sig'] = z_consec_sig
        df['Z_max'] = [np.max(df.iloc[i]['Z']) for i in range(df.shape[0])]
        df['z_thresh'] = df['Z_max'] > dyn_thresh
        df['z_consec_thresh'] = np.logical_and(df['z_consec_sig'], df['z_thresh'])
        df['Y_peak_idx'] = np.nan_to_num(np.array([np.argmax(y) for y in df.Y]))
        df['Z_peak_idx'] = np.nan_to_num(np.array([np.argmax(z) for z in df.Z]))
        df['z_peaked_first'] = df['Z_peak_idx'] < df['Y_peak_idx']
        df['z_peak'] = np.logical_and(df['z_consec_thresh'], df['z_peaked_first'])
        # dynamin peak must happen in the second half of the track
        df['z_peak'] = np.logical_and(df['z_peak'], df['Z_peak_idx'] > lifetime_steps / 2)
        df['successful_dynamin'] = np.logical_or(
            df['successful'],
            np.logical_and(df['clath_conservative_thresh'], df['z_peak'])
        )
        df['successful_full'] = np.logical_and(df['clath_sig'], df['successful_dynamin'])
    return df
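Taken together, these functions are applied one after another to a tracks DataFrame. Below is a minimal sketch (not part of the module) assuming df already carries the raw series columns 'X', 'Y' (optionally 'Z') and the summary columns referenced above ('Y_max', 'Y_mean', 'Y_std', 'Y_pvals', 'X_pvals', 'X_max', 'lifetime', 'Y_peak_idx', 'cell_num', 'pid') from the upstream feature-extraction step; the calling order is an assumption based on the column dependencies, and the import path simply mirrors the module name shown above.

from src.outcomes import add_outcomes, add_sig_mean, add_aux_dyn_outcome

df = add_outcomes(df)          # y, y_consec_thresh, hotspots, y_rule_based, ...
df = add_sig_mean(df)          # Y_sig_mean, Y_sig_mean_normalized
df = add_aux_dyn_outcome(df)   # clath_sig, successful, successful_dynamin, successful_full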
Functions
def add_aux_dyn_outcome(df, p_thresh=0.05, clath_thresh=1500, dyn_thresh=2000, dyn_cons_thresh=5, clath_sig_frac=0.5, clath_consec_thresh_frac=0.15)
-
Add outcomes based on clathrin significance and on whether dynamin peaks before auxilin
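A hedged usage sketch: the thresholds below are just the defaults, and the input df is assumed to already carry 'X_pvals', 'X_max' and 'y_consec_thresh' (from add_outcomes), plus optional dynamin columns 'Z' and 'Z_pvals'.

df = add_aux_dyn_outcome(df, p_thresh=0.05, clath_thresh=1500, dyn_thresh=2000)
# fraction of tracks passing each outcome (these are boolean columns created above)
print(df[['clath_sig', 'successful', 'successful_full']].mean())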
def add_outcomes(df, LABELS=None, thresh=3.25, p_thresh=0.05, aux_peak=642.375, aux_thresh=973)
-
Add binary outcome of whether a spike happened and info on whether events were questionable
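Example usage (a sketch; the optional LABELS dict of manually curated pids is inferred from the code above and its contents here are hypothetical placeholders):

LABELS = {'pos': [...], 'neg': [...], 'hotspots': [...]}  # manually labeled track pids
df = add_outcomes(df, LABELS=LABELS, thresh=3.25, aux_thresh=973)
df['y_consec_thresh'].value_counts()  # one of the main binary outcomes used downstream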
def add_rule_based_label(df)
-
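add_rule_based_label is called at the end of add_outcomes; it assumes 'Y_peak_idx', 'lifetime', 'y_consec_sig' and 'y_conservative_thresh' already exist and adds 'Y_peak_time_frac', 'X_max_around_Y_peak', 'X_max_after_Y_peak', 'X_max_diff' and the heuristic label 'y_rule_based'. A minimal sketch of calling it (the rule summary in the comment restates the hard-coded constants from the source above):

# negative if auxilin peaks in the first 20% of the lifetime;
# positive if there are consecutive significant p-values, the conservative Y_max threshold is met,
# or clathrin drops by > 260 around the auxilin peak while Y_max > 560; otherwise negative
df = add_rule_based_label(df)
df['y_rule_based'].value_counts()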
def add_sig_mean(df, resp_tracks=['Y'])
-
Add the regression response: mean auxilin strength among significant observations
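Usage sketch, assuming each row has a '{track}_pvals' list and a 'cell_num' column as used in the code above; because the response is z-scored per cell, the normalized column should have roughly mean 0 and std 1 within each cell:

df = add_sig_mean(df, resp_tracks=['Y'])
df.groupby('cell_num')['Y_sig_mean_normalized'].agg(['mean', 'std'])  # ~0 and ~1 per cell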