Module src.features
import os
from copy import deepcopy
from os.path import join as oj
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, RidgeCV
pd.options.mode.chained_assignment = None  # default='warn'; caution: this silences pandas' SettingWithCopyWarning
import pickle as pkl
#from viz import *
import config
from scipy.interpolate import UnivariateSpline
from sklearn.decomposition import DictionaryLearning, NMF
from sklearn import decomposition
import trend_filtering
import data
from scipy.stats import skew, pearsonr
from tqdm import tqdm
def add_pcs(df):
'''adds 10 pcs based on feature names
'''
feat_names = data.get_feature_names(df)
X = df[feat_names]
X = (X - X.mean()) / X.std()
pca = decomposition.PCA(whiten=True)
pca.fit(X[df.valid])
X_reduced = pca.transform(X)
for i in range(10):
df['pc_' + str(i)] = X_reduced[:, i]
return df
def add_dict_features(df,
sc_comps_file=oj(config.DIR_INTERIM, 'dictionaries/sc_12_alpha=1.pkl'),
nmf_comps_file=oj(config.DIR_INTERIM, 'dictionaries/nmf_12.pkl'),
use_processed=True):
'''Add features from saved dictionary to df
'''
def sparse_code(X_mat, n_comps=12, alpha=1, out_dir=oj(config.DIR_INTERIM, 'dictionaries')):
print('sparse coding...')
d = DictionaryLearning(n_components=n_comps, alpha=alpha, random_state=42)
d.fit(X_mat)
pkl.dump(d, open(oj(out_dir, f'sc_{n_comps}_alpha={alpha}.pkl'), 'wb'))
def nmf(X_mat, n_comps=12, out_dir=oj(config.DIR_INTERIM, 'dictionaries')):
print('running nmf...')
d = NMF(n_components=n_comps, random_state=42)
d.fit(X_mat)
pkl.dump(d, open(oj(out_dir, f'nmf_{n_comps}.pkl'), 'wb'))
X_mat = extract_X_mat(df)
X_mat -= np.min(X_mat)
# if feats don't exist, compute them
if not use_processed or not os.path.exists(sc_comps_file):
os.makedirs(oj(config.DIR_INTERIM, 'dictionaries'), exist_ok=True)
sparse_code(X_mat)
nmf(X_mat)
try:
# sc
d_sc = pkl.load(open(sc_comps_file, 'rb'))
encoding = d_sc.transform(X_mat)
for i in range(encoding.shape[1]):
df[f'sc_{i}'] = encoding[:, i]
# nmf
d_nmf = pkl.load(open(nmf_comps_file, 'rb'))
encoding_nmf = d_nmf.transform(X_mat)
for i in range(encoding_nmf.shape[1]):
df[f'nmf_{i}'] = encoding_nmf[:, i]
    except Exception:
        print('dict features not added!')
return df
def add_smoothed_splines(df,
                         method='spline',
                         s_spl=0.004):
    '''Fit a univariate smoothing spline to each track in df['X'] and add the
    smoothed values, their first and second derivatives, and related summary
    features (maxima, number of local extrema, linear fits to the track ends).
    '''
X_smooth_spl = []
X_smooth_spl_dx = []
X_smooth_spl_d2x = []
def num_local_maxima(x):
return (len([i for i in range(1, len(x) - 1) if x[i] > x[i - 1] and x[i] > x[i + 1]]))
for x in df['X']:
spl = UnivariateSpline(x=range(len(x)),
y=x,
w=[1.0 / len(x)] * len(x),
s=np.var(x) * s_spl)
spl_dx = spl.derivative()
spl_d2x = spl_dx.derivative()
X_smooth_spl.append(spl(range(len(x))))
X_smooth_spl_dx.append(spl_dx(range(len(x))))
X_smooth_spl_d2x.append(spl_d2x(range(len(x))))
df['X_smooth_spl'] = np.array(X_smooth_spl)
df['X_smooth_spl_dx'] = np.array(X_smooth_spl_dx)
df['X_smooth_spl_d2x'] = np.array(X_smooth_spl_d2x)
df['X_max_spl'] = np.array([np.max(x) for x in X_smooth_spl])
df['dx_max_spl'] = np.array([np.max(x) for x in X_smooth_spl_dx])
df['d2x_max_spl'] = np.array([np.max(x) for x in X_smooth_spl_d2x])
df['num_local_max_spl'] = np.array([num_local_maxima(x) for x in X_smooth_spl])
df['num_local_min_spl'] = np.array([num_local_maxima(-1 * x) for x in X_smooth_spl])
# linear fits
x = np.arange(5).reshape(-1, 1)
df['end_linear_fit'] = [LinearRegression().fit(x, end).coef_[0] for end in df['X_ends']]
df['start_linear_fit'] = [LinearRegression().fit(x, start).coef_[0] for start in df['X_starts']]
return df
def add_trend_filtering(df):
    '''Apply first-order trend filtering to each track in df['X'], recompute
    features on the smoothed tracks, and add them with the suffix '_tf_smooth'.
    '''
df_tf = deepcopy(df)
for i in range(len(df)):
df_tf['X'].iloc[i] = trend_filtering.trend_filtering(y=df['X'].iloc[i], vlambda=len(df['X'].iloc[i]) * 5,
order=1)
df_tf = add_features(df_tf)
feat_names = data.get_feature_names(df_tf)
feat_names = [x for x in feat_names
if not x.startswith('sc_')
and not x.startswith('nmf_')
and not x in ['center_max', 'left_max', 'right_max', 'up_max', 'down_max',
'X_max_around_Y_peak', 'X_max_after_Y_peak', 'X_max_diff_after_Y_peak',
'X_tf']
and not x.startswith('pc_')
# and not 'local' in x
# and not 'X_peak' in x
# and not 'slope' in x
# and not x in ['fall_final', 'fall_slope', 'fall_imp', 'fall']
]
for feat in feat_names:
df[feat + '_tf_smooth'] = df_tf[feat]
return df
def add_basic_features(df):
'''Add a bunch of extra features to the df based on df.X, df.X_extended, df.Y, df.lifetime
'''
df = df[df.lifetime > 2]
df['X_max'] = np.array([max(x) for x in df.X.values])
df['X_max_extended'] = np.array([max(x) for x in df.X_extended.values])
df['X_min'] = np.array([min(x) for x in df.X.values])
df['X_mean'] = np.nan_to_num(np.array([np.nanmean(x) for x in df.X.values]))
df['X_std'] = np.nan_to_num(np.array([np.std(x) for x in df.X.values]))
df['Y_max'] = np.array([max(y) for y in df.Y.values])
df['Y_mean'] = np.nan_to_num(np.array([np.nanmean(y) for y in df.Y.values]))
df['Y_std'] = np.nan_to_num(np.array([np.std(y) for y in df.Y.values]))
df['X_peak_idx'] = np.nan_to_num(np.array([np.argmax(x) for x in df.X]))
df['Y_peak_idx'] = np.nan_to_num(np.array([np.argmax(y) for y in df.Y]))
df['X_peak_time_frac'] = df['X_peak_idx'].values / df['lifetime'].values
# df['slope_end'] = df.apply(lambda row: (row['X_max'] - row['X'][-1]) / (row['lifetime'] - row['X_peak_idx']),
# axis=1)
df['X_peak_last_15'] = df['X_peak_time_frac'] >= 0.85
df['X_peak_last_5'] = df['X_peak_time_frac'] >= 0.95
    # hand-engineered features
def calc_rise(x):
'''max change before peak
'''
idx_max = np.argmax(x)
val_max = x[idx_max]
return val_max - np.min(x[:idx_max + 1])
def calc_fall(x):
'''max change after peak
'''
idx_max = np.argmax(x)
val_max = x[idx_max]
return val_max - np.min(x[idx_max:])
def calc_rise_slope(x):
'''slope to max change before peak
'''
idx_max = np.argmax(x)
val_max = x[idx_max]
x_early = x[:idx_max + 1]
idx_min = np.argmin(x_early)
denom = (idx_max - idx_min)
if denom == 0:
return 0
return (val_max - np.min(x_early)) / denom
def calc_fall_slope(x):
'''slope to max change after peak
'''
idx_max = np.argmax(x)
val_max = x[idx_max]
x_late = x[idx_max:]
idx_min = np.argmin(x_late)
denom = idx_min
if denom == 0:
return 0
return (val_max - np.min(x_late)) / denom
def max_diff(x):
return np.max(np.diff(x))
def min_diff(x):
return np.min(np.diff(x))
df['rise'] = df.apply(lambda row: calc_rise(row['X']), axis=1)
df['fall'] = df.apply(lambda row: calc_fall(row['X']), axis=1)
df['rise_extended'] = df.apply(lambda row: calc_rise(row['X_extended']), axis=1)
df['fall_extended'] = df.apply(lambda row: calc_fall(row['X_extended']), axis=1)
df['fall_late_extended'] = df.apply(lambda row: row['fall_extended'] if row['X_peak_last_15'] else row['fall'],
axis=1)
# df['fall_final'] = df.apply(lambda row: row['X'][-3] - row['X'][-1], axis=1)
df['rise_slope'] = df.apply(lambda row: calc_rise_slope(row['X']), axis=1)
df['fall_slope'] = df.apply(lambda row: calc_fall_slope(row['X']), axis=1)
num = 3
df['rise_local_3'] = df.apply(lambda row:
calc_rise(np.array(row['X'][max(0, row['X_peak_idx'] - num):
row['X_peak_idx'] + num + 1])),
axis=1)
df['fall_local_3'] = df.apply(lambda row:
calc_fall(np.array(row['X'][max(0, row['X_peak_idx'] - num):
row['X_peak_idx'] + num + 1])),
axis=1)
num2 = 11
df['rise_local_11'] = df.apply(lambda row:
calc_rise(np.array(row['X'][max(0, row['X_peak_idx'] - num2):
row['X_peak_idx'] + num2 + 1])),
axis=1)
df['fall_local_11'] = df.apply(lambda row:
calc_fall(np.array(row['X'][max(0, row['X_peak_idx'] - num2):
row['X_peak_idx'] + num2 + 1])),
axis=1)
df['max_diff'] = df.apply(lambda row: max_diff(row['X']), axis=1)
df['min_diff'] = df.apply(lambda row: min_diff(row['X']), axis=1)
# imputed feats
d = df[['X_max', 'X_mean', 'lifetime', 'rise', 'fall']]
d = d[df['X_peak_time_frac'] <= 0.8]
# m = RidgeCV().fit(d[['X_max', 'X_mean', 'lifetime', 'rise']], d['fall'])
# fall_pred = m.predict(df[['X_max', 'X_mean', 'lifetime', 'rise']])
# fall_imp = df['fall']
# fall_imp[df['X_peak_time_frac'] > 0.8] = fall_pred[df['X_peak_time_frac'] > 0.8]
# df['fall_imp'] = fall_imp
return df
def extract_X_mat(df):
'''Extract matrix for X filled with zeros after sequences
Width of matrix is length of longest lifetime
'''
p = df.lifetime.max()
n = df.shape[0]
X_mat = np.zeros((n, p)).astype(np.float32)
X = df['X'].values
for i in range(n):
x = X[i]
num_timepoints = min(p, len(x))
X_mat[i, :num_timepoints] = x[:num_timepoints]
X_mat = np.nan_to_num(X_mat)
X_mat -= np.min(X_mat)
X_mat /= np.std(X_mat)
return X_mat
def add_binary_features(df, outcome_def):
    '''Binarize each feature at the midpoint between the two class means
'''
feat_names = data.get_feature_names(df)
threshes = (df[df[outcome_def] == 1].mean() + df[df[outcome_def] == 0].mean()) / 2
for i, k in tqdm(enumerate(feat_names)):
thresh = threshes.loc[k]
df[k + '_binary'] = df[k] >= thresh
return df
def add_dasc_features(df, bins=100, by_cell=True):
"""
add DASC features from Wang et al. 2020 paper
Parameters:
df: pd.DataFrame
bins: int
number of bins
        default value is 100: the intensity level of clathrin is assigned to 100 equal-length bins
        from vmin (min intensity across all tracks) to vmax (max intensity across all tracks)
by_cell: Boolean
whether to do binning within each cell
"""
x_dist = {}
n = len(df)
# gather min and max clathrin intensity within each cell
    if by_cell:
for cell in set(df['cell_num']):
x = []
cell_idx = np.where(df['cell_num'].values == cell)[0]
for i in cell_idx:
x += df['X'].values[i]
x_dist[cell] = (min(x), max(x))
else:
x = []
for i in range(n):
x += df['X'].values[i]
for cell in set(df['cell_num']):
x_dist[cell] = (min(x), max(x))
    # map each clathrin intensity to an integer bin index between 0 and bins
X_quantiles = []
for i in range(n):
r = df.iloc[i]
cell = r['cell_num']
        X_quantiles.append([int(1.0 * bins * (x - x_dist[cell][0]) / (x_dist[cell][1] - x_dist[cell][0])) if not np.isnan(x) else 0 for x in r['X']])
df['X_quantiles'] = X_quantiles
# compute transition probability between different intensities, for different frames
trans_prob = {}
tmax = max([len(df['X_quantiles'].values[i]) for i in range(len(df))])
for t in range(tmax - 1):
int_pairs = []
for i in range(n):
if len(df['X_quantiles'].values[i]) > t + 1:
int_pairs.append([df['X_quantiles'].values[i][t], df['X_quantiles'].values[i][t + 1]])
int_pairs = np.array(int_pairs)
trans_prob_t = {}
for i in range(bins + 1):
x1 = np.where(int_pairs[:,0]== i)[0]
lower_states_num = np.zeros((i, 2))
for j in range(len(int_pairs)):
if int_pairs[j, 0] < i:
lower_states_num[int_pairs[j, 0], 0] += 1
if int_pairs[j, 1] == i:
lower_states_num[int_pairs[j, 0], 1] += 1
lower_prob = [1.*lower_states_num[k, 1]/lower_states_num[k, 0] for k in range(i) if lower_states_num[k, 0] > 0]
trans_prob_t[i] = (np.nanmean(int_pairs[x1,1] < i),
#np.nanmean(int_pairs[x1,1] > i)
sum(lower_prob)
)
trans_prob[t] = trans_prob_t
# compute D sequence
X_d = [[] for i in range(len(df))]
for i in range(len(df)):
for j, q in enumerate(df['X_quantiles'].values[i][:-1]):
probs = trans_prob[j][q]
if 0 < probs[0] and 0 < probs[1]:
X_d[i].append(np.log(probs[0]/probs[1]))
else:
X_d[i].append(0)
# compute features
d1 = [np.mean(x) for x in X_d]
d2 = [np.log(max((np.max(x) - np.min(x))/len(x), 1e-4)) for x in X_d]
d3 = [skew(x) for x in X_d]
df['X_d1'] = d1
df['X_d2'] = d2
df['X_d3'] = d3
return df
def downsample(x, length, padding='end'):
"""
downsample (clathrin) track
Parameters:
==========================================================
x: list
original clathrin track (of different lengths)
    length: int
        length of track after downsampling
    padding: str
        'end' (default) fills zeros at the end of a short track; 'front' fills zeros at the front
Returns:
==========================================================
x_ds: list
downsampled track
"""
    x = np.asarray(x)
    x = x[~np.isnan(x)]  # drop NaN entries before downsampling
n = len(x)
if n >= length:
# if length of original track is greater than targeted length, downsample
        x_ds = [x[int(1.0 * (n - 1) * i / (length - 1))] for i in range(length)]
else:
# if length of original track is smaller than targeted length, fill the track with 0s
if padding == 'front':
x_ds = [0]*(length - len(x)) + list(x)
else:
x_ds = list(x) + [0]*(length - len(x))
return x_ds
def downsample_video(x, length):
"""
downsample video feature in the same way
"""
n = len(x)
if n >= length:
# if length of original track is greater than targeted length, downsample
        time_index = [int(1.0 * (n - 1) * i / (length - 1)) for i in range(length)]
x_ds = x[time_index, :, :]
elif n > 0:
# if length of original track is smaller than targeted length, fill the track with 0s
x_ds = np.vstack((x, np.zeros((length - n, 10, 10))))
else:
        x_ds = np.zeros((length, 10, 10))  # empty track: return all-zero frames of the requested length
return x_ds
def normalize_track(df, track='X_same_length', by_time_point=True):
"""
normalize tracks
"""
df[f'{track}_normalized'] = df[track].values
for cell in set(df['cell_num']):
cell_idx = np.where(df['cell_num'].values == cell)[0]
y = df[track].values[cell_idx]
y = np.array(list(y))
if by_time_point:
df[f'{track}_normalized'].values[cell_idx] = list((y - np.mean(y, axis=0))/np.std(y, axis=0))
else:
df[f'{track}_normalized'].values[cell_idx] = list((y - np.mean(y))/np.std(y))
return df
def normalize_feature(df, feat):
"""
normalize scalar features
"""
df = df.astype({feat: 'float64'})
for cell in set(df['cell_num']):
cell_idx = np.where(df['cell_num'].values == cell)[0]
y = df[feat].values[cell_idx]
#y = np.array(list(y))
df[feat].values[cell_idx] = (y - np.nanmean(y))/np.nanstd(y)
return df
def normalize_video(df, video='X_video'):
"""
normalize videos (different frames are normalized separately)
e.g. to normalize the first frame, we take the first frame of all videos,
flatten and concatenate them into one 1-d array,
and extract the mean and std
"""
df[f'{video}_normalized'] = df[video].values
for cell in set(df['cell_num']):
cell_idx = np.where(df['cell_num'].values == cell)[0]
y = df[video].values[cell_idx]
video_shape = y[0].shape
video_mean, video_std = np.zeros(video_shape), np.zeros(video_shape)
for j in (range(video_shape[0])):
all_frames_j = np.array([y[i][j].reshape(1, -1)[0] for i in range(len(y))]).reshape(1, -1)[0]
video_mean[j] = np.mean(all_frames_j) * np.ones((video_shape[1], video_shape[2]))
video_std[j] = np.std(all_frames_j) * np.ones((video_shape[1], video_shape[2]))
df[f'{video}_normalized'].values[cell_idx] = list((list(y) - video_mean)/(video_std))
return df
Functions
def add_basic_features(df)
-
Add a bunch of extra features to the df based on df.X, df.X_extended, df.Y, df.lifetime
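A minimal usage sketch (assuming src/ is on the path so the module's own imports resolve; the toy tracks and column values below are illustrative, the real pipeline builds X, X_extended, Y, and lifetime elsewhere):
import numpy as np
import pandas as pd
import features  # assumes the src/ directory is on the path

# toy tracks; real tracks come from the tracking pipeline
df = pd.DataFrame({
    'X': [np.array([1., 3., 7., 4., 2.]), np.array([2., 2., 5., 9., 6., 1.])],
    'X_extended': [np.array([0., 1., 3., 7., 4., 2., 1.]), np.array([1., 2., 2., 5., 9., 6., 1., 0.])],
    'Y': [np.array([0., 1., 2., 1., 0.]), np.array([0., 0., 1., 3., 2., 0.])],
    'lifetime': [5, 6],
})
df = features.add_basic_features(df)
print(df[['X_max', 'rise', 'fall', 'X_peak_time_frac']])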
def add_binary_features(df, outcome_def)
-
Binarize each feature at the midpoint between the two class means
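What the thresholding does, sketched on one hypothetical feature (the function applies the same rule to every column returned by data.get_feature_names):
import pandas as pd

# hypothetical single feature with a binary outcome column
df = pd.DataFrame({'X_max': [1.0, 3.0, 5.0, 7.0], 'outcome': [0, 0, 1, 1]})
thresh = (df[df['outcome'] == 1]['X_max'].mean()
          + df[df['outcome'] == 0]['X_max'].mean()) / 2    # (6.0 + 2.0) / 2 = 4.0
df['X_max_binary'] = df['X_max'] >= thresh                 # [False, False, True, True]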
def add_dasc_features(df, bins=100, by_cell=True)
-
add DASC features from Wang et al. 2020 paper
Parameters
df
- pd.DataFrame
bins
- int. Number of bins; the default of 100 means the clathrin intensity is assigned to 100 equal-length bins from vmin (min intensity across all tracks) to vmax (max intensity across all tracks)
by_cell
- Boolean. Whether to compute the binning range (vmin, vmax) within each cell rather than across all cells
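A sketch of the binning step only, with illustrative values (the transition-probability and D-sequence computations follow in the source above):
import numpy as np

bins = 100
x = np.array([0.2, 0.5, np.nan, 1.0])   # one clathrin track
vmin, vmax = 0.2, 1.0                    # per-cell intensity range
quantiles = [int(1.0 * bins * (v - vmin) / (vmax - vmin)) if not np.isnan(v) else 0
             for v in x]                 # -> [0, 37, 0, 100]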
def add_dict_features(df, sc_comps_file='/accounts/projects/vision/chandan/auxilin-prediction/src/../data/interim/dictionaries/sc_12_alpha=1.pkl', nmf_comps_file='/accounts/projects/vision/chandan/auxilin-prediction/src/../data/interim/dictionaries/nmf_12.pkl', use_processed=True)
-
Add features from saved dictionary to df
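A hedged usage sketch, continuing a DataFrame with 'X' and 'lifetime' columns (as in the add_basic_features sketch above): on the first call, or with use_processed=False, the dictionaries are fit on the matrix from extract_X_mat and pickled under config.DIR_INTERIM/dictionaries; later calls reload them and only transform.
import features  # assumes the src/ directory is on the path

df = features.add_dict_features(df, use_processed=False)  # fit and pickle the sc/nmf dictionaries, then encode
df = features.add_dict_features(df)                       # later calls reload the pickled dictionaries
print([c for c in df.columns if c.startswith(('sc_', 'nmf_'))])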
def add_pcs(df)
-
adds 10 pcs based on feature names
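A sketch with hypothetical feature columns (it assumes data.get_feature_names would return them; the PCA is fit on the rows flagged by the boolean 'valid' column and needs at least 10 features and valid rows to fill pc_0..pc_9):
import numpy as np
import pandas as pd
import features  # assumes the src/ directory is on the path

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(50, 12)),
                  columns=[f'feat_{i}' for i in range(12)])  # hypothetical feature columns
df['valid'] = True                                           # PCA is fit on the valid rows only
df = features.add_pcs(df)
print(df[[f'pc_{i}' for i in range(10)]].head())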
def add_smoothed_splines(df, method='spline', s_spl=0.004)
-
Fit a smoothing spline to each track in df['X'] and add the smoothed values, their derivatives, and related summary features
def add_trend_filtering(df)
-
Apply first-order trend filtering to each track in df['X'], recompute features on the smoothed tracks, and add them with the suffix '_tf_smooth'
def downsample(x, length, padding='end')
-
downsample (clathrin) track
Parameters
x
- list. Original clathrin track (of variable length)
length
- int. Length of track after downsampling
padding
- str. 'end' (default) pads zeros at the end of a short track; 'front' pads at the front
Returns
x_ds
- list. Downsampled track
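A worked example of the index mapping: for n = 6 original points downsampled to length = 4, the selected indices are int((n - 1) * i / (length - 1)) for i = 0..3, i.e. [0, 1, 3, 5]; shorter tracks are zero-padded instead.
import features  # assumes the src/ directory is on the path

features.downsample([1, 2, 3, 4, 5, 6], length=4)         # -> [1, 2, 4, 6] (indices 0, 1, 3, 5)
features.downsample([1, 2, 3], length=5)                   # -> [1, 2, 3, 0, 0]
features.downsample([1, 2, 3], length=5, padding='front')  # -> [0, 0, 1, 2, 3]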
def downsample_video(x, length)
-
downsample video feature in the same way
def extract_X_mat(df)
-
Extract matrix for X filled with zeros after sequences Width of matrix is length of longest lifetime
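A small self-contained sketch of the output shape (illustrative values): tracks shorter than the longest lifetime are zero-padded on the right, then the whole matrix is shifted to be non-negative and scaled by its global standard deviation.
import numpy as np
import pandas as pd
import features  # assumes the src/ directory is on the path

df = pd.DataFrame({'X': [np.array([1., 2., 3.]), np.array([4., 5.])],
                   'lifetime': [3, 2]})
X_mat = features.extract_X_mat(df)
print(X_mat.shape)  # (2, 3): one zero-padded row per track, width = max lifetime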
def normalize_feature(df, feat)
-
normalize scalar features
def normalize_track(df, track='X_same_length', by_time_point=True)
-
normalize tracks
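Usage sketch with a toy DataFrame (assumes each row stores a fixed-length track in 'X_same_length' plus a 'cell_num' column; tracks are z-scored within each cell, either per time point or over all values):
import numpy as np
import pandas as pd
import features  # assumes the src/ directory is on the path

df = pd.DataFrame({
    'cell_num': [1, 1, 2, 2],
    'X_same_length': [np.ones(40), 2 * np.ones(40), np.zeros(40), 4 * np.ones(40)],
})
df = features.normalize_track(df, track='X_same_length', by_time_point=True)
# adds 'X_same_length_normalized': each time point is z-scored across tracks from the same cell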
def normalize_video(df, video='X_video')
-
normalize videos (different frames are normalized separately)
e.g. to normalize the first frame, we take the first frame of all videos, flatten and concatenate them into one 1-d array, and extract the mean and std
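A sketch of the per-frame statistics described above, with hypothetical shapes (each video is frames x 10 x 10; frame j of every video in a cell contributes to one mean/std pair) rather than a call to the function itself:
import numpy as np

videos = [np.random.rand(40, 10, 10) for _ in range(5)]   # hypothetical: 5 videos from one cell
all_first_frames = np.concatenate([v[0].ravel() for v in videos])
mu, sigma = all_first_frames.mean(), all_first_frames.std()
first_frame_normalized = (videos[0][0] - mu) / sigma       # what normalize_video computes for frame 0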