import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
import scipy.stats as ss
import scikit_posthocs as sp
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
# Load the "Regression B" export. After reset_index() the first row of the
# frame holds the real column names, so it is promoted to the header and then
# dropped from the data.
regression_B_link = 'regression_data/inc_analysis_regression_1/Data for Regression B-Table 1.csv'
df_clusters = pd.read_csv(regression_B_link).reset_index()
df_clusters.columns = df_clusters.iloc[0].values
# Take an explicit copy: the column assignment below would otherwise write
# into a slice of the original frame and raise SettingWithCopyWarning.
df_clusters = df_clusters.iloc[1:].copy()
df_clusters.head()
df_clusters['Epoch'] = pd.to_numeric(df_clusters['Epoch'])
df_clusters['Epoch'].describe()
# Feature matrix used for clustering. Copied so later column additions never
# write back into (or warn about) df_clusters.
# NOTE(review): the feature columns are left exactly as parsed (presumably
# text, since the header row was part of the data) - later cells compare them
# against the string '0', so they are deliberately not converted here.
df_final = df_clusters[['InclusionDistanceAvg', 'InclusionDistanceStdDev', 'NoOfCommittees',
                        'AttesterSlashings', 'VoluntaryExits', 'MissingBlocks']].copy()
df_final.head()
# Baseline clustering with k = 5 and its silhouette score.
k = 5
# Seeded like the final model further down so the result is reproducible.
k_means = KMeans(n_clusters=k, random_state=42)
k_means_fit = k_means.fit(df_final)
# Reuse the labels of the single fit above. Calling fit_predict() again would
# refit the model, so the labels shown and the labels scored could come from
# different clusterings.
baseline_labels = k_means_fit.labels_
pd.DataFrame(baseline_labels)
silhouette_score(df_final, baseline_labels)
# Sweep k over 3..29 and record the silhouette score of each clustering.
r = {
    'k': [],
    'silhouette_score': [],
}
for k in range(3, 30):
    # Same seed as the final model so the sweep evaluates the clustering that
    # is actually used afterwards.
    k_means = KMeans(n_clusters=k, random_state=42)
    # One fit per k: fit_predict() both fits and returns the labels, instead
    # of fitting once and then refitting just to obtain the labels.
    sweep_labels = k_means.fit_predict(df_final)
    k_means_fit = k_means
    r['k'].append(k)
    r['silhouette_score'].append(silhouette_score(df_final, sweep_labels))
    print("Fitted ", k)
r
# Line chart of silhouette score against cluster count, written to
# cluster_scores_per_k.png.
swept_ks = r['k']
swept_scores = r['silhouette_score']
plt.figure(figsize=(20, 15))
cluster_scores = sns.lineplot(x=swept_ks, y=swept_scores, linewidth=5)
cluster_scores.tick_params(labelsize=15)
cluster_scores.set_ylabel("Silhouette Score", fontsize=20)
cluster_scores.set_xlabel("K cluster", fontsize=20)
cluster_scores.axes.set_title("Cluster count (k) vs Silhouette Score ", fontsize=40)
plt.savefig('cluster_scores_per_k.png')
# Raw values, shown for inspection.
r['silhouette_score']
r['k']
# Final model: 9 clusters was judged optimal from the silhouette sweep.
k_means = KMeans(n_clusters=9, random_state=42)
k_means_fit = k_means.fit(df_final)
k_means_fit.labels_
# Work on a copy: assigning the label column through an alias of df_final
# would add 'clusters' to the feature matrix itself, so any re-run of the
# clustering cells would cluster on the previous labels as well.
df_labeled = df_final.copy()
df_labeled['clusters'] = k_means_fit.labels_
df_labeled.head()
df_labeled.info()
# Drop cluster labels 7 and 8 (labels are zero-based, i.e. the 8th and 9th
# clusters). Per the original analysis note, each held fewer than 30 rows.
mask = ~df_labeled['clusters'].isin([7, 8])
# Copy so the dtype change below is applied to an independent frame rather
# than to a filtered view of the previous one.
df_labeled = df_labeled[mask].copy()
df_labeled
df_labeled['clusters'].value_counts()
# Cluster labels are categorical from here on, so store them as strings.
df_labeled['clusters'] = df_labeled['clusters'].astype(str)
df_labeled.head()
import plotly.express as px
import plotly.offline as pyo

# Interactive 3D scatter of the labelled rows, coloured by cluster and
# exported as a standalone HTML file.
pyo.init_notebook_mode()
scatter_axes = {
    'x': 'InclusionDistanceAvg',
    'y': 'InclusionDistanceStdDev',
    'z': 'MissingBlocks',
}
fig = px.scatter_3d(
    df_labeled,
    color='clusters',
    size_max=10,
    opacity=.3,
    **scatter_axes,
)
# Remove all outer margins so the plot fills the canvas.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.write_html("inc_distance_plotplot.html")
df_labeled.columns
# Dunn post-hoc tests (Bonferroni-adjusted) of each metric against the
# cluster labels:
#   InclusionDistanceAvg, InclusionDistanceStdDev, AttesterSlashings,
#   VoluntaryExits, MissingBlocks
# NOTE(review): NoOfCommittees is one of the clustering features but is left
# out of the tests; the reason is not recorded here.
#
# Each two-column frame is an explicit copy so the numeric conversion below
# modifies it alone and does not raise SettingWithCopyWarning against
# df_labeled.
df_inc_dist_avg = df_labeled[['InclusionDistanceAvg', 'clusters']].copy()
df_inc_dist_stdev = df_labeled[['InclusionDistanceStdDev', 'clusters']].copy()
df_att_slashings = df_labeled[['AttesterSlashings', 'clusters']].copy()
df_vol_exits = df_labeled[['VoluntaryExits', 'clusters']].copy()
df_missing_blocks = df_labeled[['MissingBlocks', 'clusters']].copy()
dfs = [df_inc_dist_avg, df_inc_dist_stdev, df_att_slashings, df_vol_exits, df_missing_blocks]

# Make sure the leading (value) column of every frame is numeric. Iterating
# the list directly avoids a hard-coded frame count.
for metric_df in dfs:
    value_col = metric_df.columns[0]
    metric_df[value_col] = pd.to_numeric(metric_df[value_col])

# One matrix of pairwise adjusted p-values per metric.
dunn_test_results_raw = [
    sp.posthoc_dunn(
        metric_df,
        val_col=metric_df.columns[0],
        group_col=metric_df.columns[1],
        p_adjust='bonferroni',
    )
    for metric_df in dfs
]
len(dunn_test_results_raw)
# Turn each p-value matrix into 0/1 flags: 1 where p <= 0.01, else 0
# (NaN compares as False and therefore becomes 0).
# The loop variable is deliberately not called `r`: that name holds the
# silhouette sweep results, and rebinding it here would break a re-run of the
# silhouette plotting code.
dunn_test_results_modified = [
    {
        column: (p_values[column] <= 0.01).astype(int).tolist()
        for column in p_values.columns
    }
    for p_values in dunn_test_results_raw
]
# Same structure as before: one frame per metric, columns named after the
# compared groups, rows in the original row order with a default integer index.
dunn_test_results_modified_dfs = [pd.DataFrame(flags) for flags in dunn_test_results_modified]
dunn_test_results_modified_dfs[0]
# Save one CSV of significance flags per metric, named after the metric
# column of the corresponding input frame.
for metric_df, flags_df in zip(dfs, dunn_test_results_modified_dfs):
    # Label the rows with the same group names as the columns (the matrix is
    # square, groups x groups). Passing index= to the DataFrame constructor
    # would REINDEX the integer-indexed frame to those labels and write a
    # file full of NaN, so the index is assigned on a copy instead.
    labelled_flags = flags_df.copy()
    labelled_flags.index = labelled_flags.columns
    labelled_flags.to_csv(f'dunn_test_clusters_output_{metric_df.columns[0]}.csv')
df_labeled.head()


def _cluster_stat_tables(frames, stat):
    """Return one per-cluster summary table for each frame in *frames*.

    Each input frame has the metric in its first column and a 'clusters'
    column. The result for a frame has the columns ['clusters', <metric>],
    holding the *stat* ('median', 'mean' or 'std') of the metric per cluster,
    sorted by that statistic in descending order. Ties are broken by cluster
    label, also descending, so the row order is deterministic. The integer
    index produced by reset_index() is kept (not renumbered) after sorting.
    """
    tables = []
    for frame in frames:
        value_col = frame.columns[0]
        summary = frame.groupby('clusters').agg(stat).reset_index()
        # Sort once on the statistic itself. The previous code first sorted
        # on the input frame's second column (the cluster label) and only
        # reached the intended order through a second, separate pass.
        tables.append(summary.sort_values([value_col, 'clusters'], ascending=False))
    return tables


median_per_cluster_dfs = _cluster_stat_tables(dfs, 'median')
avgs_per_cluster_dfs = _cluster_stat_tables(dfs, 'mean')
stdev_per_cluster_dfs = _cluster_stat_tables(dfs, 'std')
median_per_cluster_dfs
median_per_cluster_dfs[1].columns
# Box plots: distribution of each metric per cluster, with clusters ordered
# by descending median of that metric. One PNG per metric under boxplots/.
import os

sns.set_style(style='white')
# savefig() does not create missing directories.
os.makedirs('boxplots', exist_ok=True)
for metric_df, median_df in zip(dfs, median_per_cluster_dfs):
    value_col, group_col = metric_df.columns[0], metric_df.columns[1]
    # `order` must contain the category values found in the x column, i.e.
    # the string cluster labels. The integer row index of the median table
    # does not match those strings and would leave every box empty.
    cluster_order = list(median_df[group_col])
    plt.figure(figsize=(25, 18))
    a = sns.boxplot(x=group_col, y=value_col, data=metric_df, order=cluster_order)
    a.set_xlabel(group_col.capitalize(), fontsize=35)
    a.set_ylabel(value_col.capitalize(), fontsize=35)
    a.tick_params(labelsize=35)
    plt.savefig(f'boxplots/boxplot_{value_col}_{group_col}.png')
# Bar plots of the per-cluster median, mean and standard deviation of every
# metric. One PNG per metric and statistic under barplots/.
import os


def _save_cluster_barplots(stat_frames, title_prefix, file_tag, label_size=35, tick_size=35):
    """Draw and save one bar plot per summary table in *stat_frames*.

    Each table has the cluster label in its first column and the statistic
    in its second; bars are drawn in the table's row order. *title_prefix*
    starts the plot title, *file_tag* is embedded in the output file name,
    and *label_size* / *tick_size* set the axis-label and tick font sizes.
    """
    # savefig() does not create missing directories.
    os.makedirs('barplots', exist_ok=True)
    for stat_df in stat_frames:
        group_col, value_col = stat_df.columns[0], stat_df.columns[1]
        plt.figure(figsize=(25, 18))
        # `order` takes the cluster labels themselves (strings, in the
        # table's row order). The table's integer index does not match the
        # x values and would produce empty bars.
        ax = sns.barplot(x=group_col, y=value_col, data=stat_df,
                         order=list(stat_df[group_col]))
        ax.set_title(f"{title_prefix} {value_col} per Cluster", fontsize=40)
        ax.set_xlabel(group_col.capitalize(), fontsize=label_size)
        ax.set_ylabel(value_col.capitalize(), fontsize=label_size)
        ax.tick_params(labelsize=tick_size)
        plt.savefig(f'barplots/barplot_{file_tag}_{group_col}_{value_col}.png')


sns.set_style(style='white')
_save_cluster_barplots(median_per_cluster_dfs, "Median", "median")
_save_cluster_barplots(avgs_per_cluster_dfs, "Mean", "mean")
# The standard-deviation plots use smaller label and tick fonts.
_save_cluster_barplots(stdev_per_cluster_dfs, "Standard Deviation of", "stdev",
                       label_size=25, tick_size=20)
# Inspect the two sparse event counts and plot histograms of their non-zero
# values.
df_labeled.shape
df_labeled['VoluntaryExits'].value_counts()
df_labeled['AttesterSlashings'].value_counts()
# NOTE(review): zeros are detected by comparing against the string '0', which
# presumes these columns still hold text in df_labeled - confirm.
has_slashings = df_labeled['AttesterSlashings'] != '0'
has_exits = df_labeled['VoluntaryExits'] != '0'
# Rows where both event counts are non-zero.
df_labeled_no_zeroes = df_labeled[has_slashings & has_exits]
df_labeled_no_zeroes.head()
df_att_slashings_no_zeros = df_labeled[has_slashings][['AttesterSlashings', 'clusters']]
df_vol_exits_no_zeros = df_labeled[has_exits][['VoluntaryExits', 'clusters']]
# Plot only the metric column, converted to numbers. Passing the whole
# two-column frame would also histogram the cluster labels. Each histogram
# gets its own figure so it is not drawn on top of a previously open one.
plt.figure()
sns.distplot(pd.to_numeric(df_att_slashings_no_zeros['AttesterSlashings']), kde=False)
plt.figure()
sns.distplot(pd.to_numeric(df_vol_exits_no_zeros['VoluntaryExits']), kde=False)
df_labeled.head()
df_labeled.columns
# Multi-distribution plots: one density curve per remaining cluster (0-6),
# for each of three metrics, each saved to its own PNG.
_CLUSTER_COLORS = (
    ('0', "dodgerblue"),
    ('1', "orange"),
    ('2', "deeppink"),
    ('3', "red"),
    ('4', "blue"),
    ('5', "purple"),
    ('6', "green"),
)


def _plot_metric_across_clusters(metric, title, outfile):
    """Overlay the per-cluster density curves of *metric* and save the figure.

    Reads the module-level df_labeled. Curves are drawn in cluster order with
    the fixed colours above; *title* is the plot title and *outfile* the PNG
    path.
    """
    plt.subplots(figsize=(12, 8), dpi=100)
    for cluster_id, curve_color in _CLUSTER_COLORS:
        in_cluster = df_labeled.clusters == cluster_id
        sns.distplot(df_labeled[metric].loc[in_cluster], color=curve_color,
                     label=f"Cluster {cluster_id}", hist=False)
    plt.title(title)
    plt.legend()
    plt.savefig(outfile)


_plot_metric_across_clusters('InclusionDistanceAvg',
                             'Inclusion Distance Avg Across Clusters',
                             'inc_dist_avg_across_cluster_distributions.png')
_plot_metric_across_clusters('InclusionDistanceStdDev',
                             'Inclusion Distance Standard Deviation Across Clusters',
                             'inc_dist_std_across_cluster_distributions.png')
_plot_metric_across_clusters('MissingBlocks',
                             'Missing Blocks Across Clusters',
                             'missing_blocks_across_cluster_distributions.png')