Добавил:
darkwarius13@gmail.com. Рад, если помог :). Можешь на почту спасибо сказать. Опубликованный материал нарушает ваши авторские права? Сообщите нам.
Вуз: Предмет: Файл:

lab_4_Vika

.py
Скачиваний:
4
Добавлен:
27.06.2021
Размер:
10.82 Кб
Скачать
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
from seaborn import scatterplot as scatter
from sklearn.manifold import TSNE
import copy
import seaborn as sn
#Lab 4
#%%

# Load the Wisconsin breast-cancer worksheet; the file has no header row,
# so the columns are the integers 0..9 (column 9 holds the diagnosis code).
df_lab_4 = pd.read_excel('breast-cancer-wisconsin.xlsx', header=None)

# Split into features (all columns but the last) and the diagnosis labels.
feature_cols = df_lab_4.columns[:-1].values.tolist()
x = df_lab_4.loc[:, feature_cols].values
y = df_lab_4.loc[:, [9]].values

# Zero-mean / unit-variance scaling of the feature matrix.
df_stan_4 = StandardScaler().fit_transform(x)


#PCA TSNE Origin Data
#%%
from mpl_toolkits.mplot3d import Axes3D

# Embed the standardized features into 3 t-SNE dimensions; only the first
# two are drawn here, the third is reused by the 3-D plot below.
tsne = TSNE(n_components=3, perplexity=50, n_iter=1700, random_state=23,
            learning_rate=100, n_jobs=-1)
x_2D = tsne.fit_transform(df_stan_4)

tsneDf = pd.DataFrame(data=x_2D, columns=['dim 1', 'dim 2', 'dim 3'])
final_tsneDf = pd.concat([tsneDf, df_lab_4[[9]]], axis=1)

# Scatter of the first two embedding dimensions, coloured by diagnosis code
# (2 in red, 4 in green — matching target_names used later in the report).
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('dim 1', fontsize=15)
ax.set_ylabel('dim 2', fontsize=15)
ax.set_title('2D TSNE', fontsize=20)
targets = [2, 4]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    mask = final_tsneDf[9] == target
    ax.scatter(final_tsneDf.loc[mask, 'dim 1'],
               final_tsneDf.loc[mask, 'dim 2'],
               c=color, s=30)
ax.legend(targets)
ax.grid()

#3D TSNE
#%%
# Render all three t-SNE dimensions, coloured by diagnosis code.
# FIX: constructing Axes3D(fig, ...) directly is deprecated and since
# matplotlib 3.7 no longer attaches the axes to the figure (the window
# comes up blank); add_subplot(projection='3d') is the supported API.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=-170, azim=80)
ax.set_xlabel('dim 1', fontsize=15)
ax.set_ylabel('dim 2', fontsize=15)
ax.set_zlabel('dim 3', fontsize=15)
ax.set_title('t-SNE', fontsize=20)
targets = [2, 4]
colors = ['r', 'g']  # one colour per target (the stray 'b' entry was never reached by zip)
for target, color in zip(targets, colors):
    indicesToKeep = final_tsneDf[9] == target
    ax.scatter(final_tsneDf.loc[indicesToKeep, 'dim 1'],
               final_tsneDf.loc[indicesToKeep, 'dim 2'],
               final_tsneDf.loc[indicesToKeep, 'dim 3'],
               c=color, alpha=1,
               s=50)
ax.legend(targets)
ax.grid()

#PCA 2d
#%%

# Project the standardized data onto its first three principal components
# (the third component is reused by the 3-D view below).
pca = PCA(n_components=3, svd_solver='full', random_state=22)
principalComponents = pca.fit_transform(df_stan_4)

principalDf = pd.DataFrame(
    data=principalComponents[:, 0:3],
    columns=['principal component 1', 'principal component 2',
             'principal component 3'])
finalDf = pd.concat([principalDf, df_lab_4[[9]]], axis=1)

# 2-D scatter of PC1 vs PC2, coloured by diagnosis code.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)
for target, color in [(2, 'r'), (4, 'g')]:
    mask = finalDf[9] == target
    ax.scatter(finalDf.loc[mask, 'principal component 1'],
               finalDf.loc[mask, 'principal component 2'],
               c=color, s=50)
ax.legend([2, 4])
ax.grid()

#PCA 3d
#%%

# 3-D scatter of the first three principal components.
# FIX: Axes3D(fig, ...) is deprecated and, on matplotlib >= 3.7, the axes
# is never attached to the figure; use the projection keyword instead.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=-160, azim=60)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)
ax.set_title('3 component PCA', fontsize=20)
targets = [2, 4]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf[9] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               finalDf.loc[indicesToKeep, 'principal component 3'],
               c=color, alpha=1, edgecolor='k',
               s=50)
ax.legend(targets)
ax.grid()

#KMeans
#%%

# Cluster the standardized features into two groups and compare the
# cluster assignment against the true diagnoses.
kmeans = KMeans(n_clusters = 2, init = 'k-means++', max_iter=300, n_init=10, random_state = 41)
y_kmeans = kmeans.fit_predict(df_stan_4)

# Remap the diagnosis codes 2/4 to 0/1 so they are directly comparable
# with the 0/1 cluster ids produced by KMeans.
y_k = copy.copy(y)
y_k[y == 2] = 0
y_k[y == 4] = 1

expected = y_k          # ground-truth labels, shape (n_samples, 1)
predicted = y_kmeans    # KMeans cluster ids, shape (n_samples,)

# Label-permutation-invariant clustering scores.
print('Adjusted rand score:',metrics.adjusted_rand_score(expected[:,0], predicted))
print('Adjusted mutual info score:',metrics.adjusted_mutual_info_score(expected[:,0], predicted))

# NOTE(review): the classification report below assumes cluster id 0
# corresponds to class 0 — KMeans assigns ids arbitrarily, so with a
# different random_state the report/confusion matrix could come out
# inverted. Confirm the mapping before trusting per-class numbers.
print('\n\nClassification report:')
target_names = ['доброкачественная', 'злокачественная']
print(metrics.classification_report(expected, predicted, target_names=target_names))

confusionMatrics = metrics.confusion_matrix(expected, predicted)
print('\n\nConfusion Matrix')
#visualizing confusion matrix
sn.set(font_scale=1.4) # for label size
sn.heatmap(confusionMatrics, annot=True, annot_kws={"size": 16}) # font size
plt.show()


#KMeans PCA + Origin Data
#%%

# Project the KMeans centroids into a 2-D PCA space and draw the true
# labels (left) next to the KMeans assignment with centroids (right).
pca = PCA(n_components=2, svd_solver='full', random_state=55)
pca_centers = pca.fit_transform(kmeans.cluster_centers_)

f, axes = plt.subplots(1, 2, figsize=(15, 7))
pc1 = finalDf.loc[:, 'principal component 1']
pc2 = finalDf.loc[:, 'principal component 2']
scatter(pc1, pc2, ax=axes[0], hue=y_k[:, 0])
scatter(pc1, pc2, ax=axes[1], hue=y_kmeans)
axes[0].set_title('Origin data')
axes[1].set_title('KMeans')
scatter(pca_centers[:, 0], pca_centers[:, 1], ax=axes[1], marker="s", s=200)
plt.show()

#DBSCAN
#%%
# Density-based clustering on the standardized features.
# FIX: DBSCAN was used without ever being imported, which raises a
# NameError at runtime; import it here alongside its use.
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=1.85, min_samples=4)
dbscan.fit(df_stan_4)

# DBSCAN labels noise points as -1, so `predicted` may contain labels the
# ground truth never has; zero_division=1 below substitutes 1 for the
# otherwise-undefined precision/recall of such labels.
predicted = dbscan.labels_
print('Adjusted rand score:',metrics.adjusted_rand_score(expected[:,0], predicted))
print('Adjusted mutual info score:',metrics.adjusted_mutual_info_score(expected[:,0], predicted))

print('\n\nClassification report:')
print(metrics.classification_report(expected, predicted, zero_division =1))

print('\n\nConfusion Matrix')
#visualizing confusion matrix
confusionMatrics = metrics.confusion_matrix(expected, predicted)
sn.set(font_scale=1.4) # for label size

sn.heatmap(confusionMatrics, annot=True, annot_kws={"size": 16}) # font size
plt.show()

#DBSCAN PCA
#%%

# Side-by-side PCA view: true diagnoses (left) vs DBSCAN labels (right).
f, axes = plt.subplots(1, 2, figsize=(15, 7))
pc1 = finalDf.loc[:, 'principal component 1']
pc2 = finalDf.loc[:, 'principal component 2']
scatter(pc1, pc2, ax=axes[0], hue=y_k[:, 0])
scatter(pc1, pc2, ax=axes[1], hue=dbscan.labels_)
plt.show()

#Function for metrics
#%%
def getMetrics(methodName, predictedData):
    """Print clustering-quality scores for one method.

    Compares ``predictedData`` against the module-level ``expected``
    ground-truth labels; the silhouette score additionally uses the
    standardized feature matrix ``df_stan_4``.
    """
    truth = expected[:, 0]
    print('-------------------------------------', methodName,
          '-------------------------------------')
    print('adjusted_rand_score: ', metrics.adjusted_rand_score(truth, predictedData))
    print('adjusted_mutual_info_score: ', metrics.adjusted_mutual_info_score(truth, predictedData))
    print('homogeneity_score: ', metrics.homogeneity_score(truth, predictedData))
    print('completeness_score: ', metrics.completeness_score(truth, predictedData))
    print('v_measure_score: ', metrics.v_measure_score(truth, predictedData))
    print('silhouette_score: ', metrics.silhouette_score(df_stan_4, predictedData))
    
def getErrorScores(predictedData, modelName):
    """Print regression-style error scores of ``predictedData`` against the
    module-level ``Y_test`` hold-out labels.

    FIX: mean_absolute_error / mean_squared_error / r2_score were referenced
    as bare names but never imported (NameError on first call); they are now
    reached through the already-imported ``sklearn.metrics`` module.
    """
    predictedData = predictedData.flatten()
    actual = Y_test.flatten()
    print('---------------------- ', modelName, '---------------------')
    print('mean_absolute_error: ', metrics.mean_absolute_error(actual, predictedData) )
    print('mean_squared_error: ',  metrics.mean_squared_error(actual, predictedData) )
    print('r2_score: ', metrics.r2_score(actual, predictedData))
    
def showCR(cr_data, cr_name):
    """Print a per-class classification report of ``cr_data`` against the
    module-level ``Y_test`` hold-out labels."""
    flat_predictions = cr_data.ravel()
    names = ['доброкачественная', 'злокачественная']
    print(cr_name, ' --- classification_report \n')
    print(metrics.classification_report(Y_test.flatten(), flat_predictions, target_names=names))
    
def showMatrix(matrix):
    """Render *matrix* (a confusion matrix) as an annotated seaborn heatmap."""
    sn.set(font_scale=1.4)  # enlarge tick/label fonts
    sn.heatmap(matrix, annot=True, annot_kws={"size": 16})  # annotation font size
    plt.show()
   
def showPCA(data_right, title_right):
    """Draw two 2-D PCA scatters of the whole data set side by side:
    true diagnoses on the left, ``data_right`` (a model's predictions,
    titled ``title_right``) on the right."""
    components = PCA(n_components=2).fit_transform(df_stan_4)
    frame = pd.DataFrame(data=components[:, 0:2],
                         columns=['principal component 1', 'principal component 2'])
    frame = pd.concat([frame, pd.DataFrame(data=y)], axis=1)
    f, axes = plt.subplots(1, 2, figsize=(15, 7))
    pc1 = frame.loc[:, 'principal component 1']
    pc2 = frame.loc[:, 'principal component 2']
    scatter(pc1, pc2, ax=axes[0], hue=y[:, 0])
    scatter(pc1, pc2, ax=axes[1], hue=data_right)
    axes[0].set_title('Diagnoses')
    axes[1].set_title(title_right)
    plt.show()
    
#split (65% and 35%)
#%%
from sklearn.model_selection import train_test_split
# Hold out 35% of the standardized samples for testing; random_state pins
# the shuffle so the split is reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(df_stan_4, y, test_size=0.35, random_state=22)


#KNeighborsClassifier
#%%

# Fit a 3-nearest-neighbour classifier on the training split and inspect
# its hold-out performance.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train.flatten())
knn_predictions = knn.predict(X_test)
knn_matrix = metrics.confusion_matrix(Y_test.flatten(), knn_predictions.flatten())

#show classification report
showCR(knn_predictions, 'KNeighborsClassifier')
getErrorScores(knn_predictions, 'KNeighborsClassifier')

#visualizing confusion matrix
showMatrix(knn_matrix)

# Predict every sample (train + test) and visualize in PCA space.
showPCA(knn.predict(df_stan_4).ravel(), "KNeighborsClassifier")

#MLPClassifier
#%%

# Train a multilayer-perceptron classifier with default hyper-parameters
# and inspect its hold-out performance.
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
mlp.fit(X_train, Y_train.flatten())
mlp_predictions = mlp.predict(X_test)
mlp_matrix = metrics.confusion_matrix(Y_test.flatten(), mlp_predictions.flatten())

#show classification report
showCR(mlp_predictions, 'MLPClassifier')
getErrorScores(mlp_predictions, 'MLPClassifier')

#visualizing confusion matrix
showMatrix(mlp_matrix)

# Predict every sample (train + test) and visualize in PCA space.
showPCA(mlp.predict(df_stan_4).ravel(), "MLPClassifier")

#DecisionTreeClassifier
#%%

# Train a depth-limited decision tree and inspect its hold-out performance.
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(max_depth=5)
tree_clf.fit(X_train, Y_train.flatten())
tree_predictions = tree_clf.predict(X_test)
tree_matrix = metrics.confusion_matrix(Y_test.flatten(), tree_predictions.flatten())

#show classification report
showCR(tree_predictions, 'DecisionTreeClassifier')
getErrorScores(tree_predictions, 'DecisionTreeClassifier')

#visualizing confusion matrix
showMatrix(tree_matrix)

# Predict every sample (train + test) and visualize in PCA space.
showPCA(tree_clf.predict(df_stan_4).ravel(), "DecisionTreeClassifier")

Соседние файлы в предмете Интеллектуальная обработка данных в распределенных информационных средах