# лаб 1 / V_lb1.py
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA
from seaborn import scatterplot as scatter
from sklearn.manifold import TSNE

# Check whether a value is NaN (only floats can actually hold NaN)
def isNa(value):
    return isinstance(value, float) and math.isnan(value)

# Drop every column whose share of NaN values exceeds maxNaPercents percent
# (note: despite the name, the function filters columns, not rows)
def dropNaRowByPercent(df, maxNaPercents):
    for y, colObject in df.items():
        countOfNa = 0
        countOfRows = len(colObject)
        for key in colObject:
            if isNa(key):
                countOfNa += 1
        percentNaInCol = (countOfNa / countOfRows) * 100
        if percentNaInCol > maxNaPercents:
            # print('deleting', colObject.name)
            df = df.drop(colObject.name, axis=1)
    return df

# Impute a value for the NaN cell at (naPosX, naPosY) by inverse-distance
# weighting: complete rows that are closer to the incomplete row contribute
# more of their naPosY value to the estimate.
def generateValue(naPosX, naPosY, rowCount):
    distances = []
    for x, naFreeRow in df_dropNa.iterrows():
        res = 0
        for y, value in df_res.loc[naPosX].items():
            if not isNa(value) and isinstance(value, (int, float)):
                res += abs(naFreeRow[y] - value)
        if res == 0:
            # Edge-case guard: an identical complete row exists, so return its
            # value directly instead of dividing by a zero distance below
            return naFreeRow[naPosY]
        distances.append(res / rowCount)

    inverseDistancesSum = 0
    for distance in distances:
        inverseDistancesSum += 1 / distance

    affiliationLevels = []
    for distance in distances:
        affiliationLevels.append((1 / distance) / inverseDistancesSum)

    naValue = 0
    iterator = 0
    for x, value in df_dropNa[naPosY].items():
        naValue += value * affiliationLevels[iterator]
        iterator += 1
    return naValue

#%%
file = pd.ExcelFile('Dataset.xlsx')
df = pd.read_excel(file, sheet_name='Лист1', header=1)
df_res = dropNaRowByPercent(df, 4)
df_dropNa = df_res.dropna()
resultData = df_res.copy()

# Fill every remaining NaN cell with an inverse-distance-weighted estimate
for x, row in df_res.iterrows():
    for y, value in row.items():
        if isNa(value):
            resultData.loc[x, y] = generateValue(x, y, len(df_dropNa.index) + 1)
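
#%%
# Optional sanity check (a small sketch, not part of the original lab): report
# how many NaN cells are still left after imputation (the cells below call
# dropna() again, so any remainder is simply dropped there).
print('NaN cells remaining after imputation:', resultData.isna().sum().sum())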

# lab_2
#%%
featureNames = resultData.columns[1:].values.tolist()  # feature column names (not used further)
mainData = resultData.dropna().drop(columns=['№иб', 'Діагноз']).values
diagnoses = resultData.dropna().loc[:, ['Діагноз']].replace({'ХОЗЛ': 2, 'БА': 1, 'Пневмонія': 0}).values
x_stan = StandardScaler().fit_transform(mainData)
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(x_stan)

print(y_kmeans)
expected = diagnoses
predicted = y_kmeans
# Note: k-means cluster indices are arbitrary, so this report is meaningful
# only if the cluster numbering happens to coincide with the diagnosis codes
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
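
#%%
# Hedged sketch (not in the original lab): because the k-means labels 0..2 are
# an arbitrary permutation of the diagnosis codes, the report above can be made
# comparable by remapping each cluster to its best-matching diagnosis through
# the confusion matrix (Hungarian assignment). Requires scipy, which ships as a
# scikit-learn dependency.
from scipy.optimize import linear_sum_assignment

cm = metrics.confusion_matrix(expected, predicted)
diagIdx, clusterIdx = linear_sum_assignment(-cm)   # maximize matched counts
mapping = {cluster: diagnosis for diagnosis, cluster in zip(diagIdx, clusterIdx)}
remapped = [mapping[label] for label in predicted]
print(metrics.classification_report(expected, remapped))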

#PCA diagnose + kmeans
#%%
pca = PCA(n_components=2)
pca_diagnosis = pca.fit_transform(x_stan)
principalDf = pd.DataFrame(data=pca_diagnosis, columns=['principal component 1', 'principal component 2'])

finalDf = pd.concat([principalDf, pd.DataFrame(data=diagnoses)], axis=1)
# Project the k-means centers (found in standardized feature space) into the PCA plane
centers = pca.transform(kmeans.cluster_centers_)
f, axes = plt.subplots(1, 2, figsize=(15, 7))
scatter(x=finalDf.loc[:, 'principal component 1'], y=finalDf.loc[:, 'principal component 2'], ax=axes[0], hue=diagnoses[:, 0])
scatter(x=finalDf.loc[:, 'principal component 1'], y=finalDf.loc[:, 'principal component 2'], ax=axes[1], hue=y_kmeans)
scatter(x=centers[:, 0], y=centers[:, 1], ax=axes[1], marker='s', s=200)
plt.show()
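
#%%
# Small addition (a sketch, not in the original script): report how much of the
# standardized-feature variance the two principal components capture, so the
# 2-D scatter above is read with appropriate caution.
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Total variance explained by 2 components:', pca.explained_variance_ratio_.sum())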

#TSNE diagnose + kmeans
#%%
tsne = TSNE(n_components=2, perplexity=10, n_iter=1000, random_state=33, learning_rate=100)
x_2D = tsne.fit_transform(x_stan)
tsneDf = pd.DataFrame(data=x_2D, columns=['dim 1', 'dim 2'])
final_tsneDf = pd.concat([tsneDf, pd.DataFrame(data=diagnoses)], axis=1)
f, axes = plt.subplots(1, 2, figsize=(15, 7))
scatter(x=final_tsneDf.loc[:, 'dim 1'], y=final_tsneDf.loc[:, 'dim 2'], ax=axes[0], hue=diagnoses[:, 0])
scatter(x=final_tsneDf.loc[:, 'dim 1'], y=final_tsneDf.loc[:, 'dim 2'], ax=axes[1], hue=y_kmeans)
# The k-means centers are not overlaid here: t-SNE has no transform for new
# points, so the centers cannot be placed meaningfully in this embedding
plt.show()

#PCA diagnose + DBSCAN
#%%
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=9, min_samples=3)
dbscan.fit(x_stan)

f, axes = plt.subplots(1, 2, figsize=(15, 7))
scatter(x=finalDf.loc[:, 'principal component 1'], y=finalDf.loc[:, 'principal component 2'], ax=axes[0], hue=diagnoses[:, 0])
scatter(x=finalDf.loc[:, 'principal component 1'], y=finalDf.loc[:, 'principal component 2'], ax=axes[1], hue=dbscan.labels_)

plt.title("PCA DBSCAN")
plt.show()
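
#%%
# Hedged sketch (not part of the original lab): eps=9 above is a manual choice.
# A common way to pick eps is the sorted k-distance curve: plot each point's
# distance to its k-th nearest neighbour (k = min_samples) and look for the
# "elbow". NearestNeighbors is scikit-learn's standard neighbour search.
from sklearn.neighbors import NearestNeighbors

neighbors = NearestNeighbors(n_neighbors=3).fit(x_stan)
kDistances, _ = neighbors.kneighbors(x_stan)   # column 0 is the point itself
plt.plot(np.sort(kDistances[:, -1]))
plt.xlabel('points sorted by k-distance')
plt.ylabel('distance to 3rd nearest neighbour')
plt.title('k-distance plot for choosing DBSCAN eps')
plt.show()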

#TSNE diagnose + DBSCAN
#%%
tsne = TSNE(n_components=2, perplexity=10, n_iter=1000, random_state=33, learning_rate=100)
x_2D = tsne.fit_transform(x_stan)
tsneDf = pd.DataFrame(data=x_2D, columns=['dim 1', 'dim 2'])
final_tsneDf = pd.concat([tsneDf, pd.DataFrame(data=diagnoses)], axis=1)
f, axes = plt.subplots(1, 2, figsize=(15, 7))
scatter(x=final_tsneDf.loc[:, 'dim 1'], y=final_tsneDf.loc[:, 'dim 2'], ax=axes[0], hue=diagnoses[:, 0])
scatter(x=final_tsneDf.loc[:, 'dim 1'], y=final_tsneDf.loc[:, 'dim 2'], ax=axes[1], hue=dbscan.labels_)
plt.show()

#Compare clustering methods
#%%
predicted = dbscan.labels_
print(metrics.adjusted_rand_score(expected[:, 0], predicted))        # adjusted Rand index
print(metrics.adjusted_mutual_info_score(expected[:, 0], predicted)) # adjusted mutual information
print(metrics.homogeneity_score(expected[:, 0], predicted))
print(metrics.completeness_score(expected[:, 0], predicted))
print(metrics.v_measure_score(expected[:, 0], predicted))
# Note: this silhouette is computed for the true diagnosis labels, not for a clustering
print(metrics.silhouette_score(x_stan, expected[:, 0]))
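
#%%
# Hedged addition (not in the original script): the intrinsic silhouette of
# each clustering makes the comparison more direct. DBSCAN may mark points as
# noise (-1) or find a single cluster, in which case silhouette is undefined.
print('silhouette, k-means:', metrics.silhouette_score(x_stan, y_kmeans))
if len(set(dbscan.labels_)) > 1:
    print('silhouette, DBSCAN:', metrics.silhouette_score(x_stan, dbscan.labels_))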