Unsupervised Learning (II): Dimensionality Reduction

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

1. The Penguins Dataset

# Load seaborn's "penguins" example dataset into a DataFrame.
penguins = sns.load_dataset(name="penguins")

# A bare name as the last expression of the cell renders the table preview.
penguins

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
3	Adelie	Torgersen	NaN	NaN	NaN	NaN	NaN
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female
...	...	...	...	...	...	...	...
339	Gentoo	Biscoe	NaN	NaN	NaN	NaN	NaN
340	Gentoo	Biscoe	46.8	14.3	215.0	4850.0	Female
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male

344 rows × 7 columns

# Scatter-plot matrix of the four numeric measurements, colored by species.
pairplot_columns = ["species", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
sns.pairplot(data=penguins[pairplot_columns], hue="species", height=2.0)

<seaborn.axisgrid.PairGrid at 0x2d01b3f4ac0>

../../_images/3f92e659409669f6d35340de3238bcebc44cb7666331cbfadb14d1913504452e.png

# Same scatter-plot matrix on the full table, this time colored by island.
sns.pairplot(data=penguins, hue="island", height=2.0)

<seaborn.axisgrid.PairGrid at 0x2d021aa7910>

../../_images/2d5169a386a877287cc526a94749f96ee4744bcf80fa30a7baaf6ea0ddea3bf6.png

# Same scatter-plot matrix on the full table, this time colored by sex.
sns.pairplot(data=penguins, hue="sex", height=2.0)

<seaborn.axisgrid.PairGrid at 0x2d01e16bb80>

../../_images/ed13666857dd9667e47fe707cd2b64fa6ae56abce57211e9a67b6623e1b2d55a.png

2. Data Processing

# Drop every row that contains at least one missing value.
penguins_dimR = penguins.dropna(axis=0, how="any")

# Tally duplicated (True) versus unique (False) rows in the cleaned table.
is_duplicate_row = penguins_dimR.duplicated()
is_duplicate_row.value_counts()

False    333
Name: count, dtype: int64

3. Data Preprocessing

from sklearn.preprocessing import StandardScaler

# Species labels of the cleaned rows, kept apart from the feature matrix.
species = penguins_dimR["species"]

# The four numeric body measurements used as input features.
numeric_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = penguins_dimR.loc[:, numeric_columns]

# Standardize every feature (zero mean, unit variance) so that columns on a
# large scale, such as body mass in grams, do not dominate the others.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

4. Training Model & Evaluating Model Performance

4.1 Principal Component Analysis (PCA)

from sklearn.decomposition import PCA

# Fit a PCA that keeps two components and project the standardized
# features onto them.
pca_2 = PCA(n_components=2)
X_pca_2 = pca_2.fit_transform(X_scaled)

# Fraction of the total variance captured by each retained component.
explained_var_2 = pca_2.explained_variance_ratio_

pc1_ratio, pc2_ratio = explained_var_2
report_lines = [
    f'The explained variance of PC1    is {pc1_ratio:.2%}',
    f'The explained variance of PC2    is {pc2_ratio:.2%}',
    f'The explained variance (PC1+PC2) is {explained_var_2.sum():.2%}',
]
print('\n'.join(report_lines))

The explained variance of PC1    is 68.63%
The explained variance of PC2    is 19.45%
The explained variance (PC1+PC2) is 88.09%

# Wrap the two PCA score columns in a DataFrame that shares the row index of
# the cleaned table, then attach them to it.
pca_2_scores = pd.DataFrame(X_pca_2,
                            columns=['PC_1', 'PC_2'],
                            index=penguins_dimR.index)
X_pca_2_species = penguins_dimR.join(pca_2_scores)
X_pca_2_species

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	PC_1	PC_2
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male	-1.853593	0.032069
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female	-1.316254	-0.443527
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female	-1.376605	-0.161230
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female	-1.885288	-0.012351
5	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male	-1.919981	0.817598
...	...	...	...	...	...	...	...	...	...
338	Gentoo	Biscoe	47.2	13.7	214.0	4925.0	Female	1.997716	-0.976771
340	Gentoo	Biscoe	46.8	14.3	215.0	4850.0	Female	1.832651	-0.784510
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male	2.751505	0.266556
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female	1.713854	-0.725875
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male	2.018537	0.336554

333 rows × 9 columns

fig, axes = plt.subplots(1, 3, figsize=(13,4))

# One panel per categorical column (left to right: species, island, sex).
# The same PCA projection is drawn each time; only the coloring changes.
for ax, hue_column in zip(axes, ('species', 'island', 'sex')):
    panel_label = hue_column.capitalize()
    sns.scatterplot(data=X_pca_2_species, x='PC_1', y='PC_2', hue=hue_column, s=32, ax=ax)
    ax.set_title(f"PCA Scatterplot by {panel_label}", fontsize=12)
    ax.set_xlabel("Principal Component 1", fontsize=12)
    ax.set_ylabel("Principal Component 2", fontsize=12)
    ax.legend(title=panel_label, fontsize=12, title_fontsize=12)
    # remove the ticks on both axes
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()
# plt.savefig("./8-pca2-penguins-distributions-species-sex-island.png")

../../_images/ca2869eba68316506706f63d0d2e043f4c4c7ddc9b8003320081cf3d2515dfce.png

How many PCs do we need?

# how many PCs do we need?

# Put the standardized array back into a DataFrame carrying the original
# column names and row index.
X_scaled_temp = pd.DataFrame(data=X_scaled, index=X.index, columns=X.columns)

# Fit a second PCA that keeps four components.
pca_4 = PCA(n_components=4)
X_pca_4 = pca_4.fit_transform(X_scaled_temp)
explained_var_4 = pca_4.explained_variance_ratio_

# One report line per component, followed by the total.
report_lines = [
    f'The explained variance of PC{rank} is {ratio:.2%}'
    for rank, ratio in enumerate(explained_var_4, start=1)
]
report_lines.append(f'The explained variance of ALL is {explained_var_4.sum():.2%}')
print('\n'.join(report_lines))

The explained variance of PC1 is 68.63%
The explained variance of PC2 is 19.45%
The explained variance of PC3 is 9.22%
The explained variance of PC4 is 2.70%
The explained variance of ALL is 100.00%

# Running total of the explained variance, expressed in percent.
accumulated_var_4 = np.cumsum(explained_var_4 * 100)

explained_var_data = {
    'Explained Variance': explained_var_4,
    'Explained Variability': explained_var_4*100.0,
    'Cumulative Variability': accumulated_var_4,
}

# Row labels "Component 1" ... "Component k", one per fitted component.
component_labels = [f'Component {rank}' for rank in range(1, pca_4.n_components_ + 1)]
explained_var_table = pd.DataFrame(explained_var_data, index=component_labels)

print(explained_var_table)

             Explained Variance  Explained Variability  Cumulative Variability
Component 1            0.686339              68.633893               68.633893
Component 2            0.194529              19.452929               88.086822
Component 3            0.092161               9.216063               97.302885
Component 4            0.026971               2.697115              100.000000

# Label the score columns PC_1 ... PC_k, one per column of the PCA output.
n_score_columns = X_pca_4.shape[1]
pca_columns = ['PC_{}'.format(rank) for rank in range(1, n_score_columns + 1)]
X_pca_temp = pd.DataFrame(X_pca_4, index=X.index, columns=pca_columns)

# Place the raw measurements and their component scores side by side.
penguins_z_cp = pd.concat([X, X_pca_temp], axis="columns")
penguins_z_cp

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	PC_1	PC_2	PC_3	PC_4
0	39.1	18.7	181.0	3750.0	-1.853593	0.032069	-0.234902	-0.528397
1	39.5	17.4	186.0	3800.0	-1.316254	-0.443527	-0.027470	-0.401727
2	40.3	18.0	195.0	3250.0	-1.376605	-0.161230	0.189689	0.528662
4	36.7	19.3	193.0	3450.0	-1.885288	-0.012351	-0.628873	0.472893
5	39.3	20.6	190.0	3650.0	-1.919981	0.817598	-0.701051	0.196416
...	...	...	...	...	...	...	...	...
338	47.2	13.7	214.0	4925.0	1.997716	-0.976771	0.379425	-0.160892
340	46.8	14.3	215.0	4850.0	1.832651	-0.784510	0.240758	0.008955
341	50.4	15.7	222.0	5750.0	2.751505	0.266556	-0.419306	-0.236256
342	45.2	14.8	212.0	5200.0	1.713854	-0.725875	-0.262764	-0.330004
343	49.9	16.1	213.0	5400.0	2.018537	0.336554	-0.155331	-0.438802

333 rows × 8 columns

cp_variables = penguins_z_cp.columns

# Number of input features the PCA was fitted on. penguins_z_cp lists the
# original variables first and the component scores after them, so this is
# also the position where the component columns begin.
n_variables = pca_4.n_features_in_

# Covariance between variables (rows) and components (columns): keep only
# the variable-by-component corner of the full covariance matrix.
cov_var_comp = penguins_z_cp.cov().iloc[:n_variables, n_variables:]

# Correlation between variables (rows) and components (columns), taken from
# the same corner of the full correlation matrix.
corr_var_comp = penguins_z_cp.corr().iloc[:n_variables, n_variables:]

corr_var_comp

	PC_1	PC_2	PC_3	PC_4
bill_length_mm	0.751829	0.529438	0.390097	-0.047682
bill_depth_mm	-0.661186	0.702309	-0.258529	0.052522
flipper_length_mm	0.955748	0.005106	-0.143347	0.256849
body_mass_g	0.910762	0.067449	-0.359279	-0.192045

# Squared variable/component correlations.
cos2 = corr_var_comp.pow(2)

# Rescale every row so that the contributions of one feature sum to 1.
row_totals = cos2.sum(axis="columns")
cos2_normalized = cos2.div(row_totals, axis="index")

# Annotated heatmap of the row-normalized values.
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(
    cos2_normalized,
    annot=True,
    fmt=".3f",
    cmap='OrRd',
    cbar_kws={'label': 'Relative Contribution per Feature'},
)
plt.title('Relative Contributions of Features to Components')
plt.tight_layout()
plt.show()
# plt.savefig("./8-pca4-correlation-variable-component.png")

../../_images/5c2b98bf9fc9bab57f49f6fba0b24e9ded02d4f2b43b28524d309bf5cbe95969.png

Correlation circle

# create loadings DataFrame: every eigenvector is scaled by the standard
# deviation (square root of the explained variance) of its component
# NOTE(review): scikit-learn's explained_variance_ uses an (n - 1) denominator,
# so these values match the variable/component correlations only up to a
# factor of sqrt(n / (n - 1)) if the inputs were standardized with the
# population standard deviation - confirm if exact correlations are needed.
loadings = pca_4.components_.T * np.sqrt(pca_4.explained_variance_)
loadings_df = pd.DataFrame(loadings,
                           # one 'PCn' label per component column actually present,
                           # instead of a hard-coded count that must match the model
                           columns=[f'PC{i+1}' for i in range(loadings.shape[1])],
                           index=X.columns)

# create a figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle('Correlation Circle Plots for Principal Components for Penguins Dataset', fontsize=14)

# function to create a correlation circle on a given axis
def plot_correlation_circle(loadings, pc_x, pc_y, ax, title, explained_variance_ratio=None):
    """Draw a correlation circle for two components on ``ax``.

    One arrow per variable is drawn from the origin to the point given by
    the variable's loadings on the two selected components, inside a
    dashed unit circle.

    Parameters
    ----------
    loadings : pandas.DataFrame
        One row per variable and one column per component; the values are
        used directly as arrow-tip coordinates.
    pc_x, pc_y : str
        Column labels of ``loadings`` for the horizontal and vertical axis.
        They must look like ``'PC<n>'`` because the component number is
        parsed from the text after the first two characters.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    title : str
        Title of the subplot.
    explained_variance_ratio : sequence of float, optional
        Explained-variance ratio per component, indexed from 0, used in the
        axis labels. Defaults to ``pca_4.explained_variance_ratio_`` so
        existing five-argument calls keep working.
    """
    if explained_variance_ratio is None:
        # fall back to the notebook's four-component model
        explained_variance_ratio = pca_4.explained_variance_ratio_

    # set the aspect ratio to equal for a perfect circle
    ax.set_aspect('equal', adjustable='box')

    # create a unit circle
    circle = plt.Circle((0, 0), 1, color='lightblue', fill=False, linestyle='--', alpha=0.8)
    ax.add_patch(circle)

    # plot arrows and labels for each variable
    for feature in loadings.index:
        x = loadings.loc[feature, pc_x]
        y = loadings.loc[feature, pc_y]

        # plot arrow
        ax.arrow(0, 0, x, y, head_width=0.05, head_length=0.05, fc='k', ec='k', alpha=0.6)

        # plot label (offset slightly from the arrow tip)
        ax.text(x * 1.15, y * 1.15, feature, ha='center', va='center',
                fontweight='bold', fontsize=10,
                bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.3'))

    # reference lines through the origin
    ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
    ax.axvline(x=0, color='k', linestyle='-', alpha=0.3)

    ax.set_xlim(-1.1, 1.1)
    ax.set_ylim(-1.1, 1.1)

    # zero-based positions of the two components in the variance-ratio array
    pc_x_idx = int(pc_x[2:]) - 1  # Extract number from 'PC1' -> 0
    pc_y_idx = int(pc_y[2:]) - 1  # Extract number from 'PC2' -> 1

    ax.set_xlabel(f'{pc_x} ({explained_variance_ratio[pc_x_idx]:.1%} variance)',
                  fontsize=12)
    ax.set_ylabel(f'{pc_y} ({explained_variance_ratio[pc_y_idx]:.1%} variance)',
                  fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.7)

# Left panel: PC1 vs PC2; right panel: PC3 vs PC4.
circle_panels = (
    (ax1, 'PC1', 'PC2', 'Principal Components 1-2 ("Size" vs "Bill Shape")'),
    (ax2, 'PC3', 'PC4', 'Principal Components 3-4 (Residual Patterns)'),
)
for panel_ax, horizontal_pc, vertical_pc, panel_title in circle_panels:
    plot_correlation_circle(loadings_df, horizontal_pc, vertical_pc, panel_ax, panel_title)

plt.tight_layout()
# applied after tight_layout: move the top edge of the subplots down to 0.9
plt.subplots_adjust(top=0.9)
plt.show()
# plt.savefig("./8-pca4-correlation-circle.png")

../../_images/195e20a527d536d65ca3df943a9b902279c1f812fea8ddb6febf2eb7e3fcabc6.png

4.2 t-Distributed Stochastic Neighbor Embedding (t-SNE)

from sklearn.manifold import TSNE

# Build a t-SNE model that embeds the standardized features in two dimensions.
# No random_state is passed to the constructor.
tsne = TSNE(n_components=2, perplexity=50)
X_tsne = tsne.fit_transform(X_scaled)

# Attach the two embedding coordinates (stored under the column names
# 'PC_1' and 'PC_2') to the cleaned table by row index.
tsne_embedding = pd.DataFrame(X_tsne,
                              columns=['PC_1', 'PC_2'],
                              index=penguins_dimR.index)
X_tsne_species = penguins_dimR.join(tsne_embedding)
X_tsne_species

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	PC_1	PC_2
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male	-14.726804	1.726723
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female	-11.068352	0.976119
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female	-10.417482	1.170999
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female	-13.651107	-0.007318
5	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male	-15.621012	-2.306440
...	...	...	...	...	...	...	...	...	...
338	Gentoo	Biscoe	47.2	13.7	214.0	4925.0	Female	21.190952	4.659019
340	Gentoo	Biscoe	46.8	14.3	215.0	4850.0	Female	21.486044	4.326966
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male	26.877960	2.305213
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female	21.936352	2.852136
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male	25.421112	1.833549

333 rows × 9 columns

fig, axes = plt.subplots(1, 3, figsize=(13,4))

# One panel per categorical column (left to right: species, island, sex).
# The same t-SNE embedding is drawn each time; only the coloring changes.
# The DataFrame columns are named 'PC_1'/'PC_2', but they hold t-SNE
# embedding coordinates rather than principal components, so the axes are
# labelled as t-SNE dimensions.
for ax, hue_column in zip(axes, ('species', 'island', 'sex')):
    panel_label = hue_column.capitalize()
    sns.scatterplot(data=X_tsne_species, x='PC_1', y='PC_2', hue=hue_column, s=32, ax=ax)
    ax.set_title(f"t-SNE Scatterplot by {panel_label}", fontsize=12)
    ax.set_xlabel("t-SNE Dimension 1", fontsize=12)
    ax.set_ylabel("t-SNE Dimension 2", fontsize=12)
    ax.legend(title=panel_label, fontsize=12, title_fontsize=12)
    # remove the ticks on both axes
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()
# plt.savefig("./8-tsne-penguins-distributions-species-sex-island.png")

../../_images/931a7f6a8ce442582aa9a15475bdca696e4f568513cdc79976d8c456f221adb1.png

# use K-Means to cluster the t-SNE embedding

from sklearn.cluster import KMeans

# Fit nine clusters on the 2-D embedding and label every point.
# No random_state is passed to the constructor.
kmeans = KMeans(n_clusters=9)
kmeans.fit(X_tsne)
clusters = kmeans.predict(X_tsne)

tx = X_tsne[:, 0] # t-SNE dimension 1
ty = X_tsne[:, 1] # t-SNE dimension 2

plt.figure(1, figsize=(8, 6))
plt.scatter(tx, ty, s=100, linewidth=0, c=clusters)

# Mark all cluster centers with red crosses in a single call.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=400, c='r', marker='X', linewidths=1)

plt.title("Scatterplot of Penguins Species clusterized using K-Means model on the t-SNE dataset ", fontsize=12)
# The axes are t-SNE embedding coordinates, not principal components.
plt.xlabel("t-SNE Dimension 1", fontsize = 12)
plt.ylabel("t-SNE Dimension 2", fontsize = 12)

plt.xticks([])
plt.yticks([])

plt.tight_layout()
plt.show()
# plt.savefig("./8-tsne-kmeans-clustering.png")

../../_images/3582971cab1daf84d509c438c03809337c7a45f574d6a96293f5e6f90017e675.png

4.3 Uniform Manifold Approximation and Projection (UMAP)

import umap

# Build a UMAP model that embeds the standardized features in two dimensions,
# using 10 neighbors. No random_state is passed to the constructor.
umap_model = umap.UMAP(n_components=2, n_neighbors=10)
X_umap = umap_model.fit_transform(X_scaled)

# Attach the two embedding coordinates (stored under the column names
# 'PC_1' and 'PC_2') to the cleaned table by row index.
umap_embedding = pd.DataFrame(X_umap,
                              columns=['PC_1', 'PC_2'],
                              index=penguins_dimR.index)
X_umap_species = penguins_dimR.join(umap_embedding)
X_umap_species

D:\00_program\241011-anaconda\envs\practical_machine_learning\lib\site-packages\sklearn\utils\deprecation.py:132: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.
  warnings.warn(

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	PC_1	PC_2
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male	8.458319	5.285900
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female	7.822329	6.300371
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female	7.391644	6.798053
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female	7.587664	4.122643
5	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male	8.228061	2.592930
...	...	...	...	...	...	...	...	...	...
338	Gentoo	Biscoe	47.2	13.7	214.0	4925.0	Female	-7.256217	0.875539
340	Gentoo	Biscoe	46.8	14.3	215.0	4850.0	Female	-6.874177	1.165327
341	Gentoo	Biscoe	50.4	15.7	222.0	5750.0	Male	-5.922271	5.097818
342	Gentoo	Biscoe	45.2	14.8	212.0	5200.0	Female	-6.268187	1.683391
343	Gentoo	Biscoe	49.9	16.1	213.0	5400.0	Male	-5.616757	4.167082

333 rows × 9 columns

fig, axes = plt.subplots(1, 3, figsize=(13, 4))

# One panel per categorical column (left to right: species, island, sex).
# The same UMAP embedding is drawn each time; only the coloring changes.
# The DataFrame columns are named 'PC_1'/'PC_2', but they hold UMAP
# embedding coordinates rather than principal components, so the axes are
# labelled as UMAP dimensions.
for ax, hue_column in zip(axes, ('species', 'island', 'sex')):
    panel_label = hue_column.capitalize()
    sns.scatterplot(data=X_umap_species, x='PC_1', y='PC_2', hue=hue_column, s=32, ax=ax)
    ax.set_title(f"UMAP Scatterplot by {panel_label}", fontsize=12)
    ax.set_xlabel("UMAP Dimension 1", fontsize=12)
    ax.set_ylabel("UMAP Dimension 2", fontsize=12)
    ax.legend(title=panel_label, fontsize=12, title_fontsize=12)
    # remove the ticks on both axes
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()
# plt.savefig("./8-umap-penguins-distributions-species-sex-island.png")

../../_images/cd0e089483f76db13d154320ac097d7f1f4572831fe8067896bbbedc58a21168.png

# use K-Means to cluster the UMAP embedding

from sklearn.cluster import KMeans

# Fit K-Means with 3, 5 and 7 clusters on the 2-D embedding and label every
# point. No random_state is passed to the constructors.
kmeans3 = KMeans(n_clusters=3)
kmeans3.fit(X_umap)
kmeans3_clusters = kmeans3.predict(X_umap)

kmeans5 = KMeans(n_clusters=5)
kmeans5.fit(X_umap)
kmeans5_clusters = kmeans5.predict(X_umap)

kmeans7 = KMeans(n_clusters=7)
kmeans7.fit(X_umap)
kmeans7_clusters = kmeans7.predict(X_umap)

tx = X_umap[:, 0] # UMAP dimension 1
ty = X_umap[:, 1] # UMAP dimension 2

fig, axes = plt.subplots(1, 3, figsize=(13, 4))
# fig.suptitle('Scatterplot of Penguins Species clusterized using K-Means model on the UMAP dataset', fontsize=12)

# One panel per fitted model (left to right: k = 3, 5, 7).
kmeans_panels = (
    (kmeans3, kmeans3_clusters),
    (kmeans5, kmeans5_clusters),
    (kmeans7, kmeans7_clusters),
)
for ax, (model, labels) in zip(axes, kmeans_panels):
    ax.scatter(tx, ty, s=32, linewidth=0, c=labels, cmap='viridis')
    # mark all cluster centers of this model with red crosses
    centers = model.cluster_centers_
    ax.scatter(centers[:, 0], centers[:, 1], s=100, c='r', marker='X', linewidths=1)
    ax.set_title(f" k = {model.n_clusters} ", fontsize=14)
    # The axes are UMAP embedding coordinates, not principal components.
    ax.set_xlabel("UMAP Dimension 1", fontsize=12)
    ax.set_ylabel("UMAP Dimension 2", fontsize=12)
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()
# plt.savefig("./8-umap-kmeans-clustering.png")

../../_images/10910005b3e1c964bb237a47bb05e7722d6cab30f9bb7456d592fa8c0129a866.png