Unsupervised Learning (I): Clustering

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

1. The Penguins Dataset

penguins = sns.load_dataset('penguins')
penguins
sns.pairplot(penguins[["species", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]], hue="species", height=2.0)

2. Data Cleaning

# remove missing values
penguins_clustering = penguins.dropna()
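
As a quick check on how much was removed, the missing values per column and the resulting shape can be printed (a minimal sketch):

# how much was dropped: missing values per column and resulting shape
print(penguins.isna().sum())
print(penguins.shape, '->', penguins_clustering.shape)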

# check for duplicate rows in the dataset
penguins_clustering.duplicated().value_counts()
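
If the check above reported duplicates, they could be dropped as follows (an optional sketch; this dataset typically contains none, so it is a no-op):

# drop exact duplicate rows, if any
penguins_clustering = penguins_clustering.drop_duplicates()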

3. Feature Selection & Scaling

# select numerical features for clustering

X = penguins_clustering[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
# standardize features (important for distance-based methods like KMeans)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
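
As a quick sanity check (a minimal sketch), the standardized features should now have roughly zero mean and unit variance:

# verify the scaling: each column should have mean ≈ 0 and std ≈ 1
print(X_scaled.mean(axis=0).round(3))
print(X_scaled.std(axis=0).round(3))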

4. Training & Evaluating Clustering Models

4.1 K-Means Clustering

from sklearn.cluster import KMeans

k = 3  # we know there are 3 species

kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)
kmeans_labels
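
The fitted model can also assign clusters to unseen birds, provided the same scaler is applied first. A minimal sketch with made-up measurements:

# assign a cluster to a hypothetical new penguin (values are illustrative)
new_penguin = pd.DataFrame(
    [[45.0, 17.0, 200.0, 4200.0]],
    columns=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
)
print(kmeans.predict(scaler.transform(new_penguin)))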
# add cluster labels to dataframe

penguins_cluster = penguins.dropna().copy()
penguins_cluster['kmeans_cluster'] = kmeans_labels
penguins_cluster
# evaluate clustering

from sklearn.metrics import silhouette_score, adjusted_rand_score

sil_score_kmeans = silhouette_score(X_scaled, kmeans_labels)
ari_score_kmeans = adjusted_rand_score(penguins_cluster['species'], kmeans_labels)
print(f"Silhouette Score: {sil_score_kmeans:.3f}")
print(f"Adjusted Rand Index (vs true species): {ari_score_kmeans:.3f}")
# visualization: clusters vs. true labels
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# plot KMeans clusters
axes[0].scatter(
    penguins_cluster['flipper_length_mm'],
    penguins_cluster['body_mass_g'],
    c=penguins_cluster['kmeans_cluster'],
    cmap='viridis',
    s=60
)
axes[0].scatter(
    kmeans.cluster_centers_[:, 2] * scaler.scale_[2] + scaler.mean_[2],  # inverse scaling
    kmeans.cluster_centers_[:, 3] * scaler.scale_[3] + scaler.mean_[3],
    c='red', marker='X', s=200, label='Centroids'
)
axes[0].set_title("K-Means Clustering")
axes[0].set_xlabel("Flipper Length (mm)")
axes[0].set_ylabel("Body Mass (g)")
axes[0].legend()

# Plot true species for comparison
species_map = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}
colors = [species_map[s] for s in penguins_cluster['species']]
scatter = axes[1].scatter(
    penguins_cluster['flipper_length_mm'],
    penguins_cluster['body_mass_g'],
    c=colors,
    cmap='viridis',
    s=60
)
axes[1].set_title("True Species Labels")
axes[1].set_xlabel("Flipper Length (mm)")
axes[1].set_ylabel("Body Mass (g)")

# Legend for species
handles, labels = scatter.legend_elements()
axes[1].legend(handles, species_map.keys(), title="Species")

plt.tight_layout()
plt.show()
print("\nSpecies distribution in clusters:")
cross_tab = pd.crosstab(penguins_cluster['kmeans_cluster'], penguins_cluster['species'])
print(cross_tab)
# plot species distribution in clusters
cross_tab.plot(kind='bar', stacked=True, figsize=(9, 6))
plt.title('Species Distribution Across Clusters')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.legend(title='Species')
plt.tight_layout()
plt.show()
# plt.savefig("7-penguins-in-clusters.png")
penguins_cluster

Determining the optimal number of clusters

# determining optimal number of clusters

max_clusters = 15
wcss = []  # within-cluster sum of squares
silhouette_scores = []

for i in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)
    
    # calculate silhouette score
    silhouette_avg = silhouette_score(X_scaled, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)
    
    print(f"Clusters: {i}, WCSS: {kmeans.inertia_:.2f}, Silhouette: {silhouette_avg:.3f}")

# plot elbow method
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(range(2, max_clusters + 1), wcss, marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')

plt.subplot(1, 2, 2)
plt.plot(range(2, max_clusters + 1), silhouette_scores, marker='o', color='red')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score Method')

plt.tight_layout()
plt.show()
# plt.savefig("./7-kmeans-optimal-parameter.png")

optimal_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
print(f"Suggested optimal number of clusters: {optimal_clusters}")
# comparison of k = 2, 3, and 4

penguins_cluster_k234 = penguins_cluster.copy()

# apply K-Means with k=2
kmeans2 = KMeans(n_clusters=2, random_state=42, n_init=10)
labels2 = kmeans2.fit_predict(X_scaled)
penguins_cluster_k234['cluster_k2'] = labels2

# apply K-Means with k=3
kmeans3 = KMeans(n_clusters=3, random_state=42, n_init=10)
labels3 = kmeans3.fit_predict(X_scaled)
penguins_cluster_k234['cluster_k3'] = labels3

# apply K-Means with k=4
kmeans4 = KMeans(n_clusters=4, random_state=42, n_init=10)
labels4 = kmeans4.fit_predict(X_scaled)
penguins_cluster_k234['cluster_k4'] = labels4
# plot the distributions of data points in clusters
fig, axes = plt.subplots(1, 3, figsize=(16,5))

# k=2 clusters
axes[0].scatter(penguins_cluster_k234['flipper_length_mm'], penguins_cluster_k234['body_mass_g'], 
                c=penguins_cluster_k234['cluster_k2'], cmap='viridis', s=80)
axes[0].set_title('K-Means Clusters (k=2)')
axes[0].set_xlabel('Flipper Length (mm)')
axes[0].set_ylabel('Body Mass (g)')

# k=3 clusters
axes[1].scatter(penguins_cluster_k234['flipper_length_mm'], penguins_cluster_k234['body_mass_g'], 
                c=penguins_cluster_k234['cluster_k3'], cmap='viridis', s=80)
axes[1].set_title('K-Means Clusters (k=3)')
axes[1].set_xlabel('Flipper Length (mm)')
axes[1].set_ylabel('Body Mass (g)')

# k=4 clusters
axes[2].scatter(penguins_cluster_k234['flipper_length_mm'], penguins_cluster_k234['body_mass_g'], 
                c=penguins_cluster_k234['cluster_k4'], cmap='viridis', s=80)
axes[2].set_title('K-Means Clusters (k=4)')
axes[2].set_xlabel('Flipper Length (mm)')
axes[2].set_ylabel('Body Mass (g)')

plt.tight_layout()
plt.show()
# plt.savefig("./7-kmeans-234-clusters.png")
# print cluster counts

print("Cluster counts for k=2:\n", penguins_cluster_k234['cluster_k2'].value_counts())
print("Cluster counts for k=3:\n", penguins_cluster_k234['cluster_k3'].value_counts())
print("Cluster counts for k=4:\n", penguins_cluster_k234['cluster_k4'].value_counts())

4.2 Hierarchical Clustering

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# compute linkage matrix
linked = linkage(X_scaled, method='ward')

# assign 3 clusters based on dendrogram cut
labels_scipy = fcluster(linked, t=3, criterion='maxclust')

# evaluate clustering
sil_score_hier_scipy = silhouette_score(X_scaled, labels_scipy)
ari_score_hier_scipy = adjusted_rand_score(penguins_cluster['species'], labels_scipy)
print(f"Silhouette Score: {sil_score_hier_scipy:.3f}")
print(f"Adjusted Rand Index (vs true species): {ari_score_hier_scipy:.3f}")

# penguins_cluster = penguins.dropna().copy()
penguins_cluster['hier_cluster_scipy'] = labels_scipy
penguins_cluster
# plot dendrogram to visualize clustering structure

plt.figure(figsize=(16, 6))
dendrogram(
    linked,
    labels=penguins_cluster['species'].values,  # for leaf labels
    leaf_rotation=90,
    leaf_font_size=8
)
plt.title("Hierarchical Clustering Dendrogram (Ward linkage)", fontsize=13)
plt.xlabel("Penguin (species labels)", fontsize=13)
plt.ylabel("Distance", fontsize=13)
plt.tight_layout()
plt.show()
# plt.savefig("./7-hierarcical-dendrogram.png")
# agglomerative clustering from scikit-learn (choose number of clusters, e.g., 3)

from sklearn.cluster import AgglomerativeClustering

hc = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
labels_aggl = hc.fit_predict(X_scaled)

# evaluate clustering
sil_score_hier_aggl = silhouette_score(X_scaled, labels_aggl)
ari_score_hier_aggl = adjusted_rand_score(penguins_cluster['species'], labels_aggl)
print(f"Silhouette Score: {sil_score_hier_aggl:.3f}")
print(f"Adjusted Rand Index (vs true species): {ari_score_hier_aggl:.3f}")

# add cluster labels to dataframe
penguins_cluster['hier_cluster_aggl'] = labels_aggl
penguins_cluster
# check distribution of species in clusters

print("\nSpecies distribution in hierarchical clusters:\n")

print(pd.crosstab(penguins_cluster['species'], penguins_cluster['hier_cluster_scipy']), '\n')
print(pd.crosstab(penguins_cluster['species'], penguins_cluster['hier_cluster_aggl']))
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# clusters determined using SciPy
scatter1 = axes[0].scatter(
    penguins_cluster['flipper_length_mm'],
    penguins_cluster['body_mass_g'],
    c=labels_scipy,
    cmap='viridis',
    s=80
)
axes[0].set_xlabel("Flipper Length (mm)")
axes[0].set_ylabel("Body Mass (g)")
axes[0].set_title("Hierarchical Clustering (SciPy)")
cbar1 = plt.colorbar(scatter1, ax=axes[0])
cbar1.set_label("Cluster ID")

# clusters determined using Scikit-Learn
scatter2 = axes[1].scatter(
    penguins_cluster['flipper_length_mm'],
    penguins_cluster['body_mass_g'],
    c=labels_aggl,
    cmap='viridis',
    s=80
)
axes[1].set_xlabel("Flipper Length (mm)")
axes[1].set_ylabel("Body Mass (g)")
axes[1].set_title("Hierarchical Clustering (Scikit-Learn)")
cbar2 = plt.colorbar(scatter2, ax=axes[1])
cbar2.set_label("Cluster ID")

plt.tight_layout()
plt.show()
# plt.savefig("./7-hierarchical-clusters-from-scipy-scikit-learn.png")
# calculation of confusion matrix

# clusters determined using Scikit-Learn
# map clusters to species (majority vote)
cluster_mapping = penguins_cluster.groupby('hier_cluster_aggl')['species'].agg(lambda x: x.value_counts().idxmax()).to_dict()
predicted_species = penguins_cluster['hier_cluster_aggl'].map(cluster_mapping)
# compute confusion matrix as a DataFrame
cm_aggl = pd.crosstab(penguins_cluster['species'], predicted_species)

# clusters determined using SciPy
cluster_mapping = penguins_cluster.groupby('hier_cluster_scipy')['species'].agg(lambda x: x.value_counts().idxmax()).to_dict()
predicted_species = penguins_cluster['hier_cluster_scipy'].map(cluster_mapping)
cm_scipy = pd.crosstab(penguins_cluster['species'], predicted_species)
# visualisation

fig, axes = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns

# SciPy clusters
sns.heatmap(cm_scipy, annot=True, fmt="d", cmap="OrRd", ax=axes[0])
axes[0].set_title("Confusion Matrix: SciPy Clusters")
axes[0].set_xlabel("Predicted Species (clusters)")
axes[0].set_ylabel("True Species")

# Scikit-Learn clusters
sns.heatmap(cm_aggl, annot=True, fmt="d", cmap="OrRd", ax=axes[1])
axes[1].set_title("Confusion Matrix: Scikit-Learn Clusters")
axes[1].set_xlabel("Predicted Species (clusters)")
axes[1].set_ylabel("True Species")

plt.tight_layout()
plt.show()
# plt.savefig("./7-hierarchical-confusion-matrix.png")

4.3 DBSCAN (Density-Based Spatial Clustering of Applications with Noise)

from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.55, min_samples=5)
labels = dbscan.fit_predict(X_scaled)

# evaluate clustering (only if at least 2 clusters found)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f"DBSCAN found {n_clusters} clusters (and {sum(labels==-1)} noise points).")

if n_clusters > 1:
    sil_score_dbscan = silhouette_score(X_scaled, labels)
    ari_score_dbscan = adjusted_rand_score(penguins_cluster['species'], labels)
    print(f"Silhouette Score: {sil_score_dbscan:.3f}")
    print(f"Adjusted Rand Index (vs true species): {ari_score_dbscan:.3f}")
# add cluster labels to dataframe

penguins_cluster['dbscan_cluster'] = labels
penguins_cluster
# visualization

plt.figure(figsize=(9,6))
scatter = plt.scatter(
    penguins_cluster['flipper_length_mm'], 
    penguins_cluster['body_mass_g'],
    c=labels, cmap='viridis', s=80
)
plt.xlabel("Flipper Length (mm)")
plt.ylabel("Body Mass (g)")
plt.title("DBSCAN Clustering of Penguins (flipper length vs body mass)")
plt.colorbar(scatter, label="Cluster ID (-1 = noise)")
plt.tight_layout()
plt.show()
# plt.savefig("./7-dbscan-point-types.png")
# cross-tabulation: species distribution in clusters

cross_tab = pd.crosstab(penguins_cluster['dbscan_cluster'], penguins_cluster['species'])

cross_tab.plot(kind='bar', stacked=True, figsize=(9,6))

plt.title("Distribution of Penguin Species in Each DBSCAN Cluster")
plt.xlabel("Cluster ID")
plt.ylabel("Number of Penguins")
plt.legend(title="Species")
plt.tight_layout()
plt.show()
# plt.savefig("./7-dbscan-penguins-in-clusters.png")

4.4 Gaussian Mixture Models

from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, silhouette_score

# build GMM model with 3 components (clusters)
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=42)
gmm.fit(X_scaled)
labels_gmm = gmm.predict(X_scaled)

# evaluate clustering performance
sil_score_gmm = silhouette_score(X_scaled, labels_gmm)
ari_score_gmm = adjusted_rand_score(penguins_cluster['species'], labels_gmm)
print(f"Silhouette Score: {sil_score_gmm:.3f}")
print(f"Adjusted Rand Index: {ari_score_gmm:.3f}")


# add cluster labels to dataframe
penguins_cluster['gmm_cluster'] = labels_gmm
penguins_cluster
# check species distribution in each cluster

print("\nSpecies distribution per GMM cluster:\n")
print(pd.crosstab(penguins_cluster['species'], penguins_cluster['gmm_cluster']))
# visualization (flipper length vs body mass)

plt.figure(figsize=(8,6))
scatter = plt.scatter(
    penguins_cluster['flipper_length_mm'],
    penguins_cluster['body_mass_g'],
    c=labels_gmm, cmap='viridis', s=80
)
plt.xlabel("Flipper Length (mm)")
plt.ylabel("Body Mass (g)")
plt.title("Gaussian Mixture Model Clustering of Penguins")
plt.colorbar(scatter, label="Cluster ID")
plt.show()
# map clusters to species (majority vote)
cluster_mapping = penguins_cluster.groupby('gmm_cluster')['species'].agg(lambda x: x.value_counts().idxmax()).to_dict()
predicted_species = penguins_cluster['gmm_cluster'].map(cluster_mapping)

# compute confusion matrix as a DataFrame
cm_gmm = pd.crosstab(penguins_cluster['species'], predicted_species)

# plot heatmap
plt.figure(figsize=(7,5))
sns.heatmap(cm_gmm, annot=True, fmt="d", cmap="OrRd")
plt.title("Confusion Matrix: Species vs Gaussian Mixture Model")
plt.xlabel("Predicted Species (from clusters)")
plt.ylabel("True Species")
plt.show()

Shapes of clusters (K-Means vs. GMM)

penguins_test = sns.load_dataset("penguins").dropna()
features_test = ['flipper_length_mm', 'body_mass_g']
X_test = penguins_test[features_test].values

scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

# K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_test_scaled)
kmeans_centers = kmeans.cluster_centers_
kmeans_centers
# GMM clustering

gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=42)
gmm.fit(X_test_scaled)
gmm_labels = gmm.predict(X_test_scaled)
gmm_means = gmm.means_
gmm_covs = gmm.covariances_
# function to draw GMM ellipses

def plot_gmm_ellipses(gmm, ax):
    # draw one ellipse per component, sized to roughly 1.5 standard deviations
    for i in range(gmm.n_components):
        mean = gmm.means_[i]
        cov = gmm.covariances_[i]
        v, w = np.linalg.eigh(cov)  # eigenvalues ascending, eigenvectors as columns
        # orientation of the major axis = eigenvector of the largest eigenvalue
        angle = np.degrees(np.arctan2(w[1, -1], w[0, -1]))
        width, height = 2 * np.sqrt(v[::-1]) * 1.5  # major axis first
        ell = Ellipse(xy=mean, width=width, height=height, angle=angle,
                      edgecolor='red', fc='None', lw=2)
        ax.add_patch(ell)
# plot shapes of clusters

from matplotlib.patches import Circle, Ellipse

fig, axes = plt.subplots(1, 2, figsize=(12,6))

# left: K-Means with circles
axes[0].scatter(X_test_scaled[:,0], X_test_scaled[:, 1], c=kmeans_labels, cmap='viridis', s=80)
for center in kmeans_centers:
    circle = Circle(center, radius=0.8, edgecolor='red', fc='None', lw=2)  # circular clusters
    axes[0].add_patch(circle)
axes[0].set_title("K-Means (Circular clusters)")
axes[0].set_xlabel("Flipper length (scaled)")
axes[0].set_ylabel("Body mass (scaled)")

# right: GMM with elliptical/larger shapes
axes[1].scatter(X_test_scaled[:,0], X_test_scaled[:,1], c=gmm_labels, cmap='viridis', s=80)
plot_gmm_ellipses(gmm, axes[1])
axes[1].set_title("GMM (Elliptical clusters)")
axes[1].set_xlabel("Flipper length (scaled)")
axes[1].set_ylabel("Body mass (scaled)")

plt.tight_layout()
plt.show()
# plt.savefig("./7-gmm-elliptical-clusters.png")

4.5 Spectral Clustering

from sklearn.cluster import SpectralClustering

# build model using Spectral Clustering (graph-based)
spectral = SpectralClustering(
    n_clusters=3,
    affinity='rbf',   # Gaussian kernel
    gamma=1.0,        # controls width of the Gaussian
    assign_labels='kmeans',
    random_state=42
)
labels = spectral.fit_predict(X_scaled)
penguins_cluster['spectral_cluster'] = labels

# evaluate clustering
sil_score_spectral = silhouette_score(X_scaled, labels)
ari_score_spectral = adjusted_rand_score(penguins_cluster['species'], labels)
print(f"Silhouette Score: {sil_score_spectral:.3f}")
print(f"Adjusted Rand Index (vs species): {ari_score_spectral:.3f}")
# visualization (flipper length vs body mass)

plt.figure(figsize=(8,6))
scatter = plt.scatter(
    penguins_cluster['flipper_length_mm'],
    penguins_cluster['body_mass_g'],
    c=labels, cmap='viridis', s=80
)
plt.xlabel("Flipper Length (mm)")
plt.ylabel("Body Mass (g)")
plt.title("Spectral Clustering (Graph-Based) of Penguins")
plt.colorbar(scatter, label="Cluster ID")
plt.show()
# species distribution in each cluster
print("\nSpecies distribution per Spectral cluster:")
print(pd.crosstab(penguins_cluster['species'], penguins_cluster['spectral_cluster']))
# confusion matrix

cluster_mapping = penguins_cluster.groupby('kmeans_cluster')['species'].agg(lambda x: x.value_counts().idxmax()).to_dict()
predicted_species = penguins_cluster['kmeans_cluster'].map(cluster_mapping)
cm_kmeans = pd.crosstab(penguins_cluster['species'], predicted_species)

cluster_mapping = penguins_cluster.groupby('spectral_cluster')['species'].agg(lambda x: x.value_counts().idxmax()).to_dict()
predicted_species = penguins_cluster['spectral_cluster'].map(cluster_mapping)
cm_spectral = pd.crosstab(penguins_cluster['species'], predicted_species)

# visualisation
fig, axes = plt.subplots(1, 3, figsize=(19, 6))

# Kmeans clusters
sns.heatmap(cm_kmeans, annot=True, fmt="d", cmap="OrRd", ax=axes[0])
axes[0].set_title("Confusion Matrix: K-Means Clustering")
axes[0].set_xlabel("Predicted Species (clusters)")
axes[0].set_ylabel("True Species")

# GMM clusters
sns.heatmap(cm_gmm, annot=True, fmt="d", cmap="OrRd", ax=axes[1])
axes[1].set_title("Confusion Matrix: Gaussian Mixture Models")
axes[1].set_xlabel("Predicted Species (clusters)")
axes[1].set_ylabel("True Species")

# Spectral clusters
sns.heatmap(cm_spectral, annot=True, fmt="d", cmap="OrRd", ax=axes[2])
axes[2].set_title("Confusion Matrix: Spectral Clustering")
axes[2].set_xlabel("Predicted Species (clusters)")
axes[2].set_ylabel("True Species")

plt.tight_layout()
plt.show()
# plt.savefig("./7-spectral-confusion-matrix-kmeans-gmm.png")

Spectral Clustering vs. K-Means on the Two-Moons Dataset

from sklearn.datasets import make_moons
from sklearn.cluster import KMeans, SpectralClustering

# generate the "two moons" dataset
X, y = make_moons(n_samples=1000, noise=0.05, random_state=0)

# K-Means will fail because the clusters are not spherical
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto')
y_kmeans = kmeans.fit_predict(X)

# Spectral Clustering will succeed because it sees the connectivity.
spectral = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', n_neighbors=30, random_state=0)
y_spectral = spectral.fit_predict(X)

# visualize the results
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# plot K-Means results
axes[0].scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='summer', alpha=0.75)
axes[0].set_title('K-Means Clustering (Fails)')
axes[0].set_xticks([])
axes[0].set_yticks([])

# plot Spectral Clustering results
axes[1].scatter(X[:, 0], X[:, 1], c=y_spectral, s=50, cmap='summer', alpha=0.75)
axes[1].set_title('Spectral Clustering (Succeeds)')
axes[1].set_xticks([])
axes[1].set_yticks([])

plt.suptitle('Comparison of Clustering Algorithms on Non-Convex Data')
plt.tight_layout()
plt.show()
# plt.savefig("./7-spectral-kmeans-two-moon-dataset.png")
penguins_cluster

5. Comparison of Trained Models

model_names = ['Spectral Clustering', 'Gaussian Mixture Models', 'DBSCAN', 
               'Hierarchical Clustering (Scikit-Learn)',
               'Hierarchical Clustering (SciPy)', 'K-Means Clustering']

sil_accuracy_scores = [sil_score_spectral, sil_score_gmm, sil_score_dbscan,
                       sil_score_hier_aggl, sil_score_hier_scipy, sil_score_kmeans]

ari_accuracy_scores = [ari_score_spectral, ari_score_gmm, ari_score_dbscan,
                       ari_score_hier_aggl, ari_score_hier_scipy, ari_score_kmeans]
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# silhouette scores
axes[0].barh(model_names, sil_accuracy_scores, color='tab:orange', alpha=0.75)
axes[0].set_xlabel('Silhouette Score')
axes[0].set_xlim(0.35, 0.5)
for i, score in enumerate(sil_accuracy_scores):
    axes[0].text(score + 0.01, i, f'{score:.2f}', va='center')

# adjusted rand index scores
axes[1].barh(model_names, ari_accuracy_scores, color='tab:green', alpha=0.75)
axes[1].set_xlabel('Adjusted Rand Index Score')
axes[1].set_xlim(0.7, 0.95)
axes[1].set_yticks([])
for i, score in enumerate(ari_accuracy_scores):
    axes[1].text(score + 0.01, i, f'{score:.2f}', va='center')

plt.tight_layout()
plt.show()
# plt.savefig("./7-comparison-sil-ari-scores.png")