import numpy as np
import matplotlib.pyplot as plt

def _nearest_centroid(data, centroids):
    """Return, for every row of ``data``, the index of its closest centroid."""
    # Broadcasting (n, 1, d) - (k, d) gives all pairwise differences (n, k, d).
    distances = np.linalg.norm(data[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)


def kmeans(data, k, max_iter=100):
    """Cluster the rows of ``data`` into ``k`` groups with Lloyd's algorithm.

    The initial centroids are ``k`` distinct rows of ``data`` drawn with a
    fixed seed, so repeated calls on the same input give the same result.

    Args:
        data: Array-like of shape ``(n_samples, n_features)``.
        k: Number of clusters. Must not exceed ``n_samples`` because the
            initial centroids are sampled without replacement.
        max_iter: Maximum number of assignment/update rounds.

    Returns:
        A ``(centroids, clusters)`` tuple: ``centroids`` is a float array of
        shape ``(k, n_features)`` and ``clusters`` holds, for each sample,
        the index of the centroid it was assigned to.

    Raises:
        ValueError: If ``k`` is larger than the number of samples.
    """
    data = np.asarray(data)

    # A private generator seeded with 0 draws the same rows as seeding the
    # global NumPy RNG would, without resetting the caller's random state.
    rng = np.random.RandomState(0)
    initial_rows = rng.choice(data.shape[0], k, replace=False)
    centroids = data[initial_rows].astype(float)

    clusters = None
    for _ in range(max_iter):
        # Assignment step
        clusters = _nearest_centroid(data, centroids)

        # Update step. A cluster with no members keeps its previous centroid;
        # taking the mean of an empty selection would turn it into NaN.
        new_centroids = centroids.copy()
        for i in range(k):
            members = data[clusters == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        # Check for convergence
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids

    if clusters is None:
        # max_iter <= 0: the loop never ran, so label against the initial centroids.
        clusters = _nearest_centroid(data, centroids)

    return centroids, clusters

def plot_clusters(data, centroids, clusters, title):
    """Show a scatter plot of clustered points together with their centroids.

    Points are coloured by cluster label and centroids are drawn as red
    ``X`` markers. One-feature data is laid out along a horizontal line at
    y = 0, two-feature data in the plane, and three-feature data on 3D axes.

    Args:
        data: Array of shape ``(n_samples, n_features)`` with 1, 2 or 3
            features.
        centroids: Array of shape ``(k, n_features)``.
        clusters: Per-sample cluster labels, used to colour the points.
        title: Title of the plot.

    Raises:
        ValueError: If ``data`` does not have 1, 2 or 3 features.
    """
    n_features = data.shape[1]
    # Reject unsupported dimensionality before creating a figure; otherwise
    # a blank window would be shown with a legend that has no entries.
    if n_features not in (1, 2, 3):
        raise ValueError(
            f"plot_clusters supports 1, 2 or 3 features, got {n_features}"
        )

    plt.figure(figsize=(8, 6))
    if n_features == 1:
        # Every point gets y = 0 so the single feature spreads along the x axis.
        plt.scatter(data, np.zeros_like(data), c=clusters, cmap='viridis', s=100)
        plt.scatter(centroids, np.zeros_like(centroids), color='red', marker='X', s=200, label='Centroids')
        plt.title(title)
        plt.xlabel('Feature 1')
        plt.yticks([])  # the y axis carries no information here
    elif n_features == 2:
        plt.scatter(data[:, 0], data[:, 1], c=clusters, cmap='viridis', s=100)
        plt.scatter(centroids[:, 0], centroids[:, 1], color='red', marker='X', s=200, label='Centroids')
        plt.title(title)
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
    else:
        ax = plt.axes(projection='3d')
        ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=clusters, cmap='viridis', s=100)
        ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2], color='red', marker='X', s=200, label='Centroids')
        ax.set_title(title)
        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')
        ax.set_zlabel('Feature 3')

    plt.legend()
    plt.show()

if __name__ == "__main__":
    # Each demo is (sample points, number of clusters, plot title); the
    # three entries cover one-, two- and three-feature data in that order.
    demos = [
        (
            np.array([[1], [1.5], [2], [2.5], [3], [8], [8.5], [9], [10]]),
            2,
            "K-means Clustering (1D)",
        ),
        (
            np.array([[1, 2], [1, 4], [1, 0],
                      [10, 2], [10, 4], [10, 0],
                      [5, 5], [5, 6], [5, 7]]),
            3,
            "K-means Clustering (2D)",
        ),
        (
            np.array([[1, 2, 3], [1, 4, 3], [1, 0, 3],
                      [10, 2, 3], [10, 4, 3], [10, 0, 3],
                      [5, 2, 5], [5, 3, 5], [5, 4, 5]]),
            2,
            "K-means Clustering (3D)",
        ),
    ]

    # Cluster each dataset, then display the result before moving on.
    for points, n_clusters, heading in demos:
        found_centroids, labels = kmeans(points, n_clusters)
        plot_clusters(points, found_centroids, labels, heading)
K-Means Clustering for N-Dimensional Feature Spaces

Output:

1 Dimension

2 Dimensions

3 Dimensions

Subscribe to my newsletter

Mohamad Mahmood

Mohamad Mahmood