UML Db

Invoker
2 min read
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Toy dataset: one row per point, columns are
# ("Annual Spending", "Shopping Frequency") as named in the DataFrame below.
data = np.array([
    [15, 2], [16, 3], [15.5, 2.5], [18, 4],
    [50, 40], [51, 42], [49, 39],
    [12, 2],
    [80, 100], [82, 98], [81, 101], [85, 90],
])

# Standardize both features so neither dominates the distance used by DBSCAN.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit DBSCAN on the standardized data and read the per-point cluster labels
# (-1 marks points that were not assigned to any cluster).
eps = .5
min_samples = 3
dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_data)
labels = dbscan.labels_

# Original (unscaled) values plus the assigned cluster label.
df = pd.DataFrame(data, columns=["Annual Spending", "Shopping Frequency"])
df["Cluster"] = labels

# Boolean mask: True at every row index that DBSCAN reported as a core sample.
core_samples_mask = np.isin(np.arange(len(labels)), dbscan.core_sample_indices_)

# Core  -> flagged as a core sample
# Noise -> not core and label == -1
# Border-> not core but still assigned to a cluster
df["Type"] = [
    "Core" if is_core else ("Noise" if cluster_id == -1 else "Border")
    for is_core, cluster_id in zip(core_samples_mask, labels)
]

# Print the resulting table (header line followed by the DataFrame).
print("DBSCAN Result:", df, sep="\n")

# Visualization of the toy example: one scatter series per DBSCAN label,
# with every point annotated as Core / Border / Noise.
plt.figure(figsize=(10, 8))
unique_labels = np.unique(labels)

# Plot each cluster with a distinct color; noise (label -1) is drawn in black.
for label in unique_labels:
    cluster_points = df[df["Cluster"] == label]
    if label == -1:  # Noise points
        # Black rather than red: the first Set1 color (used for cluster 0)
        # is itself red, so red noise would be hard to tell apart from it.
        color = 'black'
    else:
        # Index the qualitative colormap with an integer. The previous
        # `label / max(unique_labels)` evaluated 0/0 (NaN) when the only
        # cluster was 0, which makes the colormap return its "bad" color.
        # The modulo wraps around if there are more clusters than colors.
        color = plt.cm.Set1(int(label) % plt.cm.Set1.N)
    plt.scatter(cluster_points["Annual Spending"], cluster_points["Shopping Frequency"],
                label=f"Cluster {label}" if label != -1 else "Noise",
                color=color, s=100, edgecolor='black')

# Annotate every point with its type, slightly to the right of the marker.
for x_pos, y_pos, point_type in zip(df["Annual Spending"],
                                    df["Shopping Frequency"],
                                    df["Type"]):
    plt.text(x_pos + 0.5, y_pos, f"{point_type}", fontsize=8)

# Add labels and legend
plt.title("DBSCAN Clustering with Core, Border and Noise Points")
plt.xlabel("Annual Spending ($1000s)")
plt.ylabel("Shopping Frequency (times/year)")
plt.legend()
plt.grid(True)
plt.show()

# Case Study: Customer Behavior Clustering
# Build a synthetic data set: three Gaussian blobs plus a few uniformly
# scattered points. The seed is fixed so every run draws the same samples.
np.random.seed(42)
# Cluster 1: low income, low spending (centered at 30, 30)
cluster_1 = np.random.normal(loc=(30,30), scale=(5,5), size=(50,2))
# Cluster 2: high income, high spending (centered at 80, 80)
cluster_2 = np.random.normal(loc=(80,80), scale=(5,5), size=(50,2))
# Cluster 3: medium income, medium spending (centered at 55, 55)
cluster_3 = np.random.normal(loc=(55,55), scale=(5,5), size=(50,2))
# Noise: sparse points drawn uniformly between 10 and 90 on both axes
noise = np.random.uniform(low=(10,10), high=(90,90), size=(10,2))

# Stack the four groups row-wise and give the two columns readable names.
data = pd.DataFrame(
    np.vstack((cluster_1, cluster_2, cluster_3, noise)),
    columns=["Annual Income(k$)", "Spending Score (1-100)"],
)

# Standardize the two features before clustering.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit DBSCAN and read the per-point labels (-1 means "no cluster").
eps = 0.5
min_samples = 5
dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_data)
labels = dbscan.labels_

# Boolean mask: True at every row index that DBSCAN reported as a core sample.
core_samples_mask = np.isin(np.arange(len(labels)), dbscan.core_sample_indices_)

# Attach the results. "Type" is Core for core samples, Noise for non-core
# points labelled -1, and Border for the remaining clustered points.
data["Cluster"] = labels
data["Type"] = [
    "Core" if is_core else ("Noise" if cluster_id == -1 else "Border")
    for is_core, cluster_id in zip(core_samples_mask, labels)
]

# Display results (header line followed by the DataFrame).
print("DBSCAN Results:", data, sep="\n")

# Visualization of the case study: one scatter series per DBSCAN label,
# with every point annotated as Core / Border / Noise.
plt.figure(figsize=(10,8))
unique_labels = np.unique(labels)

# Plot each cluster with its own color; noise (label -1) is drawn in black.
for label in unique_labels:
    cluster_points = data[data["Cluster"] == label]
    if label == -1:  # Noise points
        color = 'black'
    else:
        # Index the qualitative colormap with an integer so consecutive
        # cluster ids get consecutive, distinct palette entries. The previous
        # float `label / len(unique_labels)` was binned into the palette and
        # could hand the same color to neighbouring clusters once there were
        # about as many labels as palette entries. The modulo wraps around
        # if there are more clusters than colors.
        color = plt.cm.Set1(int(label) % plt.cm.Set1.N)
    plt.scatter(cluster_points["Annual Income(k$)"], cluster_points["Spending Score (1-100)"],
                label=f"Cluster {label}" if label != -1 else "Noise",
                color=color, s=100, edgecolor='black')

# Annotate every point with its type, slightly to the right of the marker.
for x_pos, y_pos, point_type in zip(data["Annual Income(k$)"],
                                    data["Spending Score (1-100)"],
                                    data["Type"]):
    plt.text(x_pos + 1, y_pos, f"{point_type}", fontsize=8)

# Add labels and legend
plt.title("DBSCAN Clustering on Synthetic Customer Data")
plt.xlabel("Annual Income(K$)")
plt.ylabel("Spending Score (1-100)")  # fixed typo: was "Spending Scoe"
plt.legend()
plt.grid(True)
plt.show()
0
Subscribe to my newsletter

Read articles from Invoker directly inside your inbox. Subscribe to the newsletter, and don't miss out.

Written by

Invoker
Invoker