UML Db

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
#Dataset
data = np.array([
    [15, 2], [16, 3], [15.5, 2.5], [18, 4],
    [50, 40], [51, 42], [49, 39], [12, 2],
    [80, 100], [82, 98], [81, 101], [85, 90]
])
#Standardize the data for better clustering performance
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
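#Note: StandardScaler rescales each feature to zero mean and unit variance, so the
#single eps radius used below measures distance on comparable scales for both features.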
#Apply DBSCAN
eps = 0.5
min_samples = 3
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(scaled_data)
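#Quick sanity check (not in the original post): DBSCAN labels noise as -1, so the
#number of clusters is the count of unique labels excluding -1.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print(f"Estimated clusters: {n_clusters}, noise points: {n_noise}")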
#Add cluster labels to the data
df = pd.DataFrame(data, columns=["Annual Spending", "Shopping Frequency"])
df["Cluster"] = labels
#Identify Core, Border and Noise Points
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True
df["Type"] = ["Core" if core_samples_mask[i] else "Border" if labels[i] != -1 else "Noise" for i in range(len(labels))]
#Print the resulting table
print("DBSCAN Result:")
print(df)
#Visualization
plt.figure(figsize=(10, 8))
unique_labels = np.unique(labels)
#plot each cluster with distinct colors
for label in unique_labels:
    cluster_points = df[df["Cluster"] == label]
    if label == -1: #Noise points
        color = 'red'
    else:
        color = plt.cm.Set1(label / max(unique_labels)) #Unique color for each cluster
    plt.scatter(cluster_points["Annual Spending"], cluster_points["Shopping Frequency"],
                label=f"Cluster {label}" if label != -1 else "Noise",
                color=color, s=100, edgecolor='black')
#Annotate core, border and noise points
for i, row in df.iterrows():
    plt.text(row["Annual Spending"] + 0.5, row["Shopping Frequency"],
             f"{row['Type']}", fontsize=8)
#Add labels and legend
plt.title("DBSCAN Clustering with Core, Border and Noise Points")
plt.xlabel("Annual Spending ($1000s)")
plt.ylabel("Shopping Frequency (times/year)")
plt.legend()
plt.grid(True)
plt.show()
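#Optional sketch (not from the original post): a common way to choose eps is the
#k-distance "elbow" plot - sort each point's distance to its min_samples-th nearest
#neighbor and look for the bend. Uses sklearn's NearestNeighbors on the scaled data.
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=min_samples).fit(scaled_data)
distances, _ = nn.kneighbors(scaled_data)
k_distances = np.sort(distances[:, -1])
plt.figure(figsize=(8, 4))
plt.plot(k_distances)
plt.title("k-distance Plot for Choosing eps")
plt.xlabel("Points sorted by k-distance")
plt.ylabel(f"Distance to k-th nearest neighbor (k = {min_samples})")
plt.grid(True)
plt.show()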
# Case Study: Customer Behavior Clustering
#Generate synthetic data with multiple clusters
np.random.seed(42)
#Cluster 1: Low income, low spending
cluster_1 = np.random.normal(loc=(30,30), scale=(5,5), size=(50,2))
#Cluster 2: High income, high spending
cluster_2 = np.random.normal(loc=(80,80), scale=(5,5), size=(50,2))
#Cluster 3: Medium income, medium spending
cluster_3 = np.random.normal(loc=(55,55), scale=(5,5), size=(50,2))
# Noise: Random sparse points
noise = np.random.uniform(low=(10,10), high=(90,90), size=(10,2))
#Combine all data
data = np.vstack((cluster_1, cluster_2, cluster_3, noise))
data = pd.DataFrame(data, columns=["Annual Income(k$)", "Spending Score (1-100)"])
#Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
#apply DBSCAN
eps = 0.5
min_samples = 5
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(scaled_data)
#Identify Core, Border and noise points
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True
#Add results
data["Cluster"] = labels
data["Type"] = ["Core" if core_samples_mask[i] else "Border" if labels[i] != -1 else "Noise" for i in range (len(labels))]
#Display results
print("DBSCAN Results:")
print(data)
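#Optional quality check (not in the original post): silhouette score over the clustered
#points only, since noise (label -1) does not belong to any cluster.
from sklearn.metrics import silhouette_score
mask = labels != -1
if mask.sum() > 0 and len(set(labels[mask])) > 1:
    print(f"Silhouette score (noise excluded): {silhouette_score(scaled_data[mask], labels[mask]):.3f}")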
#Visualization
plt.figure(figsize=(10,8))
unique_labels = np.unique(labels)
#Plot each cluster with distinct colors and noise in black
for label in unique_labels:
    cluster_points = data[data["Cluster"] == label]
    if label == -1: #Noise points
        color = 'black'
    else:
        color = plt.cm.Set1(label / len(unique_labels)) #Unique color for each cluster
    plt.scatter(cluster_points["Annual Income(k$)"], cluster_points["Spending Score (1-100)"],
                label=f"Cluster {label}" if label != -1 else "Noise",
                color=color, s=100, edgecolor='black')
#Annotate core, border and noise points
for i, row in data.iterrows():
    plt.text(row["Annual Income(k$)"] + 1, row["Spending Score (1-100)"],
             f"{row['Type']}", fontsize=8)
#Add labels and legend
plt.title("DBSCAN Clustering on Synthetic Customer Data")
plt.xlabel("Annual Income(K$)")
plt.ylabel("Spending Scoe (1-100)")
plt.legend()
plt.grid(True)
plt.show()
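#Optional summary (not in the original post): tally core/border/noise counts to see
#how sensitive the result is to the chosen eps and min_samples.
print(data["Type"].value_counts())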