Training a Data Clustering Model
Clustering is unsupervised learning that reveals hidden structure in data: customer segments, thematic document groups, clusters of anomalous transactions. The choice of algorithm and a sound estimate of the number of clusters critically affect how well the results can be interpreted for the business.
Clustering Algorithm Selection
| Algorithm | Number of Clusters | Cluster Shape | Scales to (points) | Typical Use |
|---|---|---|---|---|
| K-Means | Must be specified | Spherical | >100K | Customer segmentation |
| DBSCAN | Found automatically | Arbitrary | ~50K | Anomalies, geospatial data |
| HDBSCAN | Found automatically | Arbitrary | >100K | Texts, images |
| Agglomerative | Must be specified | Arbitrary | ~10K | Document hierarchies |
| GMM | Must be specified | Ellipsoidal | ~50K | Soft cluster probabilities |
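As a quick illustration of the "soft probabilities" row, the sketch below fits a Gaussian Mixture Model with scikit-learn's `GaussianMixture`; the toy data and the 0.8 threshold are illustrative, not from a real pipeline.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

# Toy 2-D data: two overlapping blobs (illustrative only)
rng = np.random.default_rng(42)
X = np.vstack([rng.normal(0, 1, (500, 2)), rng.normal(2.5, 1, (500, 2))])

gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X)

# Unlike K-Means, GMM returns a membership probability per cluster,
# so borderline points can be flagged instead of hard-assigned
proba = gmm.predict_proba(X)                 # shape (n_samples, 2)
uncertain = (proba.max(axis=1) < 0.8).sum()
print(f"Points with max membership below 0.8: {uncertain}")
```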
K-Means with Optimal Number of Clusters
```python
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import numpy as np


class ClusteringPipeline:
    def __init__(self, scale: bool = True):
        self.scaler = StandardScaler() if scale else None
        self.model = None

    def find_optimal_k(self, X: np.ndarray,
                       k_range: range = range(2, 20)) -> int:
        """Elbow method + silhouette for choosing K."""
        if self.scaler:
            X = self.scaler.fit_transform(X)

        inertias, silhouettes = [], []
        for k in k_range:
            kmeans = MiniBatchKMeans(n_clusters=k, random_state=42,
                                     batch_size=1024, n_init=10)
            labels = kmeans.fit_predict(X)
            inertias.append(kmeans.inertia_)

            # Silhouette is quadratic in n -- subsample large datasets
            if len(X) > 50_000:
                sample_idx = np.random.choice(len(X), 10_000, replace=False)
                sil = silhouette_score(X[sample_idx], labels[sample_idx])
            else:
                sil = silhouette_score(X, labels)
            silhouettes.append(sil)

        # Elbow method: the sharpest change of slope in the inertia curve,
        # i.e. the largest second difference; diffs2[i] maps to k_range[i + 1]
        diffs2 = np.diff(inertias, n=2)
        elbow_k = k_range[int(np.argmax(diffs2)) + 1]

        # Best silhouette
        best_sil_k = k_range[int(np.argmax(silhouettes))]

        # Consensus: midpoint of the two estimates
        optimal_k = (elbow_k + best_sil_k) // 2
        print(f"Elbow method: k={elbow_k}, Silhouette: k={best_sil_k}, "
              f"Chosen: k={optimal_k}")
        return optimal_k

    def fit(self, X: np.ndarray, k: int = None):
        if k is None:
            # find_optimal_k scales internally, so pass the raw data;
            # passing pre-scaled data would refit the scaler on it and
            # silently break evaluate() later
            k = self.find_optimal_k(X)

        X_scaled = self.scaler.fit_transform(X) if self.scaler else X
        self.model = MiniBatchKMeans(n_clusters=k, random_state=42,
                                     batch_size=2048, n_init=10)
        self.labels = self.model.fit_predict(X_scaled)
        return self

    def evaluate(self, X: np.ndarray) -> dict:
        """Internal quality metrics on the training data."""
        X_scaled = self.scaler.transform(X) if self.scaler else X
        return {
            'silhouette': silhouette_score(X_scaled, self.labels,
                                           sample_size=min(10_000, len(X)),
                                           random_state=42),
            'calinski_harabasz': calinski_harabasz_score(X_scaled, self.labels),
            'n_clusters': len(np.unique(self.labels)),
            'cluster_sizes': dict(zip(*np.unique(self.labels, return_counts=True))),
        }
```
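A typical call might look like this; the feature matrix here is a synthetic stand-in, not real customer data:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(20_000, 6))          # stand-in for real customer features

pipeline = ClusteringPipeline(scale=True).fit(X)   # k is chosen automatically
metrics = pipeline.evaluate(X)
print(metrics['silhouette'], metrics['cluster_sizes'])
```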
HDBSCAN for Text Data
```python
import hdbscan
import numpy as np
from sentence_transformers import SentenceTransformer
from umap import UMAP


def cluster_documents(texts: list[str], min_cluster_size: int = 10) -> list[int]:
    # Sentence embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(texts, batch_size=256, show_progress_bar=True)

    # Dimensionality reduction before clustering: density estimates degrade
    # in high-dimensional spaces, so project the embeddings down first
    umap_model = UMAP(n_components=10, random_state=42, metric='cosine')
    reduced = umap_model.fit_transform(embeddings)

    # HDBSCAN determines the number of clusters itself
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,  # enables approximate_predict for new points
    )
    labels = clusterer.fit_predict(reduced)

    # Label -1 = noise/outliers
    print(f"Found {len(np.unique(labels[labels >= 0]))} clusters")
    print(f"Noise points: {(labels == -1).sum()}")
    return labels
```
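To assign new documents to the existing clusters without refitting, hdbscan provides `approximate_predict` (enabled above by `prediction_data=True`). The sketch below assumes the fitted `model`, `umap_model`, and `clusterer` are kept around, e.g. returned from `cluster_documents` alongside the labels, rather than discarded:

```python
# Sketch: label new texts against the existing clusters without refitting.
# Assumes `model`, `umap_model` and `clusterer` from cluster_documents
# are retained (e.g. returned from the function).
new_texts = ["refund request for order #123", "how do I reset my password"]
new_emb = model.encode(new_texts)
new_reduced = umap_model.transform(new_emb)  # UMAP supports out-of-sample transform
new_labels, strengths = hdbscan.approximate_predict(clusterer, new_reduced)
print(new_labels, strengths)                 # -1 = fits no existing cluster
```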
Cluster Interpretation
```python
import numpy as np
import pandas as pd


def describe_clusters(X_df: pd.DataFrame, labels: np.ndarray) -> dict:
    """Automatic description of each cluster."""
    cluster_descriptions = {}
    overall_mean = X_df.mean()
    overall_std = X_df.std()

    for cluster_id in np.unique(labels):
        if cluster_id == -1:  # skip HDBSCAN noise
            continue
        mask = labels == cluster_id

        # Cluster centroid in feature space
        centroid = X_df[mask].mean()

        # Most distinctive features: largest standardized deviation
        # of the centroid from the global mean
        diff = (centroid - overall_mean) / overall_std
        top_features = diff.abs().nlargest(5).index.tolist()

        cluster_descriptions[cluster_id] = {
            'size': int(mask.sum()),
            'size_pct': round(100 * mask.mean(), 1),
            'top_features': {f: float(centroid[f]) for f in top_features},
            'centroid': centroid.to_dict(),
        }
    return cluster_descriptions
```
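End to end, with hypothetical feature names (the columns and values below are made up for illustration):

```python
import numpy as np
import pandas as pd

# Hypothetical customer features; names and data are illustrative
rng = np.random.default_rng(0)
X_df = pd.DataFrame(rng.normal(size=(5_000, 4)),
                    columns=['recency', 'frequency', 'monetary', 'tenure'])

pipeline = ClusteringPipeline().fit(X_df.values, k=4)
for cluster_id, desc in describe_clusters(X_df, pipeline.labels).items():
    print(cluster_id, desc['size_pct'], desc['top_features'])
```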
A good customer clustering has a silhouette coefficient above 0.3, clusters the business can interpret, and stable assignments across re-runs (Jaccard similarity > 0.8 between runs); a sketch of the stability check follows.
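One way to measure that run-to-run stability is the pair-counting Jaccard index: the share of point pairs that both runs place in the same cluster. The helper below is our own sketch (not a scikit-learn function), built on `contingency_matrix`; the data is synthetic:

```python
import numpy as np
from scipy.special import comb
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.cluster import contingency_matrix


def pairwise_jaccard(labels_a: np.ndarray, labels_b: np.ndarray) -> float:
    """Pair-counting Jaccard: share of point pairs both runs co-cluster."""
    c = contingency_matrix(labels_a, labels_b)
    both = comb(c, 2).sum()                # pairs together in both runs
    in_a = comb(c.sum(axis=1), 2).sum()    # pairs together in run A
    in_b = comb(c.sum(axis=0), 2).sum()    # pairs together in run B
    return both / (in_a + in_b - both)


# Refit with different seeds on the same data and compare
rng = np.random.default_rng(0)
X = rng.normal(size=(5_000, 4))
a = MiniBatchKMeans(n_clusters=5, random_state=1, n_init=10).fit_predict(X)
b = MiniBatchKMeans(n_clusters=5, random_state=2, n_init=10).fit_predict(X)
print(f"Run-to-run Jaccard: {pairwise_jaccard(a, b):.2f}")   # want > 0.8
```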