Data Clustering Model Training

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services.
Data Clustering Model Training
Medium
~3-5 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1214
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823

Data Clustering Model Training

Clustering is unsupervised learning that reveals hidden data structure: customer segments, thematic document clusters, anomalous transaction groups. Choice of algorithm and correct determination of the number of clusters critically affect business interpretation of results.

Clustering Algorithm Selection

| Algorithm     | Num. Clusters | Cluster Shape | Scale | Application           |
|---------------|---------------|---------------|-------|-----------------------|
| K-Means       | Must specify  | Spherical     | >100K | Customer segmentation |
| DBSCAN        | Auto          | Any           | ~50K  | Anomalies, geospatial |
| HDBSCAN       | Auto          | Any           | >100K | Texts, images         |
| Agglomerative | Must specify  | Any           | ~10K  | Document hierarchy    |
| GMM           | Must specify  | Ellipsoidal   | ~50K  | Soft probabilities    |

K-Means with Optimal Number of Clusters

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler

class ClusteringPipeline:
    """K-Means clustering pipeline: optional standardization, automatic
    selection of the number of clusters, fitting and quality evaluation.

    The scaler (if enabled) is fitted exactly once, on the raw data passed
    to fit() or find_optimal_k(); evaluate() reuses it via transform().
    """

    def __init__(self, scale: bool = True):
        self.scaler = StandardScaler() if scale else None
        self.model = None
        self.labels = None  # set by fit()

    def find_optimal_k(self, X: np.ndarray,
                        k_range: range = range(2, 20)) -> int:
        """Elbow method + silhouette for determining K.

        Fits the scaler (if enabled) on X, then searches k_range.
        Returns the chosen number of clusters.
        """
        if self.scaler:
            X = self.scaler.fit_transform(X)
        return self._search_k(X, k_range)

    def _search_k(self, X_scaled: np.ndarray,
                  k_range: range = range(2, 20)) -> int:
        """Search K over already-scaled data (no scaler re-fit).

        Combines the elbow criterion (max curvature of the inertia curve)
        with the best silhouette score.
        """
        k_values = list(k_range)
        inertias = []
        silhouettes = []

        # Seeded generator: silhouette subsampling is reproducible, and
        # replace=False avoids counting the same point twice.
        rng = np.random.default_rng(42)

        for k in k_values:
            kmeans = MiniBatchKMeans(n_clusters=k, random_state=42,
                                     batch_size=1024)
            labels = kmeans.fit_predict(X_scaled)
            inertias.append(kmeans.inertia_)

            if len(X_scaled) > 50000:
                sample_idx = rng.choice(len(X_scaled), 10000, replace=False)
                sil = silhouette_score(X_scaled[sample_idx], labels[sample_idx])
            else:
                sil = silhouette_score(X_scaled, labels)
            silhouettes.append(sil)

        # Elbow: the second difference diffs2[i] is centered at k_values[i+1]
        # (it uses inertias[i], inertias[i+1], inertias[i+2]), so the correct
        # offset is +1, not +2.
        diffs2 = np.diff(inertias, n=2)
        elbow_k = k_values[int(np.argmax(diffs2)) + 1]

        # Best silhouette
        best_sil_k = k_values[int(np.argmax(silhouettes))]

        # Compromise: midpoint between the two criteria.
        optimal_k = (elbow_k + best_sil_k) // 2
        print(f"Elbow method: k={elbow_k}, Silhouette: k={best_sil_k}, Chosen: k={optimal_k}")
        return optimal_k

    def fit(self, X: np.ndarray, k: int = None):
        """Fit MiniBatchKMeans on X; if k is None, choose it automatically."""
        if self.scaler:
            X_scaled = self.scaler.fit_transform(X)
        else:
            X_scaled = X

        if k is None:
            # X_scaled is already standardized — delegate to _search_k so the
            # scaler is NOT re-fitted on scaled data (which would corrupt
            # evaluate()'s transform of raw inputs).
            k = self._search_k(X_scaled)

        self.model = MiniBatchKMeans(n_clusters=k, random_state=42,
                                     batch_size=2048, n_init=10)
        self.labels = self.model.fit_predict(X_scaled)
        return self

    def evaluate(self, X: np.ndarray) -> dict:
        """Quality metrics for the fitted clustering.

        X must be the same data fit() was called with, since the cached
        labels from fit() are reused.
        """
        if self.labels is None:
            raise RuntimeError("fit() must be called before evaluate()")
        X_scaled = self.scaler.transform(X) if self.scaler else X
        return {
            'silhouette': silhouette_score(X_scaled, self.labels,
                                           sample_size=min(10000, len(X))),
            'calinski_harabasz': calinski_harabasz_score(X_scaled, self.labels),
            'n_clusters': len(np.unique(self.labels)),
            'cluster_sizes': dict(zip(*np.unique(self.labels, return_counts=True)))
        }

HDBSCAN for Text Data

import hdbscan
from sentence_transformers import SentenceTransformer

def cluster_documents(texts: list[str], min_cluster_size: int = 10) -> list[int]:
    """Cluster documents by semantic similarity.

    Pipeline: sentence embeddings -> UMAP reduction -> HDBSCAN.
    Returns one label per input text; -1 marks noise/outliers.
    """
    # Sentence embeddings
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = encoder.encode(texts, batch_size=256, show_progress_bar=True)

    # Reduce dimensionality before density-based clustering
    from umap import UMAP
    reducer = UMAP(n_components=10, random_state=42, metric='cosine')
    projected = reducer.fit_transform(vectors)

    # Density-based clustering; 'eom' selects the most stable clusters
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    labels = clusterer.fit_predict(projected)

    # Label -1 = noise/outlier
    n_found = len(np.unique(labels[labels >= 0]))
    print(f"Found {n_found} clusters")
    print(f"Noise points: {(labels == -1).sum()}")
    return labels

Cluster Interpretation

def describe_clusters(X_df: pd.DataFrame, labels: np.ndarray) -> dict:
    """Automatic description of each cluster.

    Args:
        X_df: numeric feature matrix, one row per sample.
        labels: cluster label per row; -1 (noise, e.g. from HDBSCAN) is skipped.

    Returns:
        {cluster_id: {'size', 'size_pct', 'top_features', 'centroid'}},
        where top_features are the up-to-5 features whose cluster mean
        deviates most (in std units) from the global mean.
    """
    cluster_descriptions = {}

    # Loop-invariant global statistics, computed once.
    overall_mean = X_df.mean()
    # Zero-variance columns would give 0/0 = NaN scores; replacing 0 with NaN
    # makes nlargest() skip constant features explicitly.
    feature_std = X_df.std().replace(0, np.nan)

    for cluster_id in np.unique(labels):
        if cluster_id == -1:
            continue  # noise/outliers
        mask = labels == cluster_id
        cluster_df = X_df[mask]

        # Cluster centroid in feature space
        centroid = cluster_df.mean()

        # Most distinctive features: standardized deviation from global mean
        diff = (centroid - overall_mean) / feature_std
        top_features = diff.abs().nlargest(5).index.tolist()

        # Plain Python ints/floats so the result is JSON-serializable.
        cluster_descriptions[int(cluster_id)] = {
            'size': int(mask.sum()),
            'size_pct': float(mask.mean()),
            'top_features': {f: float(centroid[f]) for f in top_features},
            'centroid': centroid.to_dict()
        }

    return cluster_descriptions

Good customer clustering has silhouette coefficient > 0.3, business-interpretable clusters, and stability on re-runs (Jaccard similarity > 0.8 between runs).