Training a Data Clustering Model
Clustering is unsupervised learning that reveals hidden structure in data: customer segments, thematic document groups, clusters of anomalous transactions. The choice of algorithm and a sound estimate of the number of clusters critically affect how well the results can be interpreted for the business.
Clustering Algorithm Selection
| Algorithm | Number of Clusters | Cluster Shape | Scales to (points) | Typical Use |
|---|---|---|---|---|
| K-Means | Must be specified | Spherical | >100K | Customer segmentation |
| DBSCAN | Found automatically | Arbitrary | ~50K | Anomalies, geospatial data |
| HDBSCAN | Found automatically | Arbitrary | >100K | Texts, images |
| Agglomerative | Must be specified | Arbitrary | ~10K | Document hierarchies |
| GMM | Must be specified | Ellipsoidal | ~50K | Soft cluster probabilities |
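As a quick illustration of the "soft probabilities" row, the sketch below fits a Gaussian Mixture Model with scikit-learn's `GaussianMixture`; the toy data and the 0.8 threshold are illustrative, not from a real pipeline.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

# Toy 2-D data: two overlapping blobs (illustrative only)
rng = np.random.default_rng(42)
X = np.vstack([rng.normal(0, 1, (500, 2)), rng.normal(2.5, 1, (500, 2))])

gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X)

# Unlike K-Means, GMM returns a membership probability per cluster,
# so borderline points can be flagged instead of hard-assigned
proba = gmm.predict_proba(X)                 # shape (n_samples, 2)
uncertain = (proba.max(axis=1) < 0.8).sum()
print(f"Points with max membership below 0.8: {uncertain}")
```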
K-Means with Optimal Number of Clusters
```python
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import numpy as np


class ClusteringPipeline:
    def __init__(self, scale: bool = True):
        self.scaler = StandardScaler() if scale else None
        self.model = None

    def find_optimal_k(self, X: np.ndarray,
                       k_range: range = range(2, 20)) -> int:
        """Elbow method + silhouette for choosing K."""
        if self.scaler:
            X = self.scaler.fit_transform(X)

        inertias, silhouettes = [], []
        for k in k_range:
            kmeans = MiniBatchKMeans(n_clusters=k, random_state=42,
                                     batch_size=1024, n_init=10)
            labels = kmeans.fit_predict(X)
            inertias.append(kmeans.inertia_)

            # Silhouette is quadratic in n -- subsample large datasets
            if len(X) > 50_000:
                sample_idx = np.random.choice(len(X), 10_000, replace=False)
                sil = silhouette_score(X[sample_idx], labels[sample_idx])
            else:
                sil = silhouette_score(X, labels)
            silhouettes.append(sil)

        # Elbow method: the sharpest change of slope in the inertia curve,
        # i.e. the largest second difference; diffs2[i] maps to k_range[i + 1]
        diffs2 = np.diff(inertias, n=2)
        elbow_k = k_range[int(np.argmax(diffs2)) + 1]

        # Best silhouette
        best_sil_k = k_range[int(np.argmax(silhouettes))]

        # Consensus: midpoint of the two estimates
        optimal_k = (elbow_k + best_sil_k) // 2
        print(f"Elbow method: k={elbow_k}, Silhouette: k={best_sil_k}, "
              f"Chosen: k={optimal_k}")
        return optimal_k

    def fit(self, X: np.ndarray, k: int = None):
        if k is None:
            # find_optimal_k scales internally, so pass the raw data;
            # passing pre-scaled data would refit the scaler on it and
            # silently break evaluate() later
            k = self.find_optimal_k(X)

        X_scaled = self.scaler.fit_transform(X) if self.scaler else X
        self.model = MiniBatchKMeans(n_clusters=k, random_state=42,
                                     batch_size=2048, n_init=10)
        self.labels = self.model.fit_predict(X_scaled)
        return self

    def evaluate(self, X: np.ndarray) -> dict:
        """Internal quality metrics on the training data."""
        X_scaled = self.scaler.transform(X) if self.scaler else X
        return {
            'silhouette': silhouette_score(X_scaled, self.labels,
                                           sample_size=min(10_000, len(X)),
                                           random_state=42),
            'calinski_harabasz': calinski_harabasz_score(X_scaled, self.labels),
            'n_clusters': len(np.unique(self.labels)),
            'cluster_sizes': dict(zip(*np.unique(self.labels, return_counts=True))),
        }
```
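A typical call might look like this; the feature matrix here is a synthetic stand-in, not real customer data:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(20_000, 6))          # stand-in for real customer features

pipeline = ClusteringPipeline(scale=True).fit(X)   # k is chosen automatically
metrics = pipeline.evaluate(X)
print(metrics['silhouette'], metrics['cluster_sizes'])
```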
HDBSCAN for Text Data
```python
import hdbscan
import numpy as np
from sentence_transformers import SentenceTransformer
from umap import UMAP


def cluster_documents(texts: list[str], min_cluster_size: int = 10) -> list[int]:
    # Sentence embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(texts, batch_size=256, show_progress_bar=True)

    # Dimensionality reduction before clustering: density estimates degrade
    # in high-dimensional spaces, so project the embeddings down first
    umap_model = UMAP(n_components=10, random_state=42, metric='cosine')
    reduced = umap_model.fit_transform(embeddings)

    # HDBSCAN determines the number of clusters itself
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,  # enables approximate_predict for new points
    )
    labels = clusterer.fit_predict(reduced)

    # Label -1 = noise/outliers
    print(f"Found {len(np.unique(labels[labels >= 0]))} clusters")
    print(f"Noise points: {(labels == -1).sum()}")
    return labels
```
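To assign new documents to the existing clusters without refitting, hdbscan provides `approximate_predict` (enabled above by `prediction_data=True`). The sketch below assumes the fitted `model`, `umap_model`, and `clusterer` are kept around, e.g. returned from `cluster_documents` alongside the labels, rather than discarded:

```python
# Sketch: label new texts against the existing clusters without refitting.
# Assumes `model`, `umap_model` and `clusterer` from cluster_documents
# are retained (e.g. returned from the function).
new_texts = ["refund request for order #123", "how do I reset my password"]
new_emb = model.encode(new_texts)
new_reduced = umap_model.transform(new_emb)  # UMAP supports out-of-sample transform
new_labels, strengths = hdbscan.approximate_predict(clusterer, new_reduced)
print(new_labels, strengths)                 # -1 = fits no existing cluster
```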
Cluster Interpretation
```python
import numpy as np
import pandas as pd


def describe_clusters(X_df: pd.DataFrame, labels: np.ndarray) -> dict:
    """Automatic description of each cluster."""
    cluster_descriptions = {}
    overall_mean = X_df.mean()
    overall_std = X_df.std()

    for cluster_id in np.unique(labels):
        if cluster_id == -1:  # skip HDBSCAN noise
            continue
        mask = labels == cluster_id

        # Cluster centroid in feature space
        centroid = X_df[mask].mean()

        # Most distinctive features: largest standardized deviation
        # of the centroid from the global mean
        diff = (centroid - overall_mean) / overall_std
        top_features = diff.abs().nlargest(5).index.tolist()

        cluster_descriptions[cluster_id] = {
            'size': int(mask.sum()),
            'size_pct': round(100 * mask.mean(), 1),
            'top_features': {f: float(centroid[f]) for f in top_features},
            'centroid': centroid.to_dict(),
        }
    return cluster_descriptions
```
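End to end, with hypothetical feature names (the columns and values below are made up for illustration):

```python
import numpy as np
import pandas as pd

# Hypothetical customer features; names and data are illustrative
rng = np.random.default_rng(0)
X_df = pd.DataFrame(rng.normal(size=(5_000, 4)),
                    columns=['recency', 'frequency', 'monetary', 'tenure'])

pipeline = ClusteringPipeline().fit(X_df.values, k=4)
for cluster_id, desc in describe_clusters(X_df, pipeline.labels).items():
    print(cluster_id, desc['size_pct'], desc['top_features'])
```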
A good customer clustering has a silhouette coefficient above 0.3, clusters the business can interpret, and stable assignments across re-runs (Jaccard similarity > 0.8 between runs); a sketch of the stability check follows.
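One way to measure that run-to-run stability is the pair-counting Jaccard index: the share of point pairs that both runs place in the same cluster. The helper below is our own sketch (not a scikit-learn function), built on `contingency_matrix`; the data is synthetic:

```python
import numpy as np
from scipy.special import comb
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.cluster import contingency_matrix


def pairwise_jaccard(labels_a: np.ndarray, labels_b: np.ndarray) -> float:
    """Pair-counting Jaccard: share of point pairs both runs co-cluster."""
    c = contingency_matrix(labels_a, labels_b)
    both = comb(c, 2).sum()                # pairs together in both runs
    in_a = comb(c.sum(axis=1), 2).sum()    # pairs together in run A
    in_b = comb(c.sum(axis=0), 2).sum()    # pairs together in run B
    return both / (in_a + in_b - both)


# Refit with different seeds on the same data and compare
rng = np.random.default_rng(0)
X = rng.normal(size=(5_000, 4))
a = MiniBatchKMeans(n_clusters=5, random_state=1, n_init=10).fit_predict(X)
b = MiniBatchKMeans(n_clusters=5, random_state=2, n_init=10).fit_predict(X)
print(f"Run-to-run Jaccard: {pairwise_jaccard(a, b):.2f}")   # want > 0.8
```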