AI Synthetic Test Data Generation System

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services (1566 services total)
AI Synthetic Test Data Generation System
Medium
from 1 business day to 3 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1215
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823

Development of a system for generating synthetic data for testing

Synthetic test data is data specifically created to cover edge cases, boundary conditions, and specific scenarios that are rarely encountered in real-world data. Unlike synthetic training data (where statistical realism matters most), synthetic test data is designed to deliberately exercise particular, predetermined scenarios.

Test data generation strategies

Rule-based generation - explicit description of rules and restrictions:

import json
import random
import uuid
from dataclasses import dataclass

import numpy as np
import pandas as pd
from faker import Faker

# Shared module-level Faker instance; the 'ru_RU' locale localizes generated
# personal data. Used below via fake.email() in TestDataFactory.
fake = Faker('ru_RU')

@dataclass
class TestUser:
    """Schema of a synthetic test user record.

    NOTE: field order matters — instances are also constructed positionally
    (user_id, email, age, balance, subscription_tier) elsewhere in this file.
    """
    # UUID4 string identifying the test user
    user_id: str
    email: str
    # Generated in the 18-80 range by TestDataFactory
    age: int
    # Rounded to 2 decimal places by TestDataFactory
    balance: float
    # One of 'free' | 'basic' | 'premium'
    subscription_tier: str

class TestDataFactory:
    def create_valid_user(self) -> TestUser:
        return TestUser(
            user_id=str(uuid.uuid4()),
            email=fake.email(),
            age=random.randint(18, 80),
            balance=round(random.uniform(0, 100_000), 2),
            subscription_tier=random.choice(['free', 'basic', 'premium'])
        )

    def create_edge_cases(self) -> list[TestUser]:
        """Edge cases для тестирования"""
        return [
            # Минимальный возраст
            TestUser(str(uuid.uuid4()), fake.email(), 18, 0.0, 'free'),
            # Максимальный баланс
            TestUser(str(uuid.uuid4()), fake.email(), 65, 999_999.99, 'premium'),
            # Нулевой баланс
            TestUser(str(uuid.uuid4()), fake.email(), 30, 0.0, 'premium'),
            # Специальные символы в email
            TestUser(str(uuid.uuid4()), "[email protected]", 25, 100.0, 'basic'),
        ]

    def create_ml_input_variants(self, n: int = 1000) -> pd.DataFrame:
        """Покрытие feature space для тестирования ML модели"""
        return pd.DataFrame({
            'age': np.linspace(18, 80, n).astype(int),
            'balance': np.logspace(0, 6, n),  # Логарифмическое распределение
            'days_since_last_purchase': np.concatenate([
                np.zeros(n//4),        # 0 дней (только что купили)
                np.ones(n//4) * 365,   # Год назад
                np.random.randint(1, 730, n//2)  # Случайные
            ]),
            'subscription_tier': np.random.choice(['free', 'basic', 'premium'], n)
        })

LLM test text generation

from anthropic import Anthropic

class TextTestDataGenerator:
    """Generates text test datasets by prompting an LLM (Anthropic Claude)."""

    _MODEL = "claude-3-5-sonnet-20241022"

    def __init__(self):
        self.client = Anthropic()

    @staticmethod
    def _parse_json(text: str) -> list[dict]:
        """Parse a JSON payload, tolerating ```json ... ``` markdown fences.

        Models frequently wrap JSON answers in a fenced code block even when
        asked for raw JSON; plain json.loads would fail on that.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            # Drop the opening fence line (``` or ```json) and a closing fence.
            cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return json.loads(cleaned)

    def _complete(self, prompt: str, max_tokens: int) -> list[dict]:
        """Run one single-turn completion and return the parsed JSON body."""
        response = self.client.messages.create(
            model=self._MODEL,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )
        return self._parse_json(response.content[0].text)

    def generate_sentiment_test_cases(self) -> list[dict]:
        """Generate labeled review texts for sentiment-analysis testing."""
        prompt = """Generate 20 test cases for sentiment analysis testing.
Include:
- 5 clearly positive reviews
- 5 clearly negative reviews
- 5 ambiguous/mixed reviews
- 5 edge cases (sarcasm, neutral, very short, all caps)

Format as JSON array with fields: text, expected_sentiment, category"""
        return self._complete(prompt, max_tokens=2000)

    def generate_rag_test_queries(self, knowledge_base_summary: str) -> list[dict]:
        """Generate test queries (in/out of scope, multi-hop) for a RAG system."""
        prompt = f"""Given this knowledge base: {knowledge_base_summary}

Generate 30 test queries including:
- Direct factual questions (should return answer from KB)
- Questions outside KB scope (should return 'not found')
- Ambiguous queries (testing retrieval quality)
- Multi-hop questions requiring synthesis

Return JSON array with: query, expected_type, expected_answer_present"""
        return self._complete(prompt, max_tokens=3000)

Generation for testing ML models

class MLModelTestDataGenerator:
    """Synthetic data generators for stress-testing ML models and drift monitoring."""

    def generate_distribution_shift(self, train_data: pd.DataFrame,
                                     shift_type: str) -> pd.DataFrame:
        """Return a copy of ``train_data`` with a deliberate drift injected.

        Args:
            train_data: baseline frame; must contain an 'age' column for
                'covariate' shift and a binary 0/1 'target' column for
                'concept' shift.
            shift_type: 'covariate' or 'concept'.

        Raises:
            ValueError: if ``shift_type`` is not recognized.
        """
        if shift_type == 'covariate':
            # Shift the feature distribution: move ages up by 15 years
            test_data = train_data.copy()
            test_data['age'] = test_data['age'] + 15
            return test_data

        if shift_type == 'concept':
            # Invert the label dependency (for testing concept-drift detection)
            test_data = train_data.copy()
            test_data['target'] = 1 - test_data['target']
            return test_data

        # Previously unknown values silently returned None, hiding typos.
        raise ValueError(f"Unknown shift_type: {shift_type!r}")

    def generate_adversarial_examples(self, model, X: np.ndarray,
                                       epsilon: float = 0.1) -> np.ndarray:
        """FGSM adversarial examples for stress testing.

        Single-step fast gradient sign method: perturb each input by
        ``epsilon`` in the direction that increases the model output sum,
        then clip back into the value range observed in ``X``.
        """
        import torch  # local import: torch is only needed here
        X_tensor = torch.FloatTensor(X).requires_grad_(True)
        output = model(X_tensor)
        loss = output.sum()
        loss.backward()

        adversarial = X + epsilon * X_tensor.grad.sign().numpy()
        return np.clip(adversarial, X.min(), X.max())

A properly designed test data system covers 95%+ of edge cases automatically, speeds up testing by 3-5 times, and allows the QA team to focus on truly complex scenarios.