Development of a system for generating synthetic data for testing
Synthetic test data is data specifically created to cover edge cases, boundary conditions, and specific scenarios that are rarely encountered in real-world data. Unlike synthetic training data (where statistical realism is important), test data should deliberately target particular scenarios.
Test data generation strategies
Rule-based generation - explicit description of rules and restrictions:
import json
import random
import uuid
from dataclasses import dataclass

import numpy as np
import pandas as pd
from faker import Faker
fake = Faker('ru_RU')
@dataclass
class TestUser:
    """A user record used as fixture input across the test-data generators."""
    user_id: str            # UUID4 string (see TestDataFactory.create_valid_user)
    email: str              # generated by Faker, or hand-crafted in edge cases
    age: int                # business range used by the factory: 18-80
    balance: float          # account balance; factory rounds to 2 decimals
    subscription_tier: str  # one of 'free', 'basic', 'premium'
class TestDataFactory:
    """Rule-based factory for synthetic test users and ML input grids."""

    def create_valid_user(self) -> TestUser:
        """Return a single randomly generated user that satisfies all business rules."""
        return TestUser(
            user_id=str(uuid.uuid4()),
            email=fake.email(),
            age=random.randint(18, 80),
            balance=round(random.uniform(0, 100_000), 2),
            subscription_tier=random.choice(['free', 'basic', 'premium'])
        )

    def create_edge_cases(self) -> list[TestUser]:
        """Edge cases for boundary-condition testing."""
        return [
            # Minimum age
            TestUser(str(uuid.uuid4()), fake.email(), 18, 0.0, 'free'),
            # Maximum balance
            TestUser(str(uuid.uuid4()), fake.email(), 65, 999_999.99, 'premium'),
            # Zero balance on a paid tier
            TestUser(str(uuid.uuid4()), fake.email(), 30, 0.0, 'premium'),
            # Special characters in the email local part
            # (original literal was garbled by email-protection; restored a
            # representative '+'/'.' address — confirm against original intent)
            TestUser(str(uuid.uuid4()), "user+tag.test@example.com", 25, 100.0, 'basic'),
        ]

    def create_ml_input_variants(self, n: int = 1000) -> pd.DataFrame:
        """Cover the feature space for ML-model testing.

        Args:
            n: number of rows to generate.

        Returns:
            DataFrame with exactly ``n`` rows.

        Fix: the original concatenated ``n//4 + n//4 + n//2`` values for
        ``days_since_last_purchase``, which falls short of ``n`` whenever
        ``n`` is not divisible by 4 and made the DataFrame constructor
        raise on mismatched column lengths. The remainder now goes into
        the random segment.
        """
        quarter = n // 4
        rest = n - 2 * quarter  # absorbs the remainder for any n
        return pd.DataFrame({
            'age': np.linspace(18, 80, n).astype(int),
            'balance': np.logspace(0, 6, n),  # log-spaced distribution
            'days_since_last_purchase': np.concatenate([
                np.zeros(quarter),               # 0 days (just purchased)
                np.ones(quarter) * 365,          # a year ago
                np.random.randint(1, 730, rest)  # random
            ]),
            'subscription_tier': np.random.choice(['free', 'basic', 'premium'], n)
        })
Generating test text with an LLM
from anthropic import Anthropic
class TextTestDataGenerator:
    """Generates text test cases for NLP systems via an LLM (Anthropic Claude)."""

    def __init__(self):
        self.client = Anthropic()

    @staticmethod
    def _parse_json(raw: str) -> list[dict]:
        """Parse a JSON payload from an LLM reply.

        Models frequently wrap JSON in markdown code fences even when asked
        for raw JSON; strip a leading ```/```json fence and a trailing ```
        before parsing so ``json.loads`` does not fail on them.

        Raises:
            json.JSONDecodeError: if the remaining text is not valid JSON.
        """
        text = raw.strip()
        if text.startswith("```"):
            # Drop the fence line itself (e.g. ```json)
            text = text.split("\n", 1)[1] if "\n" in text else ""
            if text.rstrip().endswith("```"):
                text = text.rstrip()[:-3]
        return json.loads(text)

    def generate_sentiment_test_cases(self) -> list[dict]:
        """Ask the LLM for a labeled set of sentiment-analysis test cases."""
        prompt = """Generate 20 test cases for sentiment analysis testing.
Include:
- 5 clearly positive reviews
- 5 clearly negative reviews
- 5 ambiguous/mixed reviews
- 5 edge cases (sarcasm, neutral, very short, all caps)
Format as JSON array with fields: text, expected_sentiment, category"""
        response = self.client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=2000,
            messages=[{"role": "user", "content": prompt}]
        )
        return self._parse_json(response.content[0].text)

    def generate_rag_test_queries(self, knowledge_base_summary: str) -> list[dict]:
        """Generate test queries for a RAG system over the given knowledge base."""
        prompt = f"""Given this knowledge base: {knowledge_base_summary}
Generate 30 test queries including:
- Direct factual questions (should return answer from KB)
- Questions outside KB scope (should return 'not found')
- Ambiguous queries (testing retrieval quality)
- Multi-hop questions requiring synthesis
Return JSON array with: query, expected_type, expected_answer_present"""
        response = self.client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=3000,
            messages=[{"role": "user", "content": prompt}]
        )
        return self._parse_json(response.content[0].text)
Generating data for testing ML models
class MLModelTestDataGenerator:
    """Synthesizes data for stress-testing ML models and drift monitoring."""

    def generate_distribution_shift(self, train_data: pd.DataFrame,
                                    shift_type: str) -> pd.DataFrame:
        """Generate data with deliberate drift to test drift monitoring.

        Args:
            train_data: reference frame; must contain an 'age' column for
                covariate shift and a binary 'target' column for concept shift.
            shift_type: either 'covariate' or 'concept'.

        Returns:
            A shifted copy of ``train_data`` (the input is not mutated).

        Raises:
            ValueError: for an unknown ``shift_type``. The original silently
                returned None here, which surfaced later as an opaque error.
        """
        test_data = train_data.copy()
        if shift_type == 'covariate':
            # Shift the feature distribution
            test_data['age'] = test_data['age'] + 15  # age shift
            return test_data
        if shift_type == 'concept':
            # Invert the label dependency (to test concept-drift detection)
            test_data['target'] = 1 - test_data['target']
            return test_data
        raise ValueError(f"Unknown shift_type: {shift_type!r}; "
                         f"expected 'covariate' or 'concept'")

    def generate_adversarial_examples(self, model, X: np.ndarray,
                                      epsilon: float = 0.1) -> np.ndarray:
        """FGSM adversarial examples for stress testing.

        Args:
            model: a callable torch module mapping a float tensor to an output
                tensor (assumed differentiable w.r.t. its input).
            X: input array; float-convertible.
            epsilon: perturbation magnitude.

        Returns:
            Perturbed inputs, clipped to the observed range of ``X``.
        """
        import torch
        X_tensor = torch.FloatTensor(X).requires_grad_(True)
        output = model(X_tensor)
        loss = output.sum()
        loss.backward()
        # Perturb each input in the direction of the loss gradient's sign (FGSM)
        adversarial = X + epsilon * X_tensor.grad.sign().numpy()
        # Keep the examples inside the observed input range
        return np.clip(adversarial, X.min(), X.max())
A properly designed test data system covers 95%+ of edge cases automatically, speeds up testing by 3-5 times, and allows the QA team to focus on truly complex scenarios.







