Dataset Quality Validation for LLM Fine-tuning
Dataset validation is a mandatory step before launching costly fine-tuning. Problems in a poor dataset typically surface only after training, once GPU hours and calendar time have already been spent. Systematic validation prevents this.
Validation Levels
Level 1 — Technical (automated):
from dataclasses import dataclass
import pandas as pd
@dataclass
class ValidationReport:
    """Result of Level-1 (technical) validation of a fine-tuning dataset."""

    # Number of examples scanned.
    total_examples: int
    # Issue name -> list of offending example indices.
    issues: dict
    # Fraction of examples with no flagged issue, in [0, 1].
    pass_rate: float
    # Human-readable remediation advice.
    recommendations: list[str]
class DatasetValidator:
    """Level-1 technical validation: cheap, fully automated checks run
    before any GPU time is spent.

    Per-example checks: empty output, token-length bounds, likely
    truncation, and UTF-8 encoding problems.  The 'near_duplicates'
    bucket is reported but not populated here — duplicate detection is
    not implemented in this snippet.
    """

    # Token-length thresholds — tune per task and model context window.
    MIN_TOKENS = 5
    MAX_TOKENS = 2000
    # Long outputs ending mid-sentence are flagged as truncated.
    TRUNCATION_MIN_TOKENS = 500
    # Characters that plausibly terminate a complete output.
    _SENTENCE_ENDINGS = '.!?])"\''

    def validate(self, dataset: list[dict], tokenizer=None) -> ValidationReport:
        """Run all technical checks over ``dataset``.

        Args:
            dataset: examples; each dict is expected to carry an 'output'
                key (a missing or empty output is flagged, not fatal).
            tokenizer: optional object with an ``encode(str) -> list``
                method.  Defaults to the Llama-2 tokenizer (original
                behavior); injectable so validation and testing do not
                require downloading model weights.

        Returns:
            ValidationReport with per-issue example indices, overall pass
            rate, and remediation advice.
        """
        issues: dict[str, list[int]] = {
            'empty_outputs': [],
            'too_short': [],
            'too_long': [],
            'truncated': [],
            'encoding_issues': [],
            'near_duplicates': [],
        }
        # Empty dataset: the original raised ZeroDivisionError here.
        # pass_rate 0.0 forces a NO-GO downstream, which is the safe call.
        if not dataset:
            return ValidationReport(
                total_examples=0,
                issues=issues,
                pass_rate=0.0,
                recommendations=['Dataset is empty — nothing to train on.'],
            )
        if tokenizer is None:
            # NOTE(review): AutoTokenizer needs
            # `from transformers import AutoTokenizer` — the import is not
            # visible in this snippet; confirm it exists at module level.
            tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
        for i, ex in enumerate(dataset):
            output = ex.get('output', '')
            # Empty output: no further checks make sense for this example.
            if not output.strip():
                issues['empty_outputs'].append(i)
                continue
            # Length in tokens.
            tokens = tokenizer.encode(output)
            if len(tokens) < self.MIN_TOKENS:
                issues['too_short'].append(i)
            elif len(tokens) > self.MAX_TOKENS:
                issues['too_long'].append(i)
            # Potentially truncated: a long output that does not end with
            # sentence-final punctuation was likely cut off mid-generation.
            if output.strip()[-1] not in self._SENTENCE_ENDINGS:
                if len(tokens) > self.TRUNCATION_MIN_TOKENS:
                    issues['truncated'].append(i)
            # Encoding issues: lone surrogates raise on a UTF-8 round-trip.
            try:
                output.encode('utf-8').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                issues['encoding_issues'].append(i)
        # One example can land in several buckets, so the sum can exceed
        # len(dataset); clamp so pass_rate never goes negative.
        total_issues = sum(len(v) for v in issues.values())
        pass_rate = max(0.0, 1 - total_issues / len(dataset))
        return ValidationReport(
            total_examples=len(dataset),
            issues=issues,
            pass_rate=pass_rate,
            recommendations=self._generate_recommendations(issues, len(dataset)),
        )

    def _generate_recommendations(self, issues: dict, total: int) -> list[str]:
        """Turn issue buckets into remediation advice.

        This method was called by ``validate`` but never defined in the
        original — an AttributeError on every run.
        """
        recs: list[str] = []
        if issues['empty_outputs']:
            recs.append(f"Drop {len(issues['empty_outputs'])} examples with empty outputs.")
        if issues['too_short']:
            recs.append(f"Review {len(issues['too_short'])} very short outputs (<{self.MIN_TOKENS} tokens).")
        if issues['too_long']:
            recs.append(f"Trim or split {len(issues['too_long'])} outputs over {self.MAX_TOKENS} tokens.")
        if issues['truncated']:
            recs.append(f"Regenerate {len(issues['truncated'])} potentially truncated outputs.")
        if issues['encoding_issues']:
            recs.append(f"Fix encoding in {len(issues['encoding_issues'])} examples.")
        if not recs:
            recs.append('No technical issues found.')
        return recs
Level 2 — Semantic (automated):
class SemanticValidator:
    """Level-2 semantic validation: LLM-as-judge scoring of how well each
    output actually addresses its instruction."""

    def check_instruction_output_alignment(self, dataset: list[dict],
                                           sample_size: int = 200) -> float:
        """Mean instruction/output alignment over a random sample.

        Args:
            dataset: examples with 'instruction', optional 'input', 'output'.
            sample_size: at most this many examples are judged — LLM calls
                are the cost driver.

        Returns:
            Mean alignment in [0, 1].  Returns 0.0 for an empty dataset so
            the downstream GO/NO-GO check fails explicitly instead of
            comparing against NaN (``np.mean([])`` is NaN, and NaN > 0.7 is
            silently False — the original's behavior was accidental).
        """
        if not dataset:
            return 0.0
        sample = random.sample(dataset, min(sample_size, len(dataset)))
        alignment_scores = [
            self._compute_alignment(
                ex['instruction'], ex.get('input', ''), ex['output']
            )
            for ex in sample
        ]
        return float(np.mean(alignment_scores))

    def _compute_alignment(self, instruction: str, input_text: str,
                           output: str) -> float:
        """LLM-judge one example's relevance; returns a score in [0, 1].

        The former ``input`` parameter shadowed the builtin — renamed
        (callers pass it positionally, so this is interface-safe).
        Judge replies are parsed defensively: non-numeric answers fall
        back to 0.5 and out-of-range numbers are clamped into [0, 1].
        """
        prompt = f"""Does this output correctly address the instruction?
Instruction: {instruction}
Input: {input_text}
Output: {output[:500]}
Rate relevance 1-5, return only number."""
        # NOTE(review): llm_client is assumed to be a module-level client —
        # confirm it is defined/imported elsewhere in the file.
        response = llm_client.complete(prompt, max_tokens=5)
        try:
            score = int(response.strip()) / 5.0
        except ValueError:
            score = 0.5  # Unparseable judge reply -> neutral score.
        return min(1.0, max(0.0, score))
Level 3 — Substantive (manual review):
def sample_for_human_review(dataset: list[dict],
                            n: int = 100) -> list[dict]:
    """Draw a length-stratified sample for manual (Level-3) review.

    Examples are bucketed by output length in whitespace-separated words:
    short (<50), medium (50-199), long (>=200).  The quota is split evenly
    across the three strata; any remainder of ``n // 3`` is given to the
    earliest strata instead of being silently dropped (the original
    returned at most ``3 * (n // 3)`` examples, i.e. 99 for n=100).  A
    stratum smaller than its quota contributes everything it has.

    Args:
        dataset: examples, each with an 'output' string.
        n: total number of examples to sample (best effort).

    Returns:
        Up to ``n`` examples, stratified by output length.
    """
    # Single pass: split each output once instead of three times over
    # three full scans of the dataset.
    strata: list[list[dict]] = [[], [], []]  # short, medium, long
    for ex in dataset:
        words = len(ex['output'].split())
        if words < 50:
            strata[0].append(ex)
        elif words < 200:
            strata[1].append(ex)
        else:
            strata[2].append(ex)
    base, remainder = divmod(n, 3)
    sample: list[dict] = []
    for idx, stratum in enumerate(strata):
        quota = base + (1 if idx < remainder else 0)
        sample.extend(random.sample(stratum, min(quota, len(stratum))))
    return sample
Final Report Before Training
def generate_pre_training_report(dataset: list[dict]) -> str:
    """Combine Level-1 (technical) and Level-2 (semantic) validation into a
    single markdown GO/NO-GO report, rendered before launching training.

    NOTE(review): assumes DatasetValidator and SemanticValidator are
    defined in this module, and that the semantic check has a working LLM
    client — it issues real judge calls and is the expensive part.
    """
    validator = DatasetValidator()
    semantic_val = SemanticValidator()
    # Level 1: cheap automated checks over the full dataset.
    tech_report = validator.validate(dataset)
    # Level 2: LLM-judged alignment on a random sample.
    alignment_score = semantic_val.check_instruction_output_alignment(dataset)
    # chr(10) is '\n' — f-string expressions could not contain backslashes
    # before Python 3.12, hence the workaround.
    report = f"""
## Dataset Validation Report
**Total examples:** {tech_report.total_examples:,}
**Technical pass rate:** {tech_report.pass_rate:.1%}
**Instruction-Output alignment:** {alignment_score:.2f}/1.0
### Issues Found:
- Empty outputs: {len(tech_report.issues['empty_outputs'])}
- Too short (<5 tokens): {len(tech_report.issues['too_short'])}
- Too long (>2000 tokens): {len(tech_report.issues['too_long'])}
- Potentially truncated: {len(tech_report.issues['truncated'])}
- Near-duplicates: {len(tech_report.issues['near_duplicates'])}
### Recommendations:
{chr(10).join('- ' + r for r in tech_report.recommendations)}
**GO / NO-GO:** {'GO' if tech_report.pass_rate > 0.9 and alignment_score > 0.7 else 'NO-GO — fix issues before training'}
"""
    return report
Go/no-go thresholds: technical pass rate above 90% and alignment score above 0.70. An alignment score below 0.70 means the dataset contains examples whose output does not actually answer the instruction — training on such examples actively degrades the model.







