Development of AI System for Contact and Company Deduplication in CRM
Contact duplication in a CRM is a chronic problem: one customer exists as "Ivanov I.I.", "Ivan Ivanov", and "i.ivanov@example.com" in three separate records. An AI system automatically finds such duplicates and either suggests a merge or performs it automatically.
Approaches to Duplicate Detection
Rule-based: explicit matching rules (same email = definitely duplicate). High precision, low recall — misses fuzzy matches.
ML-based (entity resolution): model predicts the probability that two records are the same object. Accounts for typos, abbreviations, transliteration.
Embedding-based: convert each contact into a vector and search for nearest neighbors. Scales well to large databases.
ML Deduplication Model
import pandas as pd
import dedupe
from dedupe import Dedupe
class ContactDeduplicator:
    """Find duplicate CRM contacts with the ``dedupe`` entity-resolution library.

    Typical use: call :meth:`train` once (it runs an interactive labeling
    session), then call :meth:`find_duplicates` on the records to check.
    """

    def __init__(self):
        # Set by train(); None means no model has been trained yet.
        self.deduper = None

    def setup_fields(self):
        """Build an untrained ``dedupe.Dedupe`` matcher over the contact fields.

        ``email``, ``phone`` and ``job_title`` are declared with
        ``has_missing=True``; the other fields are declared without it.
        """
        fields = [
            dedupe.variables.String('first_name'),
            dedupe.variables.String('last_name'),
            dedupe.variables.String('email', has_missing=True),
            dedupe.variables.String('phone', has_missing=True),
            dedupe.variables.String('company'),
            dedupe.variables.String('job_title', has_missing=True),
        ]
        return dedupe.Dedupe(fields)

    def train(self, records: dict, training_file: str = None):
        """Train the matcher on labeled pairs (match / not-match).

        Args:
            records: Records passed straight to ``prepare_training``.
            training_file: Optional path to a file of previously labeled
                pairs. If it exists, its labels seed the training. After the
                labeling session the labels are written back to this path.
                When omitted, nothing is read from or written to disk.
        """
        # Local import keeps this snippet self-contained: the surrounding
        # file never imports ``os`` at module level.
        import os

        self.deduper = self.setup_fields()

        if training_file and os.path.exists(training_file):
            with open(training_file) as f:
                self.deduper.prepare_training(records, f)
        else:
            self.deduper.prepare_training(records)

        # Active learning: label sample pairs on the console.
        dedupe.console_label(self.deduper)

        # Persist labels only when a path was given; open(None, 'w') would
        # raise TypeError and discard the labeling session.
        if training_file:
            with open(training_file, 'w') as f:
                self.deduper.write_training(f)

        self.deduper.train()

    def find_duplicates(self, records: dict,
                        threshold: float = 0.5) -> list[dict]:
        """Return groups of duplicate records, most confident group first.

        Args:
            records: Records passed straight to ``partition``.
            threshold: Clustering threshold passed to ``partition``.

        Returns:
            One dict per cluster that contains more than one record, with
            keys ``'records'`` (list of record ids), ``'scores'`` (list of
            per-record scores) and ``'max_score'``. Sorted by ``'max_score'``
            in descending order.

        Raises:
            RuntimeError: If :meth:`train` has not been called yet.
        """
        if self.deduper is None:
            raise RuntimeError(
                "ContactDeduplicator is not trained; call train() first"
            )

        duplicate_groups = []
        # partition() yields (record_ids, scores) pairs; there is no
        # separate cluster id element to unpack.
        for record_ids, scores in self.deduper.partition(records, threshold):
            if len(record_ids) > 1:
                # Plain floats keep the result easy to serialize.
                confidences = [float(score) for score in scores]
                duplicate_groups.append({
                    'records': list(record_ids),
                    'scores': confidences,
                    'max_score': max(confidences),
                })

        return sorted(
            duplicate_groups, key=lambda group: group['max_score'], reverse=True
        )
Fuzzy String Comparison
from rapidfuzz import fuzz, process
def compute_similarity(record1: dict, record2: dict) -> float:
    """Score how likely two contact records describe the same person.

    Compares email, name, phone and company. An exact (case-insensitive)
    email match short-circuits to ``1.0``. Otherwise each available signal
    contributes one component and the result is the plain average of the
    components. Fields that are missing, ``None`` or empty are skipped.

    Args:
        record1: First contact; may contain ``email``, ``first_name``,
            ``last_name``, ``phone`` and ``company``.
        record2: Second contact, same keys.

    Returns:
        A similarity score; ``0.0`` when no field can be compared.
    """
    # Local import keeps this snippet self-contained: the surrounding file
    # never imports ``re`` at module level.
    import re

    scores = []

    # Email: exact match, or same domain
    email1 = (record1.get('email') or '').lower()
    email2 = (record2.get('email') or '').lower()
    if email1 and email2:
        if email1 == email2:
            return 1.0  # Exact email match - definitely duplicate
        # rpartition never raises on an address without '@'; such an
        # address simply has no domain to compare.
        domain1 = email1.rpartition('@')[2] if '@' in email1 else ''
        domain2 = email2.rpartition('@')[2] if '@' in email2 else ''
        if domain1 and domain1 == domain2:
            scores.append(0.5)  # Same domain - similar

    # Name: fuzzy matching. ``or ''`` stops a None value from being
    # formatted as the literal text "None".
    name1 = f"{record1.get('first_name') or ''} {record1.get('last_name') or ''}".strip()
    name2 = f"{record2.get('first_name') or ''} {record2.get('last_name') or ''}".strip()
    if name1 and name2:
        name_score = fuzz.token_sort_ratio(name1, name2) / 100
        scores.append(name_score * 0.4)

    # Phone: strip everything but digits, then compare
    phone1 = re.sub(r'\D', '', str(record1.get('phone') or ''))
    phone2 = re.sub(r'\D', '', str(record2.get('phone') or ''))
    if phone1 and phone2:
        # Last 10 digits, so a differing country/trunk prefix is ignored
        if phone1[-10:] == phone2[-10:]:
            scores.append(0.9)

    # Company
    if record1.get('company') and record2.get('company'):
        company_score = fuzz.token_set_ratio(
            record1['company'], record2['company']
        ) / 100
        scores.append(company_score * 0.2)

    # NOTE(review): components are pre-scaled (0.4, 0.2) and then averaged
    # by count, so adding a matching company can lower the score. The
    # formula is kept as is; confirm whether a weighted average was meant.
    return sum(scores) / len(scores) if scores else 0.0
Record Merge Strategy
def merge_duplicates(records: list[dict]) -> dict:
    """Merge a group of duplicate records into a single record.

    For each known contact field the most frequent non-empty value wins;
    ties go to the value seen first in ``records``, so the result is
    deterministic. ``created_at`` becomes the earliest value present,
    ``tags`` is the union of all tags in first-seen order, and
    ``merged_from`` lists the ids of the source records.

    Args:
        records: Duplicate records. Each must have an ``'id'`` key; other
            keys are optional and may be ``None``.

    Returns:
        The merged record. Fields with no non-empty value in any source
        record are omitted; ``tags`` and ``merged_from`` are always present.

    Raises:
        KeyError: If a record has no ``'id'`` key.
    """
    # Local import keeps this snippet self-contained.
    from collections import Counter

    merged = {}

    # 'job_title' is included so that a field the deduplicator matches on
    # is not silently dropped from the merged record.
    field_priority = [
        'email', 'phone', 'first_name', 'last_name', 'company', 'job_title',
    ]
    for field in field_priority:
        values = [r.get(field) for r in records if r.get(field)]
        if not values:
            continue
        # Most frequent value; Counter keeps first-seen order on ties.
        merged[field] = Counter(values).most_common(1)[0][0]

    # For created_at take the earliest date
    dates = [r.get('created_at') for r in records if r.get('created_at')]
    if dates:
        merged['created_at'] = min(dates)

    # Combine tags: de-duplicate while keeping first-seen order, and
    # tolerate records whose 'tags' is missing or None.
    unique_tags = {}
    for r in records:
        for tag in r.get('tags') or []:
            unique_tags.setdefault(tag, None)
    merged['tags'] = list(unique_tags)

    merged['merged_from'] = [r['id'] for r in records]
    return merged
Typical results from implementation: detecting 10-25% duplicates in a mature CRM database, reducing database size by 8-15%, and improving email marketing accuracy (a lower unsubscribe rate, because customers no longer receive duplicate mailings).







