SDK Examples#
Practical examples demonstrating common NeMo Safe Synthesizer jobs using the Python SDK.
Basic Examples#
Note
Before you start, make sure that you have stored the example CSVs locally and uploaded them to the datastore using the following steps:
export HF_ENDPOINT="http://localhost:3000/v1/hf"
# Adjust this line for your datasets as appropriate
datasets="customer-data.csv eu-customer-data.csv patient-data.csv financial-transactions.csv sensitive-dataset.csv large-dataset.csv"
for dataset in $datasets; do
huggingface-cli upload --repo-type dataset default/safe-synthesizer $dataset
done
# Upload the quarter datasets
for quarter in {1..4}; do
huggingface-cli upload --repo-type dataset default/safe-synthesizer customer-q${quarter}.csv
done
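To confirm that the uploads landed, you can list the repo contents from Python. A minimal sketch using huggingface_hub, assuming the local datastore endpoint supports repo listing (the endpoint value mirrors HF_ENDPOINT above):
from huggingface_hub import HfApi

# Point the client at the same local datastore endpoint used above
api = HfApi(endpoint="http://localhost:3000/v1/hf")

# List the files in the dataset repo to verify the uploads
for path in api.list_repo_files("default/safe-synthesizer", repo_type="dataset"):
    print(path)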
Example 1: Basic Synthetic Data Generation#
Generate private synthetic data:
# Complete pipeline using the Python SDK
from nemo_microservices import NeMoMicroservices

# Initialize the client
client = NeMoMicroservices(base_url="http://localhost:8080")

job_request = {
"name": "basic-synthetics",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/customer-data.csv",
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": {
"globals": {"locales": ["en_US"]},
"steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
},
"generation": {
"num_records": 5000,
"temperature": 0.8
},
"privacy": {
"privacy_hyperparams": {"dp": True, "epsilon": 6.0}
},
"evaluation": {"mia_enabled": True, "aia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**job_request)
print(f"Job created: {job.id}")

# Check job status
current_job = client.beta.safe_synthesizer.jobs.retrieve(job.id)
print(f"Job status: {current_job.status}")
Example 2: PII Detection, Redaction, or Replacement Only#
Remove PII from a customer dataset for compliance:
from nemo_microservices import NeMoMicroservices

# The job reads customer-data.csv from the datastore (see the Note above),
# so there is no need to load it locally with pandas first.
# Initialize client
client = NeMoMicroservices(base_url="http://localhost:8080")
# Create PII redaction job using REST API
job_request = {
"name": "customer-pii-redaction",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/customer-data.csv",
"config": {
"enable_synthesis": False,
"enable_replace_pii": True,
"replace_pii": {
"globals": {"locales": ["en_US"]},
"steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**job_request)
print(f"Job created: {job.id}")

# Once the job completes, fetch the redacted dataset from the results API
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
Industry-Specific Examples#
Example 3: GDPR Compliance#
Configure NeMo Safe Synthesizer for European data protection requirements:
# GDPR-compliant configuration
gdpr_pii_config = {
"globals": {
"locales": ["en_GB", "de_DE", "fr_FR"], # European locales
"ner": {
"ner_threshold": 0.9, # High confidence for PII detection
"entities": ["name", "email", "phone_number", "address", "iban"]
}
},
"steps": [
{
"rows": {
"update": [
{"entity": ["name"], "value": "fake.name()"},
{"entity": ["email"], "value": "fake.email()"},
{"entity": ["phone_number"], "value": "fake.phone_number()"},
{"entity": ["address"], "value": "fake.address()"}
]
}
}
]
}
# Strong privacy protection
privacy_config = {
"dp": True,
"epsilon": 2.0, # Strong privacy guarantee
"delta": 1e-6
}
gdpr_job_request = {
"name": "gdpr-compliant-synthetics",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/eu-customer-data.csv",
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": gdpr_pii_config,
"privacy": {"privacy_hyperparams": privacy_config},
"evaluation": {"mia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**gdpr_job_request)
Example 4: Healthcare Data (HIPAA)#
Process protected health information with enhanced privacy:
# Healthcare-specific PII configuration
hipaa_config = {
"globals": {
"locales": ["en_US"],
"ner": {
"entities": [
"name", "email", "phone_number", "address",
"medical_record_number", "ssn", "date_of_birth"
],
"ner_threshold": 0.95 # Very high confidence for PHI
}
},
"steps": [
{
    "rows": {
        "update": [
            {"name": "patient_name", "value": "fake.name()"},
            {"name": "mrn", "value": "fake.random_number(digits=8)"},
            {"name": "dob", "value": "fake.date_of_birth(minimum_age=18, maximum_age=90)"}
        ]
    }
},
{
    # A duplicate "rows" key in one dict would silently overwrite the
    # updates above, so the drop rule goes in its own step
    "rows": {
        "drop": [
            {"condition": "notes CONTAINS 'confidential'"}
        ]
    }
}
]
}
# Very strong privacy for healthcare
healthcare_privacy = {
"dp": True,
"epsilon": 1.0, # Very strong privacy
"delta": 1e-7,
"per_sample_max_grad_norm": 0.5
}
hipaa_job_request = {
"name": "hipaa-compliant-synthetics",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/patient-data.csv",
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": hipaa_config,
"privacy": {"privacy_hyperparams": healthcare_privacy},
"generation": {"temperature": 0.6},
"evaluation": {"mia_enabled": True, "aia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**hipaa_job_request)
Example 5: Financial Data#
Process financial datasets with domain-specific constraints:
# Note: use the training configuration to express grouping/ordering for learning
financial_training = {
"group_training_examples_by": "account_type",
"order_training_examples_by": "transaction_date"
}
# Financial PII redaction
financial_pii = {
"steps": [
{
"rows": {
"update": [
{"name": "account_number", "value": "fake.random_number(digits=12)"},
{"name": "routing_number", "value": "fake.random_number(digits=9)"},
{"entity": ["credit_debit_card"], "value": "fake.credit_card_number()"},
{"entity": ["ssn"], "value": "fake.ssn()"}
]
}
}
]
}
financial_job_request = {
"name": "financial-synthetics",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/financial-transactions.csv",
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": financial_pii,
"training": financial_training,
"privacy": {"privacy_hyperparams": {"dp": True, "epsilon": 4.0}},
"generation": {
"num_records": 50000,
"use_structured_generation": True
},
"evaluation": {"mia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**financial_job_request)
Advanced Jobs#
Example 6: Batch Processing#
Process multiple datasets with consistent configuration:
import asyncio
from nemo_microservices import AsyncNeMoMicroservices
async def process_multiple_datasets():
async_client = AsyncNeMoMicroservices(base_url="http://localhost:8080")
datasets = ["customer-q1", "customer-q2", "customer-q3", "customer-q4"]
# Shared configs
pii_config = {"steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]}
# Create job requests for all datasets
job_requests = []
for dataset_id in datasets:
job_request = {
"name": f"batch-{dataset_id}",
"project": "default",
"spec": {
"data_source": f"hf://datasets/default/safe-synthesizer/{dataset_id}.csv",
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": pii_config,
"privacy": {"privacy_hyperparams": {"dp": True, "epsilon": 5.0}},
"generation": {"num_records": 5000, "temperature": 0.8},
"evaluation": {"mia_enabled": True, "aia_enabled": True}
}
}
}
job_requests.append(job_request)
# Start all jobs concurrently using REST API
jobs = await asyncio.gather(*[
async_client.beta.safe_synthesizer.jobs.create(**job_request)
for job_request in job_requests
])
# Monitor jobs using REST API and collect results
all_results = []
for job in jobs:
while True:
current_job = await async_client.beta.safe_synthesizer.jobs.retrieve(job.id)
status = current_job.status
if status in ("completed", "error", "cancelled"):
break
await asyncio.sleep(30)
try:
all_results.append(await async_client.beta.safe_synthesizer.jobs.results.list(job.id))
except Exception:
all_results.append(None)
return jobs, all_results
# Execute batch processing
jobs, results = asyncio.run(process_multiple_datasets())
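Creating every job at once is fine for a handful of datasets, but for larger batches you may want to bound concurrency so the service is not flooded. A sketch using asyncio.Semaphore (the limit of 2 is an arbitrary choice, and create_with_limit is a hypothetical helper, not part of the SDK):
import asyncio

async def create_with_limit(async_client, job_requests, max_concurrent=2):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def create_one(job_request):
        # Only max_concurrent job creations run at any given time
        async with semaphore:
            return await async_client.beta.safe_synthesizer.jobs.create(**job_request)

    return await asyncio.gather(*[create_one(r) for r in job_requests])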
Error Handling Patterns#
Example 7: Robust Production Job#
import logging
import time
from nemo_microservices import APIError, APIStatusError
def safe_synthesizer_with_retry(dataset_id, max_retries=3):
"""Robust NeMo Safe Synthesizer execution with retry logic."""
for attempt in range(max_retries):
try:
job_request = {
"name": f"production-synthetics-attempt-{attempt + 1}",
"project": "default",
"spec": {
"data_source": dataset_id,
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": {"steps": [{"rows": {"update": [{"entity": ["email"], "value": "column.entity | fake"}]}}]},
"privacy": {"privacy_hyperparams": {"dp": True, "epsilon": 5.0}},
"generation": {"num_records": 5000},
"evaluation": {"mia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**job_request)
# Monitor job status
while True:
current_job = client.beta.safe_synthesizer.jobs.retrieve(job.id)
status = current_job.status
if status == "completed":
break
                elif status in ("error", "cancelled"):
                    raise RuntimeError(f"Job {job.id} ended with status '{status}' - check logs for details")
                time.sleep(30)
            return job
        except APIStatusError as e:
            # Catch the more specific subclass before APIError, which would otherwise swallow it
            logging.error(f"API error on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                raise
        except (APIError, RuntimeError) as e:
            logging.error(f"Attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                raise
raise RuntimeError(f"All {max_retries} attempts failed")
# Use the robust function
try:
job = safe_synthesizer_with_retry("hf://datasets/default/safe-synthesizer/sensitive-dataset.csv")
print(f"Successfully created job: {job.id}")
# Access results via: client.beta.safe_synthesizer.jobs.results.list(job.id)
except Exception as e:
print(f"Failed to create NeMo Safe Synthesizer job: {e}")
Performance Optimization#
Example 8: Large Dataset Processing#
Optimize for datasets approaching the 500MB limit:
# Configuration for large datasets
large_dataset_config = {
"training": {
"max_sequences_per_example": 256, # Reduce context window
"num_input_records_to_sample": 100000, # Limit training data
"batch_size": 2, # Smaller batches for memory efficiency
"gradient_accumulation_steps": 8 # Maintain effective batch size
},
"generation": {
"num_records": 25000 # Reasonable output size
}
}
large_dataset_job_request = {
"name": "large-dataset-synthetics",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/large-dataset.csv",
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": {"steps": [{"rows": {"update": [{"entity": ["email"], "value": "column.entity | fake"}]}}]},
"training": large_dataset_config["training"],
"generation": large_dataset_config["generation"],
"evaluation": {"mia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**large_dataset_job_request)
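Before uploading, a quick pre-flight check against the 500MB limit can save a failed run. A minimal sketch:
import os

MAX_BYTES = 500 * 1024 * 1024  # 500MB dataset limit

size = os.path.getsize("large-dataset.csv")
print(f"Dataset size: {size / 1024 / 1024:.1f} MB")
if size > MAX_BYTES:
    raise ValueError("Dataset exceeds the 500MB limit; sample or split it before uploading")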