Unlike the example in R, this example uses a Python library that implements a few different types of Generative Adversarial Networks (GANs), specifically the Conditional Tabular GAN (CTGAN) and the Conditional Probabilistic Auto-Regressive (CPAR) GAN. This example focuses on CTGAN, which can be used to synthesize an entire data set or, in a manner similar to SMOTE, to synthesize records for specific groups.
While the CART-based approach synthesizes a single variable at a time, some GAN-based methods synthesize the entire record (a vector) at once, and CTGAN is one of those methods. However, while it might seem like this would speed things up substantially, the computational cost of the underlying architecture does not always work out that way, especially if you are training on a CPU only.
# Loads library to allow setting the pseudorandom number generator seed
import random
# Sets the pseudorandom number generator seed
random.seed(7779311)
# Load library to read the data
import pandas as pd
# Columns from the Faketucky file that we will keep and their new names
cols = { 'sid': 'stdid', 'first_dist_code': 'distid', 'first_hs_code': 'schcd',
'first_hs_alt': 'altsch', 'first_hs_urbanicity': 'urbanicity',
'chrt_ninth': 'cohort', 'male': 'male', 'race_ethnicity': 'race',
'frpl_ever_in_hs': 'frleverhs', 'sped_ever_in_hs': 'swdeverhs',
'lep_ever_in_hs': 'eleverhs', 'gifted_ever_in_hs': 'tageverhs',
'ever_alt_sch_in_hs': 'alteverhs', 'scale_score_6_math': 'mthss6',
'scale_score_6_read': 'rlass6', 'scale_score_8_math': 'mthss8',
'scale_score_8_read': 'rlass8', 'pct_absent_in_hs': 'pctabshs',
'pct_excused_in_hs': 'pctexcusedhs', 'avg_gpa_hs': 'hsgpa',
'scale_score_11_eng': 'acteng11', 'scale_score_11_math': 'actmth11',
'scale_score_11_read': 'actrla11', 'scale_score_11_comp': 'actcmp11',
'collegeready_ever_in_hs': 'evercollrdyhs', 'careerready_ever_in_hs': 'evercarrdyhs',
'ap_ever_take_class': 'aptakenever', 'last_acadyr_observed': 'lastobsyr',
'transferout': 'transfer', 'dropout': 'dropout', 'still_enrolled': 'stillenrolled',
'ontime_grad': 'gradontime', 'chrt_grad': 'gradcohort', 'hs_diploma': 'diploma',
'enroll_yr1_any': 'yr1psenrany', 'enroll_yr1_2yr': 'yr1psenr2yr',
'enroll_yr1_4yr': 'yr1psenr4yr', 'enroll_yr2_any': 'yr2psenrany' }
# Load the data and rename the columns to shorter names
df1 = pd.read_stata('https://github.com/OpenSDP/faketucky/raw/master/faketucky.dta', columns = cols.keys()).rename(columns = cols)
# Create the combined school/district ID
df1['schid'] = df1['distid'].astype(str) + df1['schcd'].astype(str)
# Get a sample of school IDs
schids = df1['schid'].drop_duplicates().sample(n = 60, random_state = 7779311)
# Inner Join to select the subset of cases with the sampled school IDs
df = df1.merge(schids, how = 'inner', on = 'schid')
# Remove the school and district codes that created schid
df.drop(columns = ['distid', 'schcd'], inplace = True)
# Make sure a couple columns/variables are correctly typed for the synthesis software
df['urbanicity'] = df['urbanicity'].astype(str)
df['schid'] = df['schid'].astype(str)
# Show some of the data
df.head(20)
df.dtypes
stdid              int32
altsch              int8
urbanicity        object
cohort             int16
male             float64
race              object
frleverhs        float64
swdeverhs           int8
eleverhs            int8
tageverhs           int8
alteverhs           int8
mthss6           float64
rlass6           float64
mthss8           float64
rlass8           float64
pctabshs         float64
pctexcusedhs     float64
hsgpa            float64
acteng11         float64
actmth11         float64
actrla11         float64
actcmp11         float64
evercollrdyhs       int8
evercarrdyhs        int8
aptakenever         int8
lastobsyr          int16
transfer            int8
dropout             int8
stillenrolled       int8
gradontime          int8
gradcohort       float64
diploma             int8
yr1psenrany      float64
yr1psenr2yr      float64
yr1psenr4yr      float64
yr2psenrany      float64
schid             object
dtype: object
# Load the torch library
import torch
# Check to see if a GPU is available so you can use it later
gpu = torch.cuda.is_available()
# Check how many GPUs are available
ngpus = torch.cuda.device_count()
# For GPU setups
if ngpus >= 1:
    # Get the device properties for each GPU available
    for i in range(ngpus):
        print(torch.cuda.get_device_properties(i))
# Print the result so you can make sure the GPU is ID'd in case something goes wrong with CUDA
print('GPU Available = {0}\n# of GPUS = {1}'.format(gpu, ngpus))
_CudaDeviceProperties(name='Quadro RTX 5000', major=7, minor=5, total_memory=16117MB, multi_processor_count=48)
GPU Available = True
# of GPUS = 1
# If you have multiple GPUs here is where you can set which GPU to use based on the index
# starting from 0
torch.cuda.set_device(0)
# Import CTGAN from the Synthetic Data Vault library
from sdv.tabular import CTGAN
# Use this to set the number of epochs (passes over your data) for training
eps = 1
# You can import this module if you want to see the timing
import time
# This is where you can define/tune the specific architecture for your CTGAN.
# See: https://sdv.dev/SDV/api_reference/tabular/api/sdv.tabular.ctgan.CTGAN.html#sdv.tabular.ctgan.CTGAN
# for a description of all the parameters that are available.
# cuda = whether or not to use a GPU
# discriminator_steps = how many times the discriminator is updated before the generator is updated (typically 5 or 10)
# pac = # of samples grouped together when training the discriminator
# batch_size = how many observations from your dataset are processed at a time (must be divisible by pac)
# verbose = whether to print updates indicating progress during training
# embedding_dim = # of nodes in the embedding used for conditioning
# generator_dim = the sizes of the generator layers (#, #, ...); each # is the number of nodes in that layer
# discriminator_dim = the sizes of the discriminator layers (#, #, ...); each # is the number of nodes in that layer
# epochs = the # of passes over your dataset used to train the model
# You define the architecture without any data. Once defined, you pass data to it using the fit() method.
ctmod = CTGAN(cuda = gpu,
discriminator_steps = 5,
pac = 50,
batch_size = 4000,
verbose = True,
embedding_dim = 256,
generator_dim = (512, 512, 256, 256, 128, 128),
discriminator_dim = (1024, 512, 256, 128),
epochs = eps)
# Leave this uncommented if you want to time the training
start = time.time()
# This trains the model
ctmod.fit(df)
# Leave this uncommented as well if you want to time the training
end = time.time()
# Leave this uncommented if the timing stuff is uncommented to see how long it took
print('{} epochs took {} minutes to complete'.format(eps, (end - start)/60))
/home/billy/anaconda3/lib/python3.9/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'pctabshs'. Data will not be rounded.
  warnings.warn(
/home/billy/anaconda3/lib/python3.9/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'pctexcusedhs'. Data will not be rounded.
  warnings.warn(
/home/billy/anaconda3/lib/python3.9/site-packages/rdt/transformers/numerical.py:100: UserWarning: No rounding scheme detected for column 'hsgpa'. Data will not be rounded.
  warnings.warn(
Epoch 1, Loss G: 9.1588,Loss D: -15.8004
1 epochs took 0.5858280380566915 minutes to complete
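A single epoch finishes quickly, but a model you would actually use will take far longer to train, so it can be worth saving the fitted model before moving on to sampling. The sketch below assumes the save() and load() methods provided by this version of SDV's tabular models; the file name ctganmodel.pkl is just a placeholder.
# Save the fitted model so it does not have to be retrained in a later session
# (assumes the save()/load() methods on sdv.tabular models; the file name is arbitrary)
ctmod.save('ctganmodel.pkl')
# Later, reload the trained model and sample from it without refitting
# ctmod = CTGAN.load('ctganmodel.pkl')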
Once your model has been trained to your liking, which will definitely take more than a single epoch, you can begin synthesizing data from it. The code block below shows not only how to synthesize a whole sample, but also how you can use CTGAN to synthesize data for specific groups, like SMOTE but probably with greater fidelity.
Like any data synthesis, you need to be careful about, and aware of, how well your protected data represent the population of interest. Data synthesis isn't magic, nor can it infer what our intentions are. The process builds a mathematical model that best describes the distribution of the data we provide so it can sample from it. So, if you believe the data you have for some of your smaller groups of interest is a good approximation of what the data for that group would look like in repeated samples, you should be fine synthesizing from that group. If, however, you believe that the data you have for the group you are interested in is not a good representation of that group, I would recommend against synthesizing data for the group. That said, keep in mind that the more data you use to train the model, the better the results will be. Even if your goal is only to synthesize data for a single group, include data from all of the other groups you have available, since there is additional information in the relationships between the variables that can improve the quality of the synthetic records for the group you are interested in.
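One quick way to see how well each group is represented before you condition on it is to cross-tabulate the training data. This check uses only pandas and the columns conditioned on below; nothing here is specific to SDV.
# Count the training records in each urbanicity x race cell so you can see
# which of the groups you plan to condition on are thinly represented
print(df.groupby(['urbanicity', 'race']).size().sort_values())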
# To generate synthetic data use the sample method
synthtest = ctmod.sample(330)
# Then you can save it to a file
synthtest.to_csv('synthtestCTGAN.csv')
# And delete the object to reduce memory consumption
del synthtest
# To sample a specific group, define the conditions
from sdv.sampling import Condition
# Define the groups you need synthetic records for
remoteaa = Condition(num_rows = 100, column_values = {'urbanicity': 'Rural: Remote', 'race': 'African-American'})
remotelx = Condition(num_rows = 100, column_values = {'urbanicity': 'Rural: Remote', 'race': 'Hispanic'})
fringeaa = Condition(num_rows = 100, column_values = {'urbanicity': 'Rural: Fringe', 'race': 'African-American'})
fringelx = Condition(num_rows = 100, column_values = {'urbanicity': 'Rural: Fringe', 'race': 'Hispanic'})
# Then use the sample_conditions() method to generate samples with the characteristics you need
condtest = ctmod.sample_conditions(conditions = [remoteaa, remotelx, fringeaa, fringelx])
# And you can save those data to a file as well:
condtest.to_csv('condtestCTGAN.csv')
Sampling conditions: 100%|█████████████████████████████████████████████| 400/400 [00:10<00:00, 39.43it/s]
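After conditional sampling, it is worth confirming that the synthetic records carry the requested group labels and that a few key variables look plausible next to the real data. This is a minimal sanity check using only pandas and the columns already used above.
# Confirm each requested urbanicity x race combination received the expected number of rows
print(condtest.groupby(['urbanicity', 'race']).size())
# Spot-check a numeric column in the conditional sample against the real data
print(condtest['hsgpa'].describe())
print(df.loc[df['race'].isin(['African-American', 'Hispanic']), 'hsgpa'].describe())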