This example shows how to use Conditional Probabilistic Auto-Regressive (CPAR) models to synthesize longitudinal data. So that everyone can work with the same data, we'll use publicly available school-level data from the Urban Institute's Education Data Portal.
The Civil Rights Data Collection (CRDC) data in the Urban Institute's Education Data Portal are limited by year and topic. I've found an endpoint that will help create a relatively short panel (e.g., 3 waves) spanning four years. While it is possible to use other methods to generate short panels, those become challenging when there are missing data that need to be addressed or when the panel is unbalanced (e.g., 3 periods of observation for some schools and fewer for others). A major advantage of CPAR is that it can create synthetic data for both balanced and unbalanced panels without any additional effort on your part.
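For concreteness, here is a small made-up illustration of the difference between a balanced and an unbalanced panel; the IDs and values below are purely hypothetical and have nothing to do with the CRDC data.
import pandas as pd
# Toy example only; the IDs, years, and values are made up for illustration
balanced = pd.DataFrame({'id': ['A', 'A', 'A', 'B', 'B', 'B'],
                         'year': [2013, 2015, 2017] * 2,
                         'y': [1, 2, 3, 4, 5, 6]})
# Dropping school B's 2017 wave leaves 3 observations for A and 2 for B: an unbalanced panel
unbalanced = balanced.drop(index = 5)
With that distinction in mind, the next block pulls the actual CRDC and CCD data.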
from urllib.request import urlopen
from json import loads
import pandas as pd
# API Endpoint for the CRDC Discipline Instances Data
url = "https://educationdata.urban.org/api/v1/schools/crdc/restraint-and-seclusion/"
# API Endpoint for the CCD Directory Data
demog = "https://educationdata.urban.org/api/v1/schools/ccd/directory/"
# Define the value labels for the categorical variables from the CCD directory we will keep
valueLabels = { 'school_type': {1: 'Regular school', 2: 'Special education school', 3: 'Vocational school', 4: 'Other/alternative school', 5: 'Reportable program', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'elem_cedp': {0: 'No', 1: 'Yes', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'middle_cedp': {0: 'No', 1: 'Yes', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'high_cedp': {0: 'No', 1: 'Yes', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'title_i_eligible': {0: 'No', 1: 'Yes', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'title_i_schoolwide': {0: 'No', 1: 'Yes', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'charter': {0: 'No', 1: 'Yes', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'magnet': {0: 'No', 1: 'Yes', -1: 'Missing/not reported', -2: 'Not applicable', -3: 'Suppressed data'},
'lunch_program': {0: 'No', 1: 'Yes participating without using any Provision or the CEP', 2: 'Yes under the Community Eligibility Provision (CEP)', 3: 'Yes under Provision 1', 4: 'Yes under Provision 2', 5: 'Yes under Provision 3', -1: 'Missing/not reported'
}}
# Mapping to recode missing values for the CRDC data, missing and N/A values will be treated as 0, while suppressed
# data will be coded as missing; this should allow the model to learn that suppressed data are different from the other
# two classes of data.
crdcMap = { 'instances_mech_restraint': { -1: 0, -2: 0, -3: None}, 'instances_phys_restraint': { -1: 0, -2: 0, -3: None}, 'instances_seclusion': { -1: 0, -2: 0, -3: None}}
# The list of variables from the CCD directory to retain
ccdvars = ['year', 'ncessch', 'leaid', 'fips', 'school_type', 'elem_cedp', 'middle_cedp', 'high_cedp', 'title_i_eligible', 'title_i_schoolwide', 'charter', 'magnet', 'teachers_fte', 'lunch_program', 'direct_certification', 'enrollment']
# Defines a helper function that deals with some of the data cleaning
def getyear(year):
    # Only retrieves numbers for all disabled students
    discipline = urlopen(url + str(year) + '/instances/?disability=99')
    # Decodes the byte stream and parses the JSON into a Python object
    dis = loads(discipline.read())
    # Retrieves the school directory data
    schinfo = urlopen(demog + str(year) + '/')
    # Decodes the byte stream and parses the JSON into a Python object
    demo = loads(schinfo.read())
    # Converts the Python object into a deduplicated Pandas data frame and recodes the variables named in crdcMap
    disp = pd.DataFrame(dis['results']).drop_duplicates(subset = ['year', 'ncessch', 'leaid', 'fips'], keep = 'first', ignore_index = True).replace(crdcMap)
    # Converts the Python object into a deduplicated Pandas data frame and recodes the variables named in valueLabels
    demographics = pd.DataFrame(demo['results'])[ccdvars].drop_duplicates(subset = ['year', 'ncessch', 'leaid', 'fips'], keep = 'first', ignore_index = True).replace(valueLabels)
    # Keeps only school records that have some enrollment
    demographics = demographics.loc[demographics['enrollment'] > 0]
    # Joins the data using the NCES school identifier, year, LEA ID, and FIPS code
    return disp.merge(demographics, on = ['ncessch', 'year', 'leaid', 'fips'], how = 'left')
# Combine all years of data into a single data frame
df = pd.concat([getyear(i) for i in [2013, 2015, 2017]], ignore_index = True).drop(columns = 'crdc_id')
# Show some of the data
df.head(10)
|   | ncessch | year | fips | leaid | disability | instances_mech_restraint | instances_phys_restraint | instances_seclusion | school_type | elem_cedp | middle_cedp | high_cedp | title_i_eligible | title_i_schoolwide | charter | magnet | teachers_fte | lunch_program | direct_certification | enrollment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 010000201705 | 2013 | 1 | 0100002 | 99 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 010000201706 | 2013 | 1 | 0100002 | 99 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 |  | 2013 | 1 |  | 99 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 010000500870 | 2013 | 1 | 0100005 | 99 | 0.0 | 0.0 | 0.0 | Regular school | No | Yes | No | Yes | Yes | Not applicable | No | 33.0 | NaN | NaN | 632.0 |
4 | 010000500871 | 2013 | 1 | 0100005 | 99 | 0.0 | 0.0 | 0.0 | Regular school | No | No | Yes | No | Not applicable | Not applicable | No | 64.0 | NaN | NaN | 1117.0 |
5 | 010000500879 | 2013 | 1 | 0100005 | 99 | 0.0 | 0.0 | 0.0 | Regular school | Yes | No | No | Yes | Yes | Not applicable | No | 31.0 | NaN | NaN | 679.0 |
6 | 010000500889 | 2013 | 1 | 0100005 | 99 | 0.0 | 0.0 | 0.0 | Regular school | Yes | No | No | Yes | Yes | Not applicable | No | 43.0 | NaN | NaN | 780.0 |
7 | 010000501616 | 2013 | 1 | 0100005 | 99 | 0.0 | 0.0 | 0.0 | Regular school | Yes | No | No | Yes | Yes | Not applicable | No | 27.0 | NaN | NaN | 476.0 |
8 | 010000502150 | 2013 | 1 | 0100005 | 99 | 0.0 | 0.0 | 0.0 | Regular school | Yes | No | No | Yes | Yes | Not applicable | No | 56.0 | NaN | NaN | 1029.0 |
9 | 010000600193 | 2013 | 1 | 0100006 | 99 | 0.0 | 0.0 | 0.0 | Regular school | No | Yes | No | Yes | Yes | Not applicable | No | 23.0 | NaN | NaN | 439.0 |
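Because CPAR handles unbalanced panels, it is worth seeing how unbalanced this one actually is. The quick check below uses only pandas and the columns pulled above.
# Count how many of the three waves each school (ncessch) contributes
waves = df.groupby('ncessch')['year'].nunique()
# Tabulate schools by the number of waves observed; anything under 3 makes the panel unbalanced
print(waves.value_counts().sort_index())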
# Load the torch library
import torch
# Check whether a GPU is available so you can use one later
gpu = torch.cuda.is_available()
# Check how many GPUs are available
ngpus = torch.cuda.device_count()
# For GPU setups
if ngpus >= 1:
    # Get the device properties for each GPU available
    for i in range(ngpus):
        print(torch.cuda.get_device_properties(i))
# Print the result so you can make sure the GPU is ID'd in case something goes wrong with CUDA
print('GPU Available = {0}\n# of GPUS = {1}'.format(gpu, ngpus))
_CudaDeviceProperties(name='Quadro RTX 5000', major=7, minor=5, total_memory=16117MB, multi_processor_count=48)
GPU Available = True
# of GPUS = 1
# If you have multiple GPUs here is where you can set which GPU to use based on the index
# starting from 0
torch.cuda.set_device(0)
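If you prefer not to hard-code a device index, a common alternative is to build a torch.device object that falls back to the CPU when no GPU is present; this is just a sketch of that pattern.
# Build a device object once and reuse it; falls back to the CPU if CUDA is unavailable
device = torch.device('cuda:0' if gpu else 'cpu')
print(device)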
To avoid additional potential issues, the synthesis example below uses the version of the SDV library I have loaded on my machine. However, I also provide example syntax for the current API at the end of this notebook to illustrate some of the differences.
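If you are unsure which API applies to you, check the installed SDV version first; the snippet below uses only the standard library, so it works regardless of the SDV release.
# Report the installed SDV version so you know whether the older (pre-1.0) or current API applies
from importlib.metadata import version
print(version('sdv'))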
# This is how it worked previously:
# Entity columns identified the unit of observation
entity_columns = ['ncessch' ]
# Context columns identified time invariant variables
context_columns = [ 'leaid' ]
# The sequence index identified the same thing that it does in the metadata object now
sequence_index = 'year'
# Imports the module for CPAR
from sdv.timeseries import PAR
# Imports the time module so we can time things
import time
# Define the number of epochs to train the model
eps = 1
# Creates an instance of the CPAR model
cpar = PAR(cuda=True,
context_columns=context_columns,
entity_columns=entity_columns,
sequence_index=sequence_index,
verbose=True,
epochs = eps)
# Start the clock
stime = time.time()
# Train the model for the number of epochs defined in eps
cpar.fit(df)
# Stop the timer
etime = time.time()
# Show how long it took to train for that many epochs
print('Took {} minutes to complete {} epochs'.format((etime - stime)/60, eps))
Epoch 1 | Loss 0.0006023218156769872: 100%|████████████████████████████████| 1/1 [01:31<00:00, 91.35s/it]
Took 2.0508145729700726 minutes to complete 1 epochs
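Training can take a while, so it is worth persisting the fitted model. If I recall correctly, the older SDV releases exposed save()/load() on fitted models; treat the method names below as an assumption and check them against your installed version. The file name is arbitrary.
# Persist the fitted model so it can be reloaded later without retraining (older SDV API)
cpar.save('cparModel.pkl')
# In a later session:
# cpar = PAR.load('cparModel.pkl')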
# Generate synthetic samples from the model
synthdf = cpar.sample(250)
# Save the synthetic data
synthdf.to_csv('cparSynthetic.csv')
100%|██████████████████████████████████████████████████████████████████| 250/250 [00:07<00:00, 35.25it/s]
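As a quick sanity check on the synthetic panel, you can compare summary statistics for a few numeric columns between the real and synthetic data; this uses only pandas, and the column names are the ones pulled above.
# Compare summary statistics for a handful of numeric columns in the real and synthetic panels
cols = ['enrollment', 'teachers_fte', 'instances_phys_restraint', 'instances_seclusion']
comparison = pd.concat({'real': df[cols].describe().T, 'synthetic': synthdf[cols].describe().T}, axis = 1)
print(comparison)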
In the current release of the SDV, metadata needs to be provided when defining the model. The metadata identifies which variables are time invariant, which are time varying, how time is indexed, and how the units of observation are identified. However, there are a few time-saving options you can use to define the metadata that I'll show below. Note: there will likely be errors when running the example below.
# Loads the metadata module
from sdv.metadata import SingleTableMetadata
# Instantiates a SingleTableMetadata class object
metadata = SingleTableMetadata()
# Quick way to start:
metadata.detect_from_dataframe(data = df)
# To look at the metadata
metadata.to_dict()
# Validate the metadata
metadata.validate()
# Updating metadata
metadata.update_column(column_name = 'ncessch', sdtype = 'id', regex_format = '[0-9]{12}')
metadata.update_column(column_name = 'charter', sdtype = 'categorical')
# If you're not sure about the numeric type you can always default to 'Float', but if you know how much storage the data
# would need, you can specify that type to hopefully optimize memory consumption a bit.
metadata.update_column(column_name = 'enrollment', sdtype = 'numerical', computer_representation = 'Int32')
# IMPORTANT FOR LONGITUDINAL SYNTHESIS!!!!!! This identifies your observations.
metadata.set_sequence_key(column_name = 'ncessch')
# IMPORTANT FOR LONGITUDINAL SYNTHESIS!!!!!! This identifies your measure of time.
metadata.set_sequence_index(column_name = 'year')
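Since building the metadata takes a few steps, it is worth saving it to disk. save_to_json() and load_from_json() are the SingleTableMetadata helpers in SDV 1.x, but verify the names against the version you have installed; the file name is arbitrary.
# Persist the metadata so it can be reused without re-detecting it from the data frame
metadata.save_to_json(filepath = 'cparMetadata.json')
# To reuse it in another session:
# metadata = SingleTableMetadata.load_from_json(filepath = 'cparMetadata.json')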
# This is an example of how you would do this with the newest version of the library
# Import CPAR from the Synthetic Data Vault library
from sdv.sequential import PARSynthesizer
# Create the synthesizer; as in the older API, context_columns flags the time-invariant variables
cpar = PARSynthesizer(metadata, context_columns = ['leaid'])
# Train the model
cpar.fit(df)
# Synthesize data; the sequential synthesizer samples whole sequences (schools) rather than individual rows
synthdf = cpar.sample(num_sequences = 2000)
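As with the older API, you will probably want to write the synthetic panel out and persist the fitted synthesizer. save() and load() are standard on SDV 1.x synthesizers, but again confirm against your installed version; the file names are arbitrary.
# Save the synthetic data and the fitted synthesizer for later reuse
synthdf.to_csv('cparSyntheticNew.csv', index = False)
cpar.save('cparSynthesizer.pkl')
# To reload the fitted synthesizer later:
# cpar = PARSynthesizer.load('cparSynthesizer.pkl')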