This is the code for a webinar I gave with Dan Mbanga for Amazon’s AWS Webinar Series about deep learning using MXNet in R. The video is available on YouTube.

I am experimentally saving htmlwidget objects to HTML files then loading them with iframes so please excuse the scroll bars.

Packages

These are the packages we are using either by loading the entire package or using individual functions.

Data

This dataset is about property lots in Manhattan and includes descriptive information as well as value. The original data are available from NYC Planning and the prepared files seen here at the Lander Analytics data.world repo.

data_train <- readr::read_csv(file.path(dataDir, 'manhattan_Train.csv'))
data_validate <- readr::read_csv(file.path(dataDir, 'manhattan_Validate.csv'))
data_test <- readr::read_csv(file.path(dataDir, 'manhattan_Test.csv'))
# Remove some variables
data_train <- data_train %>% 
    select(-ID, -TotalValue, -Borough, -ZoneDist4, 
           -SchoolDistrict, -Council, -PolicePrct, -HealthArea)
data_validate <- data_validate %>% 
    select(-ID, -TotalValue, -Borough, -ZoneDist4, 
           -SchoolDistrict, -Council, -PolicePrct, -HealthArea)
data_test <- data_test %>% 
    select(-ID, -TotalValue, -Borough, -ZoneDist4, 
           -SchoolDistrict, -Council, -PolicePrct, -HealthArea)

This is a glimpse of the data.

Here is a visualization of the class balance.

dataList <- list(data_train, data_validate, data_test)
dataList %>% 
    purrr::map(function(x) figure(width=600/NROW(dataList), height=500, legend_location=NULL) %>% 
                   ly_bar(x=High, color=factor(High), data=x, legend='code')
    ) %>% 
    grid_plot(nrow=1, ncol=NROW(dataList), same_axes=TRUE)

We use vtreat to do some automated feature engineering.

# The column name for the response
responseName <- 'High'
# The target value for the response
responseTarget <- TRUE
# The remaining columns are predictors
varNames <- setdiff(names(data_train), responseName)

# build the treatment design
treatmentDesign <- designTreatmentsC(dframe=data_train, varlist=varNames, 
                                     outcomename=responseName, 
                                     outcometarget=responseTarget, 
                                     verbose=TRUE)
## [1] "desigining treatments Mon Jun 26 01:23:38 2017"
## [1] "designing treatments Mon Jun 26 01:23:38 2017"
## [1] " have level statistics Mon Jun 26 01:23:39 2017"
## [1] "design var FireService Mon Jun 26 01:23:39 2017"
## [1] "design var ZoneDist1 Mon Jun 26 01:23:39 2017"
## [1] "design var ZoneDist2 Mon Jun 26 01:23:40 2017"
## [1] "design var ZoneDist3 Mon Jun 26 01:23:40 2017"
## [1] "design var Class Mon Jun 26 01:23:41 2017"
## [1] "design var LandUse Mon Jun 26 01:23:42 2017"
## [1] "design var Easements Mon Jun 26 01:23:43 2017"
## [1] "design var OwnerType Mon Jun 26 01:23:43 2017"
## [1] "design var LotArea Mon Jun 26 01:23:43 2017"
## [1] "design var BldgArea Mon Jun 26 01:23:43 2017"
## [1] "design var ComArea Mon Jun 26 01:23:43 2017"
## [1] "design var ResArea Mon Jun 26 01:23:44 2017"
## [1] "design var OfficeArea Mon Jun 26 01:23:44 2017"
## [1] "design var RetailArea Mon Jun 26 01:23:44 2017"
## [1] "design var GarageArea Mon Jun 26 01:23:44 2017"
## [1] "design var StrgeArea Mon Jun 26 01:23:44 2017"
## [1] "design var FactryArea Mon Jun 26 01:23:44 2017"
## [1] "design var OtherArea Mon Jun 26 01:23:44 2017"
## [1] "design var NumBldgs Mon Jun 26 01:23:44 2017"
## [1] "design var NumFloors Mon Jun 26 01:23:44 2017"
## [1] "design var UnitsRes Mon Jun 26 01:23:44 2017"
## [1] "design var UnitsTotal Mon Jun 26 01:23:44 2017"
## [1] "design var LotFront Mon Jun 26 01:23:44 2017"
## [1] "design var LotDepth Mon Jun 26 01:23:44 2017"
## [1] "design var BldgFront Mon Jun 26 01:23:44 2017"
## [1] "design var BldgDepth Mon Jun 26 01:23:45 2017"
## [1] "design var Extension Mon Jun 26 01:23:45 2017"
## [1] "design var Proximity Mon Jun 26 01:23:45 2017"
## [1] "design var IrregularLot Mon Jun 26 01:23:45 2017"
## [1] "design var LotType Mon Jun 26 01:23:46 2017"
## [1] "design var BasementType Mon Jun 26 01:23:46 2017"
## [1] "design var Landmark Mon Jun 26 01:23:47 2017"
## [1] "design var BuiltFAR Mon Jun 26 01:23:47 2017"
## [1] "design var ResidFAR Mon Jun 26 01:23:47 2017"
## [1] "design var CommFAR Mon Jun 26 01:23:47 2017"
## [1] "design var FacilFAR Mon Jun 26 01:23:47 2017"
## [1] "design var Built Mon Jun 26 01:23:47 2017"
## [1] "design var HistoricDistrict Mon Jun 26 01:23:48 2017"
## [1] " scoring treatments Mon Jun 26 01:23:48 2017"
## [1] "have treatment plan Mon Jun 26 01:24:19 2017"
## [1] "rescoring complex variables Mon Jun 26 01:24:19 2017"
## [1] "done rescoring complex variables Mon Jun 26 01:24:32 2017"

Then we create train, validate and test matrices.

# build design data.frames
dataTrain <- prepare(treatmentplan=treatmentDesign, dframe=data_train)
dataValidate <- prepare(treatmentplan=treatmentDesign, dframe=data_validate)
dataTest <- prepare(treatmentplan=treatmentDesign, dframe=data_test)

# use all the level names as predictors
predictorNames <- setdiff(names(dataTrain), responseName)

# training matrices
trainX <- data.matrix(dataTrain[, predictorNames])
trainY <- dataTrain[, responseName]

# validation matrices
validateX <- data.matrix(dataValidate[, predictorNames])
validateY <- dataValidate[, responseName]

# test matrices
testX <- data.matrix(dataTest[, predictorNames])
testY <- dataTest[, responseName]

# Sparse versions for some models
trainX_sparse <- sparse.model.matrix(object=High ~ ., data=dataTrain)
validateX_sparse <- sparse.model.matrix(object=High ~ ., data=dataValidate)
testX_sparse <- sparse.model.matrix(object=High ~ ., data=dataTest)

Feedforward Network

Helper Functions

This is a function that allows mxnet to calculate log-loss based on the logloss function from the Metrics package.

# log-loss
mx.metric.mlogloss <- mx.metric.custom("mlogloss", function(label, pred){
    return(Metrics::logLoss(label, pred))
})

Network Formulation

We build the model symbolically. We use a feedforward network with two hidden layers. The first hidden layer has 256 units and the second has 128 units. We also use dropout and batch normalization for regularization. The last step is to use a logistic sigmoid (inverse logit) for the logistic regression output.

net <- mx.symbol.Variable('data') %>%
    # drop out 20% of predictors
    mx.symbol.Dropout(p=0.2, name='Predictor_Dropout') %>%
    # a fully connected layer with 256 units
    mx.symbol.FullyConnected(num_hidden=256, name='fc_1') %>%
    # batch normalize the units
    mx.symbol.BatchNorm(name='bn_1') %>%
    # use the rectified linear unit (relu) for the activation function
    mx.symbol.Activation(act_type='relu', name='relu_1') %>%
    # drop out 50% of the units
    mx.symbol.Dropout(p=0.5, name='dropout_1') %>%
    # a fully connected layer with 128 units
    mx.symbol.FullyConnected(num_hidden=128, name='fc_2') %>%
    # batch normalize the units
    mx.symbol.BatchNorm(name='bn_2') %>%
    # use the rectified linear unit (relu) for the activation function
    mx.symbol.Activation(act_type='relu', name='relu_2') %>%
    # drop out 50% of the units
    mx.symbol.Dropout(p=0.5, name='dropout_2') %>%
    # fully connect to the output layer which has just the 1 unit
    mx.symbol.FullyConnected(num_hidden=1, name='out') %>%
    # use the sigmoid output
    mx.symbol.LogisticRegressionOutput(name='output')

Inspect the Network

By inspecting the symbolic network we see that it is actually just a C++ pointer. We also see its arguments and a visualization.

net
## C++ object <0000000018ed9aa0> of class 'MXSymbol' <0000000018c30d00>
arguments(net)
##  [1] "data"         "fc_1_weight"  "fc_1_bias"    "bn_1_gamma"  
##  [5] "bn_1_beta"    "fc_2_weight"  "fc_2_bias"    "bn_2_gamma"  
##  [9] "bn_2_beta"    "out_weight"   "out_bias"     "output_label"
graph.viz(net)

Network Training

With the data prepared and the network specified we now train the model. First we set the envinronment variable MXNET_CPU_WORKER_NTHREADS=4 since this demo is on a laptop with four threads. Using a GPU will speed up the computations. We also set the random seed with mx.set.seed for reproducibility.

We use the Adam optimization algorithm which has an adaptive learning rate which incorporates momentum.

# use four CPU threads
Sys.setenv('MXNET_CPU_WORKER_NTHREADS'=4)

# set the random seed
mx.set.seed(1234)

# train the model
mod_net <- mx.model.FeedForward.create(
    symbol            = net,    # the symbolic network
    X                 = trainX, # the predictors
    y                 = trainY, # the response
    optimizer         = "adam", # using the Adam optimization method
    eval.data         = list(data=validateX, label=validateY), # validation data
    ctx               = mx.cpu(), # use the cpu for training
    eval.metric       = mx.metric.mlogloss, # evaluate with log-loss
    num.round         = 50,     # 50 epochs
    learning.rate     = 0.001,   # learning rate
    array.batch.size  = 256,    # batch size
    array.layout      = "rowmajor"  # the data is stored in row major format
)

Predictions

Statisticians call this step prediction while the deep learning field calls it inference which has an entirely different meaning in statistics.

preds_net <- predict(mod_net, testX, array.layout="rowmajor") %>% t

Elastic Net

Model Training

We fit an Elastic Net model with glmnet.

registerDoParallel(cl=4)

set.seed(1234)
mod_glmnet <- cv.glmnet(x=trainX_sparse, y=trainY, 
                        alpha=.5, family='binomial', 
                        type.measure='auc',
                        nfolds=5, parallel=TRUE)

Predictions

preds_glmnet <- predict(mod_glmnet, newx=testX_sparse, s='lambda.1se', type='response')

XGBoost

Model Training

We fit a random forest with xgboost.

set.seed(1234)

trainXG <- xgb.DMatrix(data=trainX_sparse, label=trainY)
validateXG <- xgb.DMatrix(data=validateX_sparse, label=validateY)

watchlist <- list(train=trainXG, validate=validateXG)

mod_xgboost <- xgb.train(data=trainXG, 
                nrounds=1, nthread=4, 
                num_parallel_tree=500, subsample=0.5, colsample_bytree=0.5,
                objective='binary:logistic',
                eval_metric = "error", eval_metric = "logloss",
                print_every_n=1, watchlist=watchlist)
## [1]  train-error:0.104713    train-logloss:0.525032  validate-error:0.108254 validate-logloss:0.527535

Predictions

preds_xgboost <- predict(mod_xgboost, newdata=testX_sparse)

SVM

Model Training

set.seed(1234)
mod_svm <- e1071::svm(x=trainX_sparse, y=trainY, probability=TRUE, type='C')

This model did not train in a reasonable time.

Results

ROC

rocData <- dplyr::bind_rows(
    cbind(data.frame(roc(testY, preds_glmnet, direction="<")[c('specificities', 'sensitivities')]), Model='glmnet'),
    cbind(data.frame(roc(testY, preds_xgboost, direction="<")[c('specificities', 'sensitivities')]), Model='xgboost'),
    cbind(data.frame(roc(testY, preds_net, direction="<")[c('specificities', 'sensitivities')]), Model='Net')
)
ggplotly(ggplot(rocData, aes(x=specificities, y=sensitivities)) + geom_line(aes(color=Model, group=Model)) + scale_x_reverse(), width=800, height=600)

Related Posts



Jared Lander is the Chief Data Scientist of Lander Analytics a New York data science firm, Adjunct Professor at Columbia University, Organizer of the New York Open Statistical Programming meetup and the New York and Washington DC R Conferences and author of R for Everyone.