Python and R

Jared P. Lander
Python in Finance

April 5, 2013

Why am I Here?

The Languages

Python

A great programming language

Even better at data thanks to Pandas

R

Tailor Made for Statistics

Cutting Edge Algorithms

Statistical Graphics

The Python Side

Python Packages and Magic

# Pandas
import pandas as pd
import pandas.rpy.common as rpy

# make R interaction easy
%load_ext rmagic

Read in Some Data

att = pd.read_csv("../../data/att.csv")
print(att.head())
##   T.Open T.High T.Low T.Close T.Volume T.Adjusted       Date
## 1  35.67  35.78 34.78   34.95 33694300      25.06 2007-01-03
## 2  34.95  35.24 34.07   34.50 44285400      24.74 2007-01-04
## 3  34.40  34.54 33.95   33.96 36561800      24.35 2007-01-05
## 4  33.40  34.01 33.21   33.81 40237400      24.50 2007-01-08
## 5  33.85  34.41 33.66   33.94 40082600      24.59 2007-01-09
## 6  34.20  35.00 31.94   34.03 29964300      24.66 2007-01-10

Function

# function to load pandas DataFrame into R
# written by Wes McKinney
def pandas_to_r(df, name):
    from rpy2.robjects import r, globalenv
    r_df = rpy.convert_to_r_dataframe(df)
    globalenv[name] = r_df

Send to R

pandas_to_r(att, "att")

See it in R

%%R
print(head(att))
##   T.Open T.High T.Low T.Close T.Volume T.Adjusted       Date
## 1  35.67  35.78 34.78   34.95 33694300      25.06 2007-01-03
## 2  34.95  35.24 34.07   34.50 44285400      24.74 2007-01-04
## 3  34.40  34.54 33.95   33.96 36561800      24.35 2007-01-05
## 4  33.40  34.01 33.21   33.81 40237400      24.50 2007-01-08
## 5  33.85  34.41 33.66   33.94 40082600      24.59 2007-01-09
## 6  34.20  35.00 31.94   34.03 29964300      24.66 2007-01-10

R Code From Python

Some Packages

%%R
require(quantmod)
require(xts)
require(forecast)

Check Types

%%R
# convert to date
att$Date <- as.Date(as.character(att$Date))
att$T.Close <- as.numeric(att$T.Close)
attClose <- xts(att$T.Close, order.by=att$Date)

xts Plot

%%R
plot(attClose)

Chart Series

%%R
chartSeries(attClose)

ggplot

%%R
require(ggplot2)
p <- ggplot(att, aes(x=Date, y=T.Close)) + geom_line()
print(p + geom_smooth())

Some R Models

ARIMA

%%R
attFit <- auto.arima(ts(attClose))
attFit
## Series: ts(attClose) 
## ARIMA(0,1,2)                    
## 
## Coefficients:
##          ma1     ma2
##       -0.069  -0.113
## s.e.   0.025   0.026
## 
## sigma^2 estimated as 0.236:  log likelihood=-1093
## AIC=2193   AICc=2193   BIC=2209

ARIMA Residuals

%%R
plot(attFit$residuals)

ACF

%%R
acf(attFit$residuals)
pacf(attFit$residuals)

Forecast

%%R
plot(forecast(object=attFit, h=180))

GARCH

Generalized AutoRegressive Conditional Heteroskedasticity

GARCH

%%R
require(rugarch)
attSpec <- ugarchspec(variance.model=list(model="sGARCH", garchOrder=c(1, 1)), 
                      mean.model=list(armaOrder=c(1, 1)), distribution.model="std")
attGarch <- ugarchfit(spec=attSpec, data=attClose)

print(attGarch@fit$matcoef)
print(infocriteria(attGarch))
##         Estimate  Std. Error  t value  Pr(>|t|)
## mu     35.159848   1.3282100  26.4716 0.000e+00
## ar1     0.997009   0.0013019 765.8227 0.000e+00
## ma1    -0.009937   0.0268014  -0.3708 7.108e-01
## omega   0.001335   0.0006916   1.9297 5.365e-02
## alpha1  0.069952   0.0149684   4.6733 2.964e-06
## beta1   0.925012   0.0153999  60.0662 0.000e+00
## shape   7.581676   1.4048336   5.3968 6.782e-08
##                    
## Akaike       0.9975
## Bayes        1.0214
## Shibata      0.9975
## Hannan-Quinn 1.0064

Residuals

%%R
plot(attGarch@fit$residuals, type="l")
plot(attGarch, which=10)

Model Comparison

%%R
# ARMA(1,1)
attSpec1 <- ugarchspec(variance.model=list(model="sGARCH", garchOrder=c(1, 1)), 
mean.model=list(armaOrder=c(1, 1)), distribution.model="std")
# ARMA(0,0)
attSpec2 <- ugarchspec(variance.model=list(model="sGARCH", garchOrder=c(1, 1)), 
mean.model=list(armaOrder=c(0, 0)), distribution.model="std")
# ARMA(0,2)
attSpec3 <- ugarchspec(variance.model=list(model="sGARCH", garchOrder=c(1, 1)), 
mean.model=list(armaOrder=c(0, 2)), distribution.model="std")
# ARMA(1,2)
attSpec4 <- ugarchspec(variance.model=list(model="sGARCH", garchOrder=c(1, 1)), 
mean.model=list(armaOrder=c(1, 2)), distribution.model="std")

attGarch1 <- ugarchfit(spec=attSpec1, data=attClose)
attGarch2 <- ugarchfit(spec=attSpec2, data=attClose)
attGarch3 <- ugarchfit(spec=attSpec3, data=attClose)
attGarch4 <- ugarchfit(spec=attSpec4, data=attClose)

Compare AIC

%%R
print(infocriteria(attGarch1)[1])
print(infocriteria(attGarch2)[1])
print(infocriteria(attGarch3)[1])
print(infocriteria(attGarch4)[1])
## [1] 0.9975
## [1] 5.109
## [1] 3.406
## [1] 0.9963

GARCH Predictions

%%R
attPred <- ugarchboot(attGarch, n.ahead=50, method = c("Partial", "Full")[1])
plot(attPred, which=1)

Machine Learning

The Elastic Net



\min_{\beta_{0},\beta \in \mathbb{R}^{p+1}} \left[ \frac{1}{2N} \sum_{i=1}^N \left( y_i - \beta_0 -x_i^T\beta \right)^2 + \lambda P_{\alpha} \left(\beta \right) \right]
where


P_{\alpha} \left(\beta \right) = \left(1 - \alpha \right) \frac{1}{2}||\Gamma\beta||_{\mathit{l}_2}^2 + \alpha ||\Gamma\beta||_{\mathit{l}_1}

glmnet

acs = pd.read_csv("../../data/acs_ny.csv")
pandas_to_r(acs, "acs")

%%R
require(useful)

acs$Income <- with(acs, FamilyIncome >= 150000)

acsX <- build.x(Income ~ NumBedrooms + NumChildren + NumPeople + NumRooms + NumUnits + 
NumVehicles + NumWorkers + OwnRent + YearBuilt + ElectricBill + 
FoodStamp + HeatingFuel + Insurance + Language - 1, data=acs, contrasts=FALSE)

acsY <- build.y(Income ~ NumBedrooms + NumChildren + NumPeople + NumRooms + NumUnits + 
NumVehicles + NumWorkers + OwnRent + YearBuilt + ElectricBill + 
FoodStamp + HeatingFuel + Insurance + Language - 1, data=acs)

require(glmnet)
# run the cross-validated glmnet
acsCV1 <- cv.glmnet(x=acsX, y=acsY, family="binomial", nfold=5)

glmnet Diagnostics

%%R
plot(acsCV1)
plot(acsCV1$glmnet.fit, xvar="lambda")

glmnet Coefficients

%%R
theCoef <- as.matrix(coef(acsCV1, s="lambda.1se"))
coefDF <- data.frame(Value=theCoef, Coefficient=rownames(theCoef))
coefDF <- coefDF[nonzeroCoef(coef(acsCV1, s="lambda.1se")), ]
g <- ggplot(coefDF, aes(x=X1, y=reorder(Coefficient, X1))) + 
    geom_vline(xintercept=0, color="grey", linetype=2) + 
    geom_point(color="blue") + 
    labs(x="Value", y="Coefficient", title="Coefficient Plot")

glmnet Coefficient Plot

%%R
print(g)

Other Options

Assign R objects to Python Objects

More like Python Code

About Me

Jared P. Lander

Owner of JP Lander Consulting

Adjunct Professor at Columbia University

Organizer of New York Open Statistical Programming Meetup

Author of R for Non-Statisticans (August 2013)

Twitter: @jaredlander

Website: http://www.jaredlander.com

The Tools