g14 <- read.csv2(file.path(dataDir, 'football', 'pbp-2014.csv'), sep=',', header=TRUE, stringsAsFactors=FALSE) g15 <- read.csv2(file.path(dataDir, 'football', 'pbp-2015.csv'), sep=',', header=TRUE, stringsAsFactors=FALSE) games <- bind_rows(g14, g15)
oneOff <- games %>% filter(OffenseTeam == 'NYG', PlayType %in% c('PASS', 'RUSH')) %>% mutate(PlayType=factor(PlayType, levels=c('RUSH', 'PASS')), Down=factor(Down, levels=c(1, 2, 3, 4)))
ggplot(oneOff, aes(x=PlayType)) + geom_bar(fill='darkgrey')
ggplot(oneOff, aes(x=PlayType)) + geom_bar(fill='darkgrey') + facet_wrap(~Down)
\[ p(y_i=1) = \text{logit}^{-1}(\boldsymbol{X}_i\boldsymbol{\beta}) \]
\[ \text{logit}^{-1}(x) = \frac{e^x}{1+e^x} = \frac{1}{1+e^{-x}} \]
passRushMod1 <- glm(PlayType ~ Down - 1, data=oneOff, family=binomial) summary(passRushMod1)
Call: glm(formula = PlayType ~ Down - 1, family = binomial, data = oneOff) Deviance Residuals: Min 1Q Median 3Q Max -1.9174 -1.1486 0.5887 0.9610 1.2065 Coefficients: Estimate Std. Error z value Pr(>|z|) Down1 -0.06811 0.06527 -1.044 0.29671 Down2 0.53305 0.07709 6.914 4.7e-12 *** Down3 1.66501 0.13034 12.774 < 2e-16 *** Down4 1.27297 0.42762 2.977 0.00291 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 2958.4 on 2134 degrees of freedom Residual deviance: 2672.6 on 2130 degrees of freedom AIC: 2680.6 Number of Fisher Scoring iterations: 4
coefplot(passRushMod1, title='Probability of Pass')
invlogit <- function(x) { 1/(1 + exp(-x)) } round(invlogit(-6:6), 2)
[1] 0.00 0.01 0.02 0.05 0.12 0.27 0.50 0.73 0.88 0.95 0.98 0.99 1.00
coefplot(passRushMod1, trans=invlogit, title='Probability of Pass')
passRushMod2 <- glm(PlayType ~ Down + ToGo - 1, data=oneOff, family=binomial) coefplot(passRushMod2, trans=invlogit, title='Probability of Pass')
# make grid of scenarios scenarios <- expand.grid(ToGo=1:15, Down=1:4) %>% as.tbl %>% mutate(Down=factor(Down, levels=c(1, 2, 3, 4))) # make prediction based on model scenarioPredict <- predict(passRushMod2, newdata=scenarios, type='response', se.fit=TRUE) # build confidence intervals scenarios <- scenarios %>% mutate(Prediction=scenarioPredict$fit, Lower=Prediction - 2*scenarioPredict$se.fit, Upper=Prediction + 2*scenarioPredict$se.fit)
ggplot(scenarios, aes(x=ToGo)) + scale_y_continuous(label=scales::percent) + geom_ribbon(aes(ymin=Lower, ymax=Upper), fill='lightgrey') + geom_line(aes(y=Prediction)) + facet_wrap(~Down, nrow=2)
onePass <- oneOff %>% filter(PlayType == 'PASS') %>% mutate(Receiver=str_extract_all(Description, pattern=' \\d{1,2}-\\w\\.\\w+( |\\.)', simplify=TRUE) %>% `[`(, 2), Receiver=str_replace_all(Receiver, '(\\d{1,2}-)|(\\.$)', ''), Receiver= str_trim(Receiver)) %>% filter(Receiver %in% c("L.DONNELL", "V.CRUZ", "R.RANDLE", "O.BECKHAM", "H.NICKS"))
\[ \hat{p}_{mk} = \frac{1}{N_m} \sum_{x_i \in R_m} I(y_i = k) \]
passTree <- rpart(Receiver ~ Down + ToGo + Quarter + Minute + Second + YardLine + YardLineDirection + Formation, data=onePass)
rpart.plot(passTree, extra=104, branch=1, uniform=FALSE, varlen=0, faclen=9, under=TRUE, box.col=c(8), border.col=1, shadow.col='darkgray', col=4, eq=": ", branch.col='blue', nspace=.1, minbranch=.9, cex=.8)