% New York Pizza % Jared P. Lander;Cornell NYC % January 31, 2013 ```{r settings,include=FALSE} opts_chunk$set(cache=TRUE) make.image <- function(file, width="100%", align="center", encode=TRUE) { cat(sprintf("<%s>", align, if(encode) knitr::image_uri(f=file) else file, width, align)) } # options(rstudio.markdownToHTML = # function(inputFile, outputFile) { # system(paste("pandoc -s -S --webtex --toc -t slidy --self-contained", shQuote(inputFile), "-o", shQuote(outputFile))) # } # ) ``` ## About Me * Owner [JP Lander Consulting](http://www.jplander.com) * Author of Upcoming Book about R * Adjunct Professor at [Columbia University](http://www.columbia.edu) * Coorganizer of the [New York Open Statistical Programming](http://www.meetup.com/nyhackr) Meetup * Website: [http://www.jaredlander.com](http://www.jaredlander.com) * Twitter: [@jaredlander](https://twitter.com/jaredlander) ## ```{r cover-shot,results='asis',echo=FALSE} make.image(file="../Pictures/Lombardi's Pie and Oven.jpg", width="50%", encode=FALSE) ``` ## What is New York Style? ```{r ny-style,echo=FALSE,results='asis'} make.image(file="../Pictures/NY Style.png", width="75%", encode=FALSE) ``` ## Family Tree ```{r family-tree,echo=FALSE,results='asis',cache=FALSE} make.image(file="../Pictures/Family Tree.png", width="55%", encode=FALSE) ``` ## The Data ```{r the-data,echo=FALSE,results='asis'} make.image(file="../Pictures/The Data.png", width="75%",encode=F) pizza <- read.table(file="../Datasets/Pizza Consolidated.csv",header=TRUE,sep=",") ## takes in data ``` * MenuPages.com has menus, ratings, locations and price information for nearly 700 restaurants tagged as pizza in Manhattan and parts of Brooklyn * Slice, a pizza blog, has a definitive listing (and maps) of coal pizzerias * An exhaustive Google search provided wood classifications # Exploratory Data Analysis ## Number of Reviews Histogram ```{r ratings,echo=-3,fig.cap=""} # remove known outlier pizza <- pizza[pizza$Name != "Otto Enoteca and Pizzeria", ] require(ggplot2) ggplot(pizza, aes(x=Number.Reviews)) + geom_histogram(binwidth=10, fill="grey", color="darkgrey") + xlab("Number of Reviews") ``` ## Reviews by Area ```{r ratings-by-area,fig.cap=""} ggplot(pizza, aes(x=Number.Reviews)) + geom_histogram(binwidth=10, fill="grey", color="darkgrey") + facet_wrap(~ Area) + xlab("Number of Reviews") ``` ## Reviews by Fuel ```{r ratings-by-fuel,fig.cap="",fig.show='hold'} ggplot(pizza, aes(x=Number.Reviews)) + geom_histogram(binwidth=10, fill="grey", color="darkgrey") + facet_wrap(~ Fuel) + xlab("Number of Reviews") + ggtitle("Histogram by Fuel") ggplot(pizza, aes(x=Number.Reviews)) + geom_histogram(binwidth=10, fill="grey", color="darkgrey") + facet_wrap(~ Fuel, scales="free_y") + xlab("Number of Reviews") + ggtitle("Histogram by Fuel\nFree Scales") ``` ## Reviews by Fuel and Area ```{r ratings-by-area-and-fuel,fig.cap=""} ggplot(pizza, aes(x=Number.Reviews)) + geom_histogram(binwidth=10, fill="grey", color="darkgrey") + facet_grid(Fuel ~ Area, scales="free_y") + xlab("Number of Reviews") ``` ## Rating by Number of Reviews ```{r rating-by-reviews,fig.cap="",fig.show='hold'} ggplot(pizza, aes(x=Number.Reviews, y=Rating)) + geom_point() + xlab("Number of Reviews") ggplot(pizza, aes(x=Rating)) + geom_histogram(binwidth=.5, fill="grey", color="darkgrey") ``` # Poisson Distribution ## The Distribution $$ Pr(X = k) = \frac{\lambda^ke^{-\lambda}}{k!} $$ $$ \lambda = E(X) = Var(X) $$ ```{r pois-dist,include=FALSE} require(reshape2) require(stringr) pois1 <- rpois(n=10000, lambda=1) pois2 <- rpois(n=10000, lambda=2) pois5 <- rpois(n=10000, lambda=5) pois10 <- rpois(n=10000, lambda=10) pois20 <- rpois(n=10000, lambda=20) pois <- data.frame(Lambda.1=pois1, Lambda.2=pois2, Lambda.5=pois5, Lambda.10=pois10, Lambda.20=pois20) pois <- melt(data=pois, variable.name="Lambda", value.name="x") pois$Lambda <- as.factor(as.numeric(str_extract(string=pois$Lambda, pattern="\\d+"))) tail(pois) ``` ## Lambda ```{r lambda-hist,fig.cap=""} ggplot(pois, aes(x=x)) + geom_histogram(binwidth=1) + facet_wrap(~ Lambda) + ggtitle("Probability Mass Function") ``` ## Lambda ```{r lambda-dens,fig.cap=""} ggplot(pois, aes(x=x)) + geom_density(aes(group=Lambda, color=Lambda, fill=Lambda), adjust=4, alpha=1/2) + scale_color_discrete() + scale_fill_discrete() + ggtitle("Probability Mass Function") ``` ## Uses * Number of Accidents * Number of People at Checkout * Number of Lawsuits * Number of Trains in an Hour * Anything that is Counted ## Poisson Regression $$ y_i \sim pois(\theta_i) $$ $$ \theta_i = e^{X_i\beta} $$ # Back to Pizza ## Fit the Model ```{r fit-pois-mod,echo=TRUE} poisModReg <- glm(Number.Reviews ~ Area + Fuel + Fuel*Price, data=pizza, family=poisson(link="log")) summary(poisModReg)$coef ``` ## Visualize the Coefficients ```{r pois-mod-reg-coef,message=FALSE,fig.cap=""} require(coefplot) coefplot(poisModReg, sort="mag") ``` ## Check Overdispersion $$ z_i = \frac{y_i - \hat{y_i}}{sd(\hat{y_i})} = \frac{y_i - u_i\hat{\theta_i}}{\sqrt{u_i\hat{\theta_i}}} $$ $$ OD = \frac{1}{n - p}\sum_{i=1}^nz_i^2 $$ ```{r check-overdispersion} z <- (pizza$Number.Reviews - poisModReg$fitted.values) / sqrt(poisModReg$fitted.values) # Overdispersion Factor sum(z^2) / poisModReg$df.residual # Overdispersion p-value pchisq(sum(z^2), poisModReg$df.residual) ``` ## Fit Overdispersed Model ```{r fit-overdispersed} poisMod <- glm(Number.Reviews ~ Area + Fuel + Fuel*Price, data=pizza, family=quasipoisson(link="log")) summary(poisMod)$coef ``` ## Visualize the Coefficients ```{r pois-mod-coef,message=FALSE,fig.cap=""} require(coefplot) coefplot(poisMod, sort="mag") ``` ## Visualize Overdispersed vs Corrected ```{r pois-mod-coef-overdispersed,message=FALSE,fig.cap="",fig.show='hold'} coefplot(poisModReg, sort="mag", title="Regular Model") coefplot(poisMod, sort="mag", title="Overdispersed") ``` ## Other Parameterizations ```{r other-models} mod1 <- glm(Number.Reviews ~ Area + Fuel + Price, data=pizza, family=quasipoisson(link="log")) mod2 <- glm(Number.Reviews ~ Fuel + Price, data=pizza, family=quasipoisson(link="log")) mod3 <- glm(Number.Reviews ~ Fuel * Price, data=pizza, family=quasipoisson(link="log")) nullMod <- glm(Number.Reviews ~ 1, data=pizza, family=quasipoisson(link="log")) ``` # Goodness of Fit ## Wald Test $$ \frac{\hat{\theta} - \theta_0}{se(\hat{\theta})} $$ ```{r wald-test,message=FALSE} require(lmtest) waldtest(poisMod) ``` ## Wald Test on Multiple Models ```{r wald-mult} waldtest(poisMod, mod1, mod2, mod3, nullMod) ``` ## Drop-In Deviance Test $$ d(y;\hat{\theta}) = 2\sum_{i}y_ilog(\frac{y_i}{\hat{\theta_i}}) $$ Expect a drop of 2 for each additional parameter ```{r dropin-test} anova(poisMod) ``` ## Drop-In Deviance Test Multiple Models ```{r dropin-mult-test} anova(nullMod, mod2, mod1, mod3, poisMod) ``` # Rankings ## Fitted Values ```{r fitted-values,echo=-1} options("width"=80) pizza$Fitted <- poisMod$fitted.values print(pizza[order(pizza$Fitted, decreasing=TRUE)[1:15], c("Fitted", "Rating", "Name", "Area", "Price", "Number.Reviews", "Fuel")], row.names=FALSE) ``` ## Coefficients ```{r coef-again,echo=FALSE,fig.cap=""} coefplot(poisMod, sort="mag") ``` ## Most Popular by Number of Reviews ```{r rank-by-reviews} print(pizza[order(pizza$Number.Reviews, decreasing=TRUE)[1:15], c("Rating", "Name", "Area", "Price", "Number.Reviews", "Fuel")], row.names=FALSE) ``` ## Highest Ratings ```{r rank-by-rating,echo=-1} options("width"=80) print(pizza[order(pizza$Rating, decreasing=TRUE)[1:15], c("Rating", "Name", "Area", "Price", "Number.Reviews", "Fuel")], row.names=FALSE) ``` ## Common Top Pizzerias
Lombardi's
Di Fara
Joe's Famous
John's of Bleecker
Totonno's (Coney Island)
Vinny Vincenz
Denino's Pizzeria & Tavern
No. 28
L&B Spumoni Gardens
Patsy's (East Harlem)
Nick's Pizza
Artichoke
Motorino
New York Pizza Suprema
Keste
Franny's
Joe & Pat's
Forcella
# Conclusions ## Findings
* Coal ovens attract large crowds and the higher prices are worth paying
## Findings
* Coal ovens attract large crowds and the higher prices are worth paying
* The old adage about pizza is true: "Even when it's bad, it's still good."
## Blogosphere
[Slice/Serious Eats](http://slice.seriouseats.com/archives/2010/03/the-moneyball-of-pizza-statistician-uses-statistics-to-find-nyc-best-pizza.html)
[Midtown Lunch](http://midtownlunch.com/2010/03/24/everybody-agrees-midtown-pizza-sucks/)
[Revolution Analytics](http://blog.revolutionanalytics.com/2010/03/predicting-pizza.html)
NBC New York
## Jared's Picks * Keste * Lombardi's * Motorino * Artichoke * Don Antonio * Posto * John's of Bleecker * Vinny Vincenz * Luzzo's * L&B Spumoni Gardens # Questions? ## Built With... - [RStudio](http://www.rstudio.org) - [knitr](http://yihui.name/knitr/) - [Pandoc](http://johnmacfarlane.net/pandoc/README.html)