Jared P. Lander
Cornell Tech
February 5, 2014
# remove known outlier
pizza <- pizza[pizza$Name != "Otto Enoteca and Pizzeria", ]
ggplot(pizza, aes(x = Number.Reviews)) + geom_histogram(binwidth = 10, fill = "grey",
color = "darkgrey") + xlab("Number of Reviews")
ggplot(pizza, aes(x = Number.Reviews)) + geom_histogram(binwidth = 10, fill = "grey",
color = "darkgrey") + facet_wrap(~Area) + xlab("Number of Reviews")
ggplot(pizza, aes(x = Number.Reviews)) + geom_histogram(binwidth = 10, fill = "grey",
color = "darkgrey") + facet_wrap(~Fuel) + xlab("Number of Reviews") + ggtitle("Histogram by Fuel")
ggplot(pizza, aes(x = Number.Reviews)) + geom_histogram(binwidth = 10, fill = "grey",
color = "darkgrey") + facet_wrap(~Fuel, scales = "free_y") + xlab("Number of Reviews") +
ggtitle("Histogram by Fuel\nFree Scales")
ggplot(pizza, aes(x = Number.Reviews)) + geom_histogram(binwidth = 10, fill = "grey",
color = "darkgrey") + facet_grid(Fuel ~ Area, scales = "free_y") + xlab("Number of Reviews")
ggplot(pizza, aes(x = Number.Reviews, y = Rating)) + geom_point() + xlab("Number of Reviews")
ggplot(pizza, aes(x = Rating)) + geom_histogram(binwidth = 0.5, fill = "grey",
color = "darkgrey")
ggplot(pois, aes(x = x)) + geom_histogram(binwidth = 1) + facet_wrap(~Lambda) +
ggtitle("Probability Mass Function")
ggplot(pois, aes(x = x)) + geom_density(aes(group = Lambda, color = Lambda,
fill = Lambda), adjust = 4, alpha = 1/2) + scale_color_discrete() + scale_fill_discrete() +
ggtitle("Probability Mass Function")
poisModReg <- glm(Number.Reviews ~ Area + Fuel + Fuel * Price, data = pizza,
family = poisson(link = "log"))
summary(poisModReg)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.8129 0.05903 64.588 0.000e+00
## AreaUptown 0.1832 0.02172 8.435 3.318e-17
## FuelGas -1.6387 0.06019 -27.224 3.379e-163
## FuelWood -0.7237 0.08885 -8.145 3.793e-16
## PriceExpensive -0.5434 0.07919 -6.862 6.789e-12
## FuelGas:PriceExpensive 1.1140 0.08289 13.439 3.559e-41
## FuelWood:PriceExpensive 0.5420 0.10883 4.980 6.347e-07
z <- (pizza$Number.Reviews - poisModReg$fitted.values)/sqrt(poisModReg$fitted.values)
# Overdispersion Factor
sum(z^2)/poisModReg$df.residual
## [1] 11.73
# Overdispersion p-value
pchisq(sum(z^2), poisModReg$df.residual)
## [1] 1
poisMod <- glm(Number.Reviews ~ Area + Fuel + Fuel * Price, data = pizza, family = quasipoisson(link = "log"))
summary(poisMod)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8129 0.20215 18.861 2.319e-63
## AreaUptown 0.1832 0.07439 2.463 1.404e-02
## FuelGas -1.6387 0.20612 -7.950 8.525e-15
## FuelWood -0.7237 0.30427 -2.379 1.768e-02
## PriceExpensive -0.5434 0.27119 -2.004 4.551e-02
## FuelGas:PriceExpensive 1.1140 0.28386 3.925 9.634e-05
## FuelWood:PriceExpensive 0.5420 0.37269 1.454 1.463e-01
coefplot(poisModReg, sort = "mag", title = "Regular Model")
coefplot(poisMod, sort = "mag", title = "Overdispersed")
mod1 <- glm(Number.Reviews ~ Area + Fuel + Price, data = pizza, family = quasipoisson(link = "log"))
mod2 <- glm(Number.Reviews ~ Fuel + Price, data = pizza, family = quasipoisson(link = "log"))
mod3 <- glm(Number.Reviews ~ Fuel * Price, data = pizza, family = quasipoisson(link = "log"))
nullMod <- glm(Number.Reviews ~ 1, data = pizza, family = quasipoisson(link = "log"))
require(lmtest)
waldtest(poisMod)
## Wald test
##
## Model 1: Number.Reviews ~ Area + Fuel + Fuel * Price
## Model 2: Number.Reviews ~ 1
## Res.Df Df F Pr(>F)
## 1 636
## 2 642 -6 24.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
waldtest(poisMod, mod1, mod2, mod3, nullMod)
## Wald test
##
## Model 1: Number.Reviews ~ Area + Fuel + Fuel * Price
## Model 2: Number.Reviews ~ Area + Fuel + Price
## Model 3: Number.Reviews ~ Fuel + Price
## Model 4: Number.Reviews ~ Fuel * Price
## Model 5: Number.Reviews ~ 1
## Res.Df Df F Pr(>F)
## 1 636
## 2 638 -2 9.33 0.0001 ***
## 3 639 -1 6.26 0.0126 *
## 4 637 2 9.54 8.2e-05 ***
## 5 642 -5 27.93 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Expect a drop of 2 for each additional parameter
anova(poisMod)
## Analysis of Deviance Table
##
## Model: quasipoisson, link: log
##
## Response: Number.Reviews
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 642 7496
## Area 1 105 641 7391
## Fuel 2 872 639 6519
## Price 1 376 638 6143
## Fuel:Price 2 207 636 5936
anova(nullMod, mod2, mod1, mod3, poisMod)
## Analysis of Deviance Table
##
## Model 1: Number.Reviews ~ 1
## Model 2: Number.Reviews ~ Fuel + Price
## Model 3: Number.Reviews ~ Area + Fuel + Price
## Model 4: Number.Reviews ~ Fuel * Price
## Model 5: Number.Reviews ~ Area + Fuel + Fuel * Price
## Resid. Df Resid. Dev Df Deviance
## 1 642 7496
## 2 639 6218 3 1278
## 3 638 6143 1 76
## 4 637 6007 1 136
## 5 636 5936 1 71
pizza$Fitted <- poisMod$fitted.values
print(pizza[order(pizza$Fitted, decreasing = TRUE)[1:15], c("Fitted", "Rating",
"Name", "Area", "Price", "Number.Reviews", "Fuel")], row.names = FALSE)
## Fitted Rating Name Area Price Number.Reviews Fuel
## 54.39 3.5 Angelo's Pizza Uptown Cheap 55 Coal
## 54.39 3.5 John's Pizzeria Uptown Cheap 33 Coal
## 54.39 4.0 John's Pizzeria Uptown Cheap 83 Coal
## 45.28 4.0 John's of Bleecker Street Downtown Cheap 28 Coal
## 45.28 3.5 Lombardi's Downtown Cheap 97 Coal
## 45.28 2.0 South Brooklyn Pizza Downtown Cheap 3 Coal
## 31.58 4.5 Patsy's Uptown Expensive 10 Coal
## 31.58 3.5 Patsy's Pizzeria Uptown Expensive 13 Coal
## 31.58 3.5 Patsy's Pizzeria Uptown Expensive 32 Coal
## 31.58 3.5 Patsy's Pizzeria Uptown Expensive 25 Coal
## 31.58 3.0 Totonno's Uptown Expensive 44 Coal
## 26.37 3.5 Sal's Pizza Uptown Cheap 4 Wood
## 26.34 4.0 Al Forno Uptown Expensive 47 Wood
## 26.34 3.5 Bella Blu Uptown Expensive 17 Wood
## 26.34 3.0 Bricco Uptown Expensive 16 Wood
print(pizza[order(pizza$Number.Reviews, decreasing = TRUE)[1:15], c("Rating",
"Name", "Area", "Price", "Number.Reviews", "Fuel")], row.names = FALSE)
## Rating Name Area Price Number.Reviews Fuel
## 3.5 Lombardi's Downtown Cheap 97 Coal
## 3.5 Arte Cafe Uptown Expensive 86 Gas
## 4.0 John's Pizzeria Uptown Cheap 83 Coal
## 3.5 Il Corallo Trattoria Downtown Cheap 81 Gas
## 3.5 Adrienne's Pizza Bar Downtown Cheap 75 Wood
## 3.5 Pizza Shack Downtown Cheap 73 Gas
## 4.0 Nina's Uptown Expensive 64 Gas
## 3.5 Brick Oven Pizza 33 Downtown Cheap 61 Gas
## 3.5 Big Nick's Uptown Cheap 58 Gas
## 3.5 Celeste Uptown Expensive 57 Wood
## 3.5 Posto Downtown Cheap 56 Gas
## 3.5 Angelo's Pizza Uptown Cheap 55 Coal
## 3.5 Cafe Viva Uptown Cheap 54 Gas
## 4.0 Simply Pasta Downtown Expensive 54 Gas
## 4.0 44 Southwest Uptown Expensive 53 Gas
print(pizza[order(pizza$Rating, decreasing = TRUE)[1:15], c("Rating", "Name",
"Area", "Price", "Number.Reviews", "Fuel")], row.names = FALSE)
## Rating Name Area Price Number.Reviews Fuel
## 5.0 94 Deli & Salad Downtown Cheap 1 Gas
## 5.0 BB&R Uptown Cheap 1 Gas
## 5.0 Bella Maria Downtown Cheap 2 Gas
## 5.0 Cafe Bonjour Downtown Cheap 3 Gas
## 5.0 Energy Fuel Downtown Cheap 2 Gas
## 5.0 Georgios Pizzeria Downtown Cheap 2 Gas
## 5.0 Grandaisy Bakery Uptown Cheap 2 Gas
## 5.0 Italian Cafe Uptown Cheap 2 Gas
## 5.0 Mark's Pizzeria Restaurant Downtown Cheap 1 Gas
## 5.0 Pronto Pizza Downtown Cheap 1 Gas
## 5.0 Roma Pizza Downtown Expensive 2 Gas
## 4.5 215 Cucina Napoletana Downtown Cheap 5 Gas
## 4.5 Alia Restaurant Downtown Expensive 7 Gas
## 4.5 Best Of The West Downtown Cheap 7 Gas
## 4.5 Carmines Downtown Cheap 1 Gas
Lombardi’s Di Fara Joe’s Famous John’s of Bleecker Totonno’s (Coney Island) Vinny Vincenz Denino’s Pizzeria & Tavern No. 28 L&B Spumoni Gardens |
Patsy’s (East Harlem) Nick’s Pizza Artichoke Motorino New York Pizza Suprema Keste Franny’s Joe & Pat’s Forcella |
|
|
|
|
Slice/Serious Eats Midtown Lunch Revolution Analytics NBC New York |