##### # Code for analyzing top pizzerias by blog mentions ## Reads the data pizza <- read.csv("http://www.jaredlander.com/wordpress/wordpress-2.9.2/wordpress/wp-content/uploads/2010/04/Pizzeria%20Mentions.csv") ## Clean up the data and create new variables as needed pizza <- pizza[!is.na(pizza$Distance),] # elimintae big chains pizza$Color <- 1:21 pizza$LogDist <- log(pizza$Distance + .01) # +.01 to account for 0's pizza$Rank <- rank(pizza$Times) pizza$DistAdj <- (pizza$Distance - mean(pizza$Distance))/sd(pizza$Distance) ## Fit Poisson Regression pizzaPoi <- glm(Times ~ Distance*PlainPrice, family = "poisson",data = pizza) ## Test for overdispersion z <- (pizza$Times-pizzaPoi$fitted)/sqrt(pizzaPoi$fitted) sum(z^2)/(nrow(pizza)-length(pizzaPoi$coef)) pchisq(sum(z^2),nrow(pizza)-length(pizzaPoi$coef)) ## Fit Overdispersed Poisson Regression pizzaPoi2 <- glm(Times ~ Distance*PlainPrice, family = "quasipoisson",data = pizza) ## Histogram for Times Mentioned (hist(pizza$Times,main="Pizzeria Mentions",xlab="",ylab="") ## Histogram for fitted data hist(pizzaPoi$fitted,main="Pizzeria Mentions Predicted Values",xlab="",ylab="") computeHartigan <- function(dataSet,columns) { harts <- rep(0,11) withinSSs <- rep(0,11) for(i in 2:12) { KmeansActual <- kmeans(scale(dataSet[,columns]),centers=i,iter.max=15000,nstart=15) KmeansPlus1 <- kmeans(scale(dataSet[,columns]),centers=i+1,iter.max=15000,nstart=15) harts[i-1] <- ((sum(KmeansActual$withinss)/sum(KmeansPlus1$withinss)) - 1)*(nrow(dataSet)-i-1) withinSSs[i-1] <- sum(KmeansActual$withinss) cat("K=",i,":", harts[i-1] ,"\n","SS:",i,":", withinSSs[i-1] ,"\n" ) } plot(2:12,harts,main="Hartigan's Rule",xlab="Number of Clusters",ylab="Hartigan Number",type="l") } computeHartigan(pizza,c(4,5,6)) # 2 or 3 clusters #Distance computeHartigan(pizza,c(4,8,6)) # 2 or 3 clusters # standardized distance computeHartigan(pizza,c(4,10,6)) # 2 or 3 clusters # log distance pizza2Means1 <- kmeans(pizza[,4:6],2) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pizza2Means1$cluster,ylab="Times Mentioned",xlab="Distance from NY",main="k-Means Clustering",sub="2 Centers") ################### These two were used pizza3Means1 <- kmeans(pizza[,4:6],3) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pizza3Means1$cluster,ylab="Times Mentioned",xlab="Distance from NY",main="k-Means Clustering",sub="3 Centers") pizza4Means1 <- kmeans(pizza[,4:6],4) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pizza4Means1$cluster,ylab="Times Mentioned",xlab="Distance from NY",main="k-Means Clustering",sub="4 Centers") ################### These two were used pizza2Means2 <- kmeans(pizza[,c(4,8,6)],2) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pizza2Means2$cluster,ylab="Times Mentioned",xlab="Distance from NY") pizza3Means2 <- kmeans(pizza[,c(4,8,6)],3) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pizza3Means2$cluster,ylab="Times Mentioned",xlab="Distance from NY") pizza2Means3 <- kmeans(pizza[,c(4,10,6)],2) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pizza2Means3$cluster,ylab="Times Mentioned",xlab="Distance from NY") pizza3Means3 <- kmeans(pizza[,c(4,10,6)],3) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pizza3Means3$cluster,ylab="Times Mentioned",xlab="Distance from NY") pam2 <- pam(pizza[,c(4,5,6)],k=2) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pam2$cluster,ylab="Times Mentioned",xlab="Distance from NY",main="Partitioning Around Medoids",sub="2 Medoids") ################ This one was used pam3 <- pam(pizza[,c(4,5,6)],k=3) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pam3$cluster,ylab="Times Mentioned",xlab="Distance from NY",main="Partitioning Around Medoids",sub="3 Medoids") ################ This one was used pam4 <- pam(pizza[,c(4,5,6)],k=4) plot(pizza$Distance,pizza$Times,pch=16,cex=pizza$Rank/5,col=pam4$cluster,ylab="Times Mentioned",xlab="Distance from NY",main="Partitioning Around Medoids",sub="4 Medoids") plot(pam3,main="PAM Clustering") plot(pam4,main="PAM Clustering")