Introduction: This is an exploratory data analysis of the Beers.csv and Breweries.csv datasets, prepared for the CEO and CFO of Budweiser.
We begin with an initial inspection of the data using the methods below.
# Datasets are loaded from local paths below; keep the CSV files alongside this RMD file and adjust the paths as needed
Beers = read.csv("/Users/mingyang/Desktop/SMU/DoingDS_Fall2020/MSDS6306-Case-Study1/Beers.csv",header = TRUE) #loading beers dataset
Breweries = read.csv("/Users/mingyang/Desktop/SMU/DoingDS_Fall2020/MSDS6306-Case-Study1/Breweries.csv",header = TRUE) #loading breweries dataset
#Below this line is exploratory scratch work for initial inspection
#summary(Beers)
#str(Beers)
##Beers$IBU
#summary(Breweries)
#str(Breweries)
#Above this line is exploratory scratch work and can be deleted later
# Trim whitespace from the State column
Breweries$State = trimws(Breweries$State)
#Turn Breweries State column into a factor
Breweries$State = as.factor(Breweries$State)
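# Quick sanity check (an optional sketch): 50 states plus DC should yield 51 levels
nlevels(Breweries$State)  # expect 51, matching the 51 rows in the state counts below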
library(tidyverse)
library(ggplot2)
num_Breweries_by_state = Breweries %>% group_by(State) %>%
summarise(count=n())
## `summarise()` ungrouping output (override with `.groups` argument)
num_Breweries_by_state
## # A tibble: 51 x 2
## State count
## <fct> <int>
## 1 AK 7
## 2 AL 3
## 3 AR 2
## 4 AZ 11
## 5 CA 39
## 6 CO 47
## 7 CT 8
## 8 DC 1
## 9 DE 2
## 10 FL 15
## # … with 41 more rows
#The table above lists the number of breweries per state; a bar chart makes the comparison easier to see
ggplot(data=num_Breweries_by_state)+
geom_bar(mapping=aes(x=State,y=count,fill=State),stat="identity") +
coord_flip()+
ggtitle("Breweries count by state")+xlab("State")+ylab("Count of Breweries")

#Replot with states ordered by descending brewery count
ggplot(data=num_Breweries_by_state)+
geom_bar(mapping=aes(x=reorder(State,-count),y=count,fill=State),stat="identity") +
coord_flip()+
ggtitle("Breweries count by state")+xlab("State")+ylab("Count of Breweries")

Beers = Beers %>% rename(Brew_ID= Brewery_id)
Beers.with.Breweries = left_join(Beers,Breweries, by = "Brew_ID")
Beers.with.Breweries = Beers.with.Breweries %>% rename(Beer_Name= Name.x)
Beers.with.Breweries = Beers.with.Breweries %>% rename(Brew_Name= Name.y)
head(Beers.with.Breweries,6)
## Beer_Name Beer_ID ABV IBU Brew_ID Style Ounces
## 1 Pub Beer 1436 0.050 NA 409 American Pale Lager 12
## 2 Devil's Cup 2265 0.066 NA 178 American Pale Ale (APA) 12
## 3 Rise of the Phoenix 2264 0.071 NA 178 American IPA 12
## 4 Sinister 2263 0.090 NA 178 American Double / Imperial IPA 12
## 5 Sex and Candy 2262 0.075 NA 178 American IPA 12
## 6 Black Exodus 2261 0.077 NA 178 Oatmeal Stout 12
## Brew_Name City State
## 1 10 Barrel Brewing Company Bend OR
## 2 18th Street Brewery Gary IN
## 3 18th Street Brewery Gary IN
## 4 18th Street Brewery Gary IN
## 5 18th Street Brewery Gary IN
## 6 18th Street Brewery Gary IN
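# Optional sanity check on the merge (a sketch using dplyr's anti_join): return
# beers whose Brew_ID has no match in Breweries; zero rows means every beer joined
nrow(anti_join(Beers, Breweries, by = "Brew_ID"))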
summary(Beers.with.Breweries)
## Beer_Name Beer_ID ABV IBU Brew_ID
## Length:2410 Min. : 1.0 Min. :0.00100 Min. : 4.00 Min. : 1.0
## Class :character 1st Qu.: 808.2 1st Qu.:0.05000 1st Qu.: 21.00 1st Qu.: 94.0
## Mode :character Median :1453.5 Median :0.05600 Median : 35.00 Median :206.0
## Mean :1431.1 Mean :0.05977 Mean : 42.71 Mean :232.7
## 3rd Qu.:2075.8 3rd Qu.:0.06700 3rd Qu.: 64.00 3rd Qu.:367.0
## Max. :2692.0 Max. :0.12800 Max. :138.00 Max. :558.0
## NA's :62 NA's :1005
## Style Ounces Brew_Name City State
## Length:2410 Min. : 8.40 Length:2410 Length:2410 CO : 265
## Class :character 1st Qu.:12.00 Class :character Class :character CA : 183
## Mode :character Median :12.00 Mode :character Mode :character MI : 162
## Mean :13.59 IN : 139
## 3rd Qu.:16.00 TX : 130
## Max. :32.00 OR : 125
## (Other):1406
library(mice) #Load mice library to analyze the pattern of missing data
md.pattern(Beers.with.Breweries)

## Beer_Name Beer_ID Brew_ID Style Ounces Brew_Name City State ABV IBU
## 1405 1 1 1 1 1 1 1 1 1 1 0
## 943 1 1 1 1 1 1 1 1 1 0 1
## 62 1 1 1 1 1 1 1 1 0 0 2
## 0 0 0 0 0 0 0 0 62 1005 1067
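# Quantify the missingness before deciding how to handle it (a sketch; the shares
# follow from the pattern table above: 62/2410 ABV and 1005/2410 IBU missing)
colMeans(is.na(Beers.with.Breweries[, c("ABV", "IBU")]))  # ~2.6% ABV, ~42% IBU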
# Since a large amount of data is missing in the IBU column (1005 of 2410 rows),
# try to impute the missing values with the predictive mean matching (PMM) method
tempData <- mice(Beers.with.Breweries,m=5,maxit=50,meth='pmm',seed=20)
##
## iter imp variable
## 1 1 ABV IBU
## 1 2 ABV IBU
## 1 3 ABV IBU
## 1 4 ABV IBU
## 1 5 ABV IBU
## ... (mice iteration log truncated: the line "ABV IBU" repeats for imputations 1-5 of all 50 iterations)
#summary(tempData)
# Extract a completed dataset with the imputed values filled in (first of the five imputations)
completedData <- complete(tempData,1)
#head(completedData)
# Density plot comparing the observed and imputed distributions
densityplot(tempData)

#Note: idea used above to impute data is from link below:
#https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
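#Another imputation diagnostic from the mice package (an optional sketch, as in
#the tutorial above): stripplot overlays the imputed points on the observed
#values for each of the five imputed datasets
stripplot(tempData, pch = 20, cex = 1.2)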
#Compute and display Median of ABV and IBU by state:
median = completedData %>% group_by(State) %>%
summarize(median_ABV=median(ABV),median_IBU=median(IBU))
## `summarise()` ungrouping output (override with `.groups` argument)
median
## # A tibble: 51 x 3
## State median_ABV median_IBU
## <fct> <dbl> <dbl>
## 1 AK 0.056 35
## 2 AL 0.06 39.5
## 3 AR 0.052 72
## 4 AZ 0.055 28
## 5 CA 0.058 36
## 6 CO 0.061 35
## 7 CT 0.06 32
## 8 DC 0.0625 75
## 9 DE 0.0705 52
## 10 FL 0.0555 36.5
## # … with 41 more rows
#Draw bar charts to compare
#First, plot median alcohol content by state using the imputed dataset
median %>% ggplot()+
geom_bar(mapping=aes(x=reorder(State,-median_ABV),y=median_ABV,fill=State),stat="identity") +
coord_flip()+
ggtitle("Median Alcohol content by State on modified dataset")+xlab("State")+ylab("Alcohol Content Percentage")

#For comparison, plot median alcohol content using the original dataset; na.rm=TRUE drops the missing values rather than imputing them
Beers.with.Breweries %>% group_by(State) %>%
summarize(median_ABV=median(ABV,na.rm=TRUE),median_IBU=median(IBU,na.rm=TRUE))%>% ggplot()+
geom_bar(mapping=aes(x=reorder(State,-median_ABV),y=median_ABV,fill=State),stat="identity")+
coord_flip()+
ggtitle("Median Alcohol content by State on non-Modified dataset")+xlab("State")+ylab("Alcohol Content Percentage")
## `summarise()` ungrouping output (override with `.groups` argument)

#Plot the median International Bitterness Units (IBU) by state using the imputed dataset
median %>% ggplot()+
geom_bar(mapping=aes(x=reorder(State,-median_IBU),y=median_IBU,fill=State),stat="identity") +
coord_flip()+
ggtitle("Median International Bitterness Unit by State on modified dataset")+xlab("State")+ylab("International Bitterness Unit")

#For comparison, plot median bitterness (IBU) using the original dataset; na.rm=TRUE drops the missing values rather than imputing them
Beers.with.Breweries %>% group_by(State) %>%
summarize(median_ABV=median(ABV,na.rm=TRUE),median_IBU=median(IBU,na.rm=TRUE))%>% ggplot()+
geom_bar(mapping=aes(x=reorder(State,-median_IBU),y=median_IBU,fill=State),stat="identity") +
coord_flip()+
ggtitle("Median International Bitterness Unit by State on non-modified dataset")+xlab("State")+ylab("International Bitterness Unit")
## `summarise()` ungrouping output (override with `.groups` argument)

# Find the state with the highest-ABV beer (imputed dataset)
head(completedData %>%
arrange(desc(ABV)) %>%
select(State,ABV,Beer_Name))
## State ABV Beer_Name
## 1 CO 0.128 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale
## 2 KY 0.125 London Balling
## 3 IN 0.120 Csar
## 4 CO 0.104 Lee Hill Series Vol. 4 - Manhattan Style Rye Ale
## 5 NY 0.100 4Beans
## 6 CA 0.099 Lower De Boom
# Repeat with the original (non-imputed) dataset
head(Beers.with.Breweries %>%
arrange(desc(ABV)) %>%
select(State,ABV,Beer_Name))
## State ABV Beer_Name
## 1 CO 0.128 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale
## 2 KY 0.125 London Balling
## 3 IN 0.120 Csar
## 4 CO 0.104 Lee Hill Series Vol. 4 - Manhattan Style Rye Ale
## 5 NY 0.100 4Beans
## 6 CA 0.099 Lower De Boom
# Find the state with the most bitter beer (highest IBU, imputed dataset)
head(completedData %>%
arrange(desc(IBU)) %>%
select(State,IBU,Beer_Name))
## State IBU Beer_Name
## 1 OR 138 Bitter Bitch Imperial IPA
## 2 MI 138 Zaison
## 3 MI 138 Lemon Shandy Tripel
## 4 VA 135 Troopers Alley IPA
## 5 MI 130 Escoffier Bretta Ale
## 6 MA 130 Dead-Eye DIPA
# Repeat with the original (non-imputed) dataset; note that imputed IBU values change the top rankings above
head(Beers.with.Breweries %>%
arrange(desc(IBU)) %>%
select(State,IBU,Beer_Name))
## State IBU Beer_Name
## 1 OR 138 Bitter Bitch Imperial IPA
## 2 VA 135 Troopers Alley IPA
## 3 MA 130 Dead-Eye DIPA
## 4 OH 126 Bay of Bengal Double IPA (2014)
## 5 MN 120 Abrasive Ale
## 6 VT 120 Heady Topper
#First, plot IBU against ABV on the imputed dataset
completedData %>%
ggplot(aes(x=ABV, y=IBU)) +
geom_point(size=2, shape=23, color = "dark green",position="jitter") +
geom_smooth(method=lm) +
ggtitle("IBU vs. ABV")+xlab("Alcohol by volume")+ylab("International Bitterness Unit")
## `geom_smooth()` using formula 'y ~ x'

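#Quantify the relationship seen in the plots (a sketch): Pearson correlation on
#the imputed data, and on the original data with NAs dropped via use="complete.obs"
cor(completedData$ABV, completedData$IBU)
cor(Beers.with.Breweries$ABV, Beers.with.Breweries$IBU, use = "complete.obs")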
#Next, the same plot on the original (non-imputed) dataset
Beers.with.Breweries %>% select(ABV, IBU, State) %>%
ggplot(aes(x=ABV, y=IBU)) +
geom_point(size=2, shape=23, color = "dark green",position="jitter") +
geom_smooth(method=lm) +
ggtitle("IBU vs. ABV")+xlab("Alcohol by volume")+ylab("International Bitterness Unit")
## `geom_smooth()` using formula 'y ~ x'

#To investigate how IPAs differ from other ales with respect to IBU and ABV, first extract all ales
#Match any beer name containing "Ale" or "IPA" (ignoring case)
all_ales = completedData %>% filter(str_detect(Beer_Name,regex("Ale|IPA",ignore.case=TRUE)))
india_pale_ales = all_ales %>%
filter(str_detect(Beer_Name,regex("India Pale Ale|IPA",ignore.case=TRUE)))
other_ales = all_ales %>%
filter(!str_detect(Beer_Name,regex("India Pale Ale|IPA",ignore.case=TRUE)))
# To use the KNN model effectively, standardize ABV and IBU with scale() so both variables contribute roughly equally to the distance metric
all_ales$ABV = scale(all_ales$ABV)
all_ales$IBU = scale(all_ales$IBU)
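# Note: scale() returns a one-column matrix, so ABV and IBU are now matrix
# columns inside the data frame; knn() and ggplot() handle this, but wrapping
# the calls in as.numeric() would keep them plain vectors, e.g. (a sketch):
# all_ales$ABV = as.numeric(scale(all_ales$ABV))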
# Label each ale as "India Pale Ale" or "Other Ale" so KNN can test whether IBU and ABV distinguish IPAs from other ales
all_ales = all_ales %>%
mutate(India.Pale.Ale.Or.Else = ifelse(str_detect(Beer_Name,regex("India Pale Ale|IPA",ignore.case=TRUE)),"India Pale Ale","Other Ale"))
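#Check the class balance before modeling (a sketch); a heavy imbalance would make
#accuracy alone a misleading metric
table(all_ales$India.Pale.Ale.Or.Else)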
#all_ales$India.Pale.Ale.Or.Else
#Plot scatter plot on scaled data before running with KNN
all_ales %>%
ggplot(aes(x=ABV, y=IBU, fill=India.Pale.Ale.Or.Else)) +
geom_point(size=2, shape=23) +
geom_smooth(method=lm) +
ggtitle("Plot scaled alcohol content and bitterness (IPA and others)")+xlab("Alcohol by volume")+ylab("International Bitterness Unit")
## `geom_smooth()` using formula 'y ~ x'

# Run KNN to see how well ABV and IBU distinguish IPAs from other ales
library(class)
library(caret)
library(e1071)
all_ales$India.Pale.Ale.Or.Else = as.factor(all_ales$India.Pale.Ale.Or.Else)
#Set a seed to keep results reproducible (set.seed(100) was tried initially); start with k=5
set.seed(101)
splitPerc = 0.8
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# try k=5
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = 5)
table(classifications,test$India.Pale.Ale.Or.Else)
##
## classifications India Pale Ale Other Ale
## India Pale Ale 48 15
## Other Ale 26 102
cm = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
cm
## Confusion Matrix and Statistics
##
##
## classifications India Pale Ale Other Ale
## India Pale Ale 48 15
## Other Ale 26 102
##
## Accuracy : 0.7853
## 95% CI : (0.7203, 0.8413)
## No Information Rate : 0.6126
## P-Value [Acc > NIR] : 2.578e-07
##
## Kappa : 0.5351
##
## Mcnemar's Test P-Value : 0.1183
##
## Sensitivity : 0.6486
## Specificity : 0.8718
## Pos Pred Value : 0.7619
## Neg Pred Value : 0.7969
## Prevalence : 0.3874
## Detection Rate : 0.2513
## Detection Prevalence : 0.3298
## Balanced Accuracy : 0.7602
##
## 'Positive' Class : India Pale Ale
##
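# The reported accuracy can be verified by hand from the confusion matrix above:
# correct predictions divided by total test cases
(48 + 102) / (48 + 15 + 26 + 102)  # = 0.7853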
# explore best possible K value for accuracy
set.seed(101)
iterations = 500
numks = 50
masterAcc = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = i)
table(classifications,test$India.Pale.Ale.Or.Else)
CM = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
plot(seq(1,numks,1),MeanAcc, type = "l",main="Mean accuracy vs. k (number of neighbors used to predict)",
ylab="Mean Accuracy",xlab="k used")

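# Extract the best k from the accuracy grid above (a sketch); which.max returns
# the first (smallest) k in case of ties
which.max(MeanAcc)  # k with the highest mean accuracy
max(MeanAcc)        # the accuracy achieved at that k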
# explore best possible K value for sensitivity
set.seed(101)
iterations = 500
numks = 50
masterSen = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = i)
table(classifications,test$India.Pale.Ale.Or.Else)
CM = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
masterSen[j,i] = CM$byClass[1]
}
}
MeanSen = colMeans(masterSen)
plot(seq(1,numks,1),MeanSen, type = "l",main="Mean sensitivity vs. k (number of neighbors used to predict)",
ylab="Mean Sensitivity",xlab="k used")

#Average the accuracy, sensitivity, and specificity of the KNN model at k=45 (chosen from the plots above)
set.seed(101)
iterations = 100
masterAcc = matrix(nrow = iterations, ncol = 1)
masterSen = matrix(nrow = iterations, ncol = 1)
masterSpec = matrix(nrow = iterations, ncol = 1)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = 45)
CM = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
masterAcc[j,1]=CM$overall[1]
masterSen[j,1]=CM$byClass[1]
masterSpec[j,1]=CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.7978534
MeanSen
## [1] 0.7759099
MeanSpec
## [1] 0.812105
# Try Naive Bayes to see if it improves results, with the same 0.8 train/test split:
set.seed(102)
splitPerc = 0.8
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
#NB model
model = naiveBayes(train[,3:4],train$India.Pale.Ale.Or.Else)
CM = confusionMatrix(table(predict(model,test[,3:4]),test$India.Pale.Ale.Or.Else))
CM
## Confusion Matrix and Statistics
##
##
## India Pale Ale Other Ale
## India Pale Ale 57 18
## Other Ale 21 95
##
## Accuracy : 0.7958
## 95% CI : (0.7316, 0.8506)
## No Information Rate : 0.5916
## P-Value [Acc > NIR] : 1.65e-09
##
## Kappa : 0.5749
##
## Mcnemar's Test P-Value : 0.7488
##
## Sensitivity : 0.7308
## Specificity : 0.8407
## Pos Pred Value : 0.7600
## Neg Pred Value : 0.8190
## Prevalence : 0.4084
## Detection Rate : 0.2984
## Detection Prevalence : 0.3927
## Balanced Accuracy : 0.7857
##
## 'Positive' Class : India Pale Ale
##
# Average the accuracy, sensitivity, and specificity of the NB model over 100 random train/test splits
set.seed(101)
splitPerc = .8
iterations = 100
masterAcc = matrix(nrow = iterations, ncol = 1)
masterSen = matrix(nrow = iterations, ncol = 1)
masterSpec = matrix(nrow = iterations, ncol = 1)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
#NB model
model = naiveBayes(train[,3:4],train$India.Pale.Ale.Or.Else)
CM = confusionMatrix(table(predict(model,test[,3:4]),test$India.Pale.Ale.Or.Else))
masterAcc[j,1]=CM$overall[1]
masterSen[j,1]=CM$byClass[1]
masterSpec[j,1]=CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.7837173
MeanSen
## [1] 0.6808075
MeanSpec
## [1] 0.8485147
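# Side-by-side summary of the two models (a sketch), using the averages printed
# above; exact values will shift slightly with different seeds
data.frame(Model = c("KNN (k=45)", "Naive Bayes"),
           Accuracy = c(0.798, 0.784),
           Sensitivity = c(0.776, 0.681),
           Specificity = c(0.812, 0.849))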
#import map library
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Import US Cities location data
uscities <- read.csv("/Users/mingyang/Desktop/SMU/DoingDS_Fall2020/MSDS6306-Case-Study1/uscities.csv",header = TRUE)
uscities = uscities%>%rename(City = city)
uscities = uscities%>% group_by(City) %>% filter(row_number()==1)
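# Caveat: keeping the first row per city name collapses same-named cities in
# different states (e.g. multiple Springfields). A safer dedup would key on city
# and state together, e.g. (a sketch, assuming the state_name column used below):
# uscities = uscities %>% distinct(City, state_name, .keep_all = TRUE)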
#Loading in State Coordinates
state_coord <- read.csv("/Users/mingyang/Desktop/SMU/DoingDS_Fall2020/MSDS6306-Case-Study1/states_coord.csv",header = TRUE)
# Extract all ales (any beer with "Ale" in the name, ignoring case)
allAles2 = completedData %>% filter(str_detect(Beer_Name,regex("Ale",ignore.case=TRUE)))
dataWithMap = left_join(allAles2,uscities, by = "City")
#str(dataWithMap)
#dataWithMap %>% filter((is.na(lng))|(is.na(lat))) %>% select(Brew_Name,City)
dataWithMap2 = dataWithMap %>% select(Beer_Name,City,lat,lng,state_name)
#head(dataWithMap2,100)
dataWithMap3 = dataWithMap2 %>% group_by(City) %>% mutate(count = n())
dataWithMap3 = dataWithMap3 %>% group_by(City)%>%filter(row_number()==1)%>%
filter((!is.na(lng))&(!is.na(lat)))
states <- map_data("state")
p <- ggplot() +
geom_polygon(data = states, aes(x = long, y = lat, group = group), fill = "yellow", color = "black") +
coord_quickmap()
p <-p + geom_point(data = dataWithMap3, aes(x = lng, y = lat, size=count,alpha=count),color="blue")+
geom_text(data = state_coord, aes(x = longitude, y = latitude, label = state))+
ggtitle("Density of Ales count by City") + xlab("Longitude")+ylab("Latitute")
ggplotly(p)
# Getting all Beer available
dataWithMap = left_join(completedData,uscities, by = "City")
#str(dataWithMap)
#dataWithMap %>% filter((is.na(lng))|(is.na(lat))) %>% select(Brew_Name,City)
dataWithMap2 = dataWithMap %>% select(Beer_Name,City,lat,lng,state_name)
#head(dataWithMap2,100)
dataWithMap3 = dataWithMap2 %>% group_by(City) %>% mutate(count = n())
dataWithMap3 = dataWithMap3 %>% group_by(City)%>%filter(row_number()==1)%>%
filter((!is.na(lng))&(!is.na(lat)))
states <- map_data("state")
p <- ggplot() +
geom_polygon(data = states, aes(x = long, y = lat, group = group), fill = "yellow", color = "black") +
coord_quickmap()
p <-p + geom_point(data = dataWithMap3, aes(x = lng, y = lat, size=count,alpha=count),color="blue")+
geom_text(data = state_coord, aes(x = longitude, y = latitude, label = state))+
ggtitle("Density of all Beer count by City") + xlab("Longitude")+ylab("Latitute")
ggplotly(p)
library(ggplot2)
library(dplyr)
library(mapproj)
library(plyr)
## ---------------------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ---------------------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:maps':
##
## ozone
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
## The following object is masked from 'package:purrr':
##
## compact
lookup = data.frame(abb = state.abb, State = state.name) #makes a data frame with State name and abbreviation.
#lookup
colnames(completedData)[10] = "abb"
completedData$abb = as.character(completedData$abb)
Breweries2 = merge(completedData,lookup, by.x = "abb", by.y="abb", all.x=TRUE) # make one dataset with state names and abb
#Breweries2
Breweries2$ABV <- as.numeric(Breweries2$ABV)
BreweriesMapData <- Breweries2 %>% select(ABV, State) %>% group_by(State)
#BreweriesMapData
BreweriesMapData <- aggregate(. ~ State, BreweriesMapData, mean)
BreweriesMapData$region <- tolower(BreweriesMapData$State)
BreweriesMapData = BreweriesMapData[-1]
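# Equivalent dplyr formulation of the aggregate call above (a sketch; both
# compute the mean ABV per state):
# BreweriesMapData <- Breweries2 %>% group_by(State) %>% summarize(ABV = mean(ABV))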
#BreweriesMapData
states <- map_data("state")
#states
map.df <- full_join(states,BreweriesMapData, by="region", all.x=T)
#map.df
map.df <- map.df[order(map.df$order),]
h <- ggplot(map.df, aes(x=long,y=lat))+
geom_polygon(aes(fill=ABV))+
geom_path(aes(group=group))+
geom_text(data = state_coord, aes(x = longitude, y = latitude, label = state))+
scale_fill_gradientn(colours=rev(heat.colors(5)),na.value="grey90")+ggtitle("Average ABV By State")+
coord_map()
ggplotly(h)
We received datasets containing beers and breweries, and we set out to discover any relationship between alcohol by volume (ABV) and International Bitterness Units (IBU). The scatter plot shows a positive but weak linear relationship between ABV and IBU. We used this relationship to classify ales as India Pale Ales or other types of ales. The accuracy of our KNN model is good at about 80%: it correctly predicts India Pale Ales 78% of the time and other types of ales 81% of the time. The Naive Bayes model gives similar results, except that it correctly predicts India Pale Ales only 68% of the time. Depending on what the customer is looking for, however, the Naive Bayes model can be more efficient.
We produced density plots of all beers brewed at facilities near each city, and we can drill down to facilities producing any particular kind of beer, such as ales. This is a useful asset-management tool: it gives an overview of where each kind of beer is most prevalent across the country, and it points to areas worth digging into further. Is the pattern driven by cost (sales data needed), resources (agriculture data needed), or popularity (more research needed)?
Finally, commenting on the summary statistics and distribution of the ABV variable: ABV ranges from 0.001 to 0.128, with a median of 0.056 and a mean of about 0.060. Half of all beers fall between 0.050 and 0.067 (the first and third quartiles), and the mean sitting above the median indicates a right-skewed distribution with a tail of high-alcohol beers.
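A quick sketch of that distribution (assuming the merged Beers.with.Breweries data frame built above; geom_histogram drops the 62 missing ABV values with a warning):
ggplot(Beers.with.Breweries, aes(x = ABV)) +
geom_histogram(binwidth = 0.005, fill = "steelblue", color = "black") +
ggtitle("Distribution of ABV") + xlab("Alcohol by Volume") + ylab("Count of Beers")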