Exploratory Analysis

Restaurant concentration

Here is an interactive plot which shows restaurant concentration for individual cities.

# Graph 3: interactive map of restaurant concentration (with legend)

# Count restaurants per (Latitude, Longitude, City, Price_range) combination
# and keep only the combinations with more than 20 restaurants.
dfForGraph <- dfFOrLeaflet %>%
  group_by(Latitude, Longitude, City, Price_range) %>%
  summarise(count = n()) %>%
  # summarise() only drops the last grouping level; ungroup so the data handed
  # to leaflet carries no leftover grouping.
  ungroup() %>%
  filter(count > 20)

# Define color palette: continuous "RdBu" ramp over the observed counts
# ("YlGnBu" was the alternative noted in the original code).
pal <- colorNumeric(
  palette = "RdBu",
  domain = dfForGraph$count
)

# Create map: one circle per row, coloured by count, with a radius
# proportional to sqrt(count); clicking a circle shows the city name.
map <- leaflet(dfForGraph) %>%
  addTiles() %>%
  setView(lng = -111.92556, lat = 33.56518, zoom = 12) %>%
  addCircles(
    lng = ~Longitude, lat = ~Latitude, weight = 1, color = ~pal(count),
    radius = ~sqrt(count) * 30, popup = ~City
  )

# Add legend. `~count` is resolved against the map's own data (dfForGraph)
# rather than reaching back into the global object.
map %>%
  addLegend(
    "bottomright",
    pal = pal, values = ~count,
    title = "Count of Restaurants",
    opacity = 1
  )

State-wise distribution of restaurants

# Full names for the state / province / region codes used as x-axis labels.
# Keeping each code next to its label in one named vector prevents the breaks
# and labels from drifting out of alignment.
state_labels <- c(
  ON = "Ontario", AZ = "Arizona", NV = "Nevada", QC = "Quebec", OH = "Ohio",
  NC = "North Carolina", PA = "Pennsylvania", AB = "Alberta",
  WI = "Wisconsin", IL = "Illinois", AR = "Arkansas", AL = "Alabama",
  BC = "British Columbia", CA = "California", CO = "Colorado",
  FL = "Florida", HI = "Hawaii", HPL = "Hartlepool", MB = "Manitoba",
  NE = "Nebraska", NY = "New York", OR = "Oregon", SC = "South Carolina",
  TX = "Texas", VA = "Virginia", VT = "Vermont", WA = "Washington",
  XWY = "Leeds"
)

# Lollipop chart: number of restaurants per state, in descending order.
# table() yields one row per state code (Var1) with its frequency (Freq).
ggplot(
  data = as.data.frame(table(final_df$state)),
  aes(x = reorder(Var1, -Freq), y = Freq, label = Freq)
) +
  # Stem of each lollipop, from 0 up to the state's count (x is inherited).
  geom_segment(aes(xend = Var1, y = 0, yend = Freq), color = "black") +
  # Head of the lollipop with the count printed inside it.
  geom_point(size = 13, color = "maroon") +
  geom_text(color = "white", size = 4) +
  labs(
    x = "States", y = "No. of Restaurants",
    title = "States and No. of Restaurants"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_x_discrete(
    breaks = names(state_labels),
    labels = unname(state_labels)
  )

The top two states by number of restaurant establishments are Ontario and Arizona; they were therefore selected for the implementation.

Restaurant price per person distribution across states

# Keep only the two columns this chart needs. They are selected by name rather
# than by position so a change in final_df's column order cannot silently pick
# the wrong columns.
temp_final <- final_df[c("state", "Price_range")]

# Collapse the raw price range into numeric buckets matching the axis labels
# used below: 0 = "Not Available", 1 = "Under $10", 2 = "$11-60" (raw 2 and 3),
# 3 = "Above $60" (raw 4). Conditions are tested in order, first match wins;
# anything unmatched becomes NA.
temp_final$Price_range <- case_when(
  grepl("1", temp_final$Price_range) ~ 1,
  grepl("2", temp_final$Price_range) ~ 2,
  grepl("3", temp_final$Price_range) ~ 2,
  grepl("4", temp_final$Price_range) ~ 3,
  grepl("Not Available", temp_final$Price_range) ~ 0,
  TRUE ~ NA_real_
)

# Drop rows without a state and group by state and price bucket.
data_group_state_price <- temp_final %>%
  filter(state != "") %>%
  group_by(state, Price_range)

# Per state and bucket: restaurant count, the state's total, and the bucket's
# share of that total in percent (one decimal).
data_weighted_group_state_price <- data_group_state_price %>%
  summarise(total_res_price_range = n()) %>%
  arrange(desc(Price_range)) %>%
  # Still grouped by state here, so sum() is the per-state total.
  mutate(
    total_res = sum(total_res_price_range),
    percent = round((total_res_price_range / total_res) * 100, 1),
    Price_range = as.integer(Price_range)
  ) %>%
  ungroup()

# Full names for the state / province / region codes used as x-axis labels,
# kept as one named vector so codes and labels stay aligned.
state_labels <- c(
  ON = "Ontario", AZ = "Arizona", NV = "Nevada", QC = "Quebec", OH = "Ohio",
  NC = "North Carolina", PA = "Pennsylvania", AB = "Alberta",
  WI = "Wisconsin", IL = "Illinois", AR = "Arkansas", AL = "Alabama",
  BC = "British Columbia", CA = "California", CO = "Colorado",
  FL = "Florida", HI = "Hawaii", HPL = "Hartlepool", MB = "Manitoba",
  NE = "Nebraska", NY = "New York", OR = "Oregon", SC = "South Carolina",
  TX = "Texas", VA = "Virginia", VT = "Vermont", WA = "Washington",
  XWY = "Leeds"
)

price_bucket_breaks <- c(0, 1, 2, 3)
price_bucket_labels <- c("Not Available", "Under $10", "$11-60", "Above $60")

# Bubble chart: one bubble per (state, price bucket), sized by and labelled
# with the percentage; states are ordered by their total restaurant count.
ggplot(
  data_weighted_group_state_price,
  aes(x = reorder(state, total_res), y = Price_range, label = percent)
) +
  geom_point(aes(size = percent, color = Price_range), alpha = 1) +
  geom_text(hjust = 0.4, size = 4) +
  scale_size(range = c(1, 12), guide = "none") +
  labs(
    title = "Restaurants Price per person by State ",
    subtitle = "(All numbers are in percentage)",
    x = "States (ascending order in terms of number of restaurants)",
    y = "Price per Person"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_color_gradient(
    name = "Price per Person",
    breaks = price_bucket_breaks,
    labels = price_bucket_labels,
    guide = "colorbar", low = "yellow", high = "#FF2000"
  ) +
  scale_y_continuous(
    breaks = price_bucket_breaks,
    labels = price_bucket_labels
  ) +
  scale_x_discrete(
    breaks = names(state_labels),
    labels = unname(state_labels)
  )

The six states with the cheapest restaurants are Arkansas, Florida, Hawaii, Oregon, Virginia and Colorado, while the state with the most expensive restaurants is Manitoba.

Word cloud (Restaurant type)

# Word cloud of restaurant sub-categories: only terms occurring at least 25
# times are drawn, most frequent first, coloured at random from an 8-colour
# "Dark2" palette, with 15% of the words rotated.
pal2 <- brewer.pal(n = 8, name = "Dark2")
wordcloud(
  words = final_df$Sub_category,
  scale = c(4, 0.9),
  min.freq = 25,
  random.order = FALSE,
  random.color = TRUE,
  rot.per = 0.15,
  colors = pal2
)

American is the most common restaurant category in the entire Yelp dataset (at its current size).

Rating-wise restaurant distribution (AZ and ON)

# Frequency of each star rating in Arizona and in Ontario
# (Var1 = star rating, Freq = number of restaurants).
temp_az_star <- as.data.frame(table(az_df$stars))
temp_on_star <- as.data.frame(table(on_df$stars))

# Start from the Arizona counts and attach, for each Arizona star level, the
# Ontario count looked up by Var1 (vlookup_df is defined outside this chunk;
# presumably a spreadsheet-style lookup -- confirm).
final_temp <- temp_az_star
final_temp['Freq1'] <- vlookup_df(final_temp$Var1, temp_on_star, result_column= 'Freq', lookup_column= 'Var1')

# Name the two count columns after their states, then reshape to long format:
# one row per (star rating, state) with the count in `value`.
final_temp <- melt(
  rename(final_temp, Arizona = Freq, Ontario = Freq1),
  id = 'Var1'
)

# Side-by-side bars per star rating, one bar per state.
ggplot(final_temp, aes(x = Var1, y = value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  labs(
    x = "Restaurant Star ratings",
    y = "No. of Restaurants",
    title = "States- Arizona & Ontario",
    subtitle = "Restaurants star rating distribution"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  scale_fill_discrete(name = "States")

It can be inferred that Ontario has better-rated restaurants than Arizona, although this could also be due to the fact that Ontario has 4000 more restaurant listings in the dataset.

Average star rating for AZ & ON

# Stack the Arizona and Ontario data and keep only the columns this chart
# uses. Columns are selected by name instead of by position (previously
# c(4, 5, 9)) so a schema change cannot silently select the wrong columns.
sum_az_on <- rbind(az_df, on_df)

sum_az_on <- sum_az_on[c("city", "state", "stars")]

# Mean star rating per city within each state. summarise() leaves the result
# grouped by state, so ungroup explicitly before plotting.
az_on_cities_stars <- sum_az_on %>%
  group_by(state, city) %>%
  summarise(avg_star = mean(stars)) %>%
  ungroup()

# Box plot of the per-city average ratings, one box per state.
bx_plt <- ggplot(az_on_cities_stars, aes(x = state, y = avg_star, fill = state)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Average Star Rating of Arizona & Ontario",
    x = "States", y = "Star Ratings"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11)
  ) +
  scale_x_discrete(breaks = c("AZ", "ON"), labels = c("Arizona", "Ontario")) +
  scale_fill_discrete(
    name = "State", breaks = c("AZ", "ON"), labels = c("Arizona", "Ontario")
  )

bx_plt

We can observe that the lowest average rating is 1 star in Arizona and 2 stars in Ontario, while both states have restaurants with the highest average rating of 5 stars. The whiskers of the Ontario box plot also suggest that its average ratings are higher than those of Arizona.

Arizona Top 5 restaurant types and cities

# Frequency of each restaurant sub-category in Arizona, most frequent first.
# as.data.frame(table(...)) already returns Var1 as a factor, so no further
# factor conversion of Var1 is needed.
az_df$Sub_category <- as.factor(az_df$Sub_category)

temp_az_res_type <- as.data.frame(table(az_df$Sub_category))

temp_az_res_type <- temp_az_res_type[order(-temp_az_res_type$Freq), ]

# Horizontal bar chart of the five most frequent sub-categories. The ranking
# column is passed to top_n() explicitly: without `wt` it falls back to the
# last column of the data frame and prints a "Selecting by ..." message.
ggplot(
  data = top_n(temp_az_res_type, 5, Freq),
  aes(x = reorder(Var1, Freq), y = Freq)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    x = "Restaurant Type/Cuisine", y = "Count",
    title = "Arizona State", subtitle = "Top 5 Restaurant Types"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  coord_flip()

It can be inferred that American cuisine is well established in Arizona. Mexican cuisine is the second most common restaurant type, due to Arizona sharing a border with Mexico.

# Number of restaurants per Arizona city, most restaurants first.
num_rest_az <- as.data.frame(table(az_df$city))

num_rest_az <- num_rest_az[order(-num_rest_az$Freq), ]

# Bar chart of the five cities with the most restaurants. The ranking column
# is passed to top_n() explicitly: without `wt` it falls back to the last
# column of the data frame and prints a "Selecting by ..." message.
ggplot(
  data = top_n(num_rest_az, 5, Freq),
  aes(x = reorder(Var1, -Freq), y = Freq)
) +
  geom_bar(stat = "identity", fill = "tomato3") +
  labs(
    x = "City Name", y = "No. of Restaurants",
    title = "Arizona State",
    subtitle = "Top 5 Cities w.r.t. number of Restaurants"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

Ontario Top 5 restaurant types and cities

# Frequency of each restaurant sub-category in Ontario, most frequent first.
# as.data.frame(table(...)) already returns Var1 as a factor, so no further
# factor conversion of Var1 is needed.
on_df$Sub_category <- as.factor(on_df$Sub_category)

temp_on_res_type <- as.data.frame(table(on_df$Sub_category))

temp_on_res_type <- temp_on_res_type[order(-temp_on_res_type$Freq), ]

# Horizontal bar chart of the five most frequent sub-categories. The ranking
# column is passed to top_n() explicitly: without `wt` it falls back to the
# last column of the data frame and prints a "Selecting by ..." message.
ggplot(
  data = top_n(temp_on_res_type, 5, Freq),
  aes(x = reorder(Var1, Freq), y = Freq)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    x = "Restaurant Type/Cuisine", y = "Count",
    title = "Ontario State", subtitle = "Top 5 Restaurant Types"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  coord_flip()

Being on the North American continent, Ontario has the highest number of American cuisine restaurants, followed by pubs, owing to Toronto being a hub for business and higher education that attracts younger people.

# Number of restaurants per Ontario city, most restaurants first.
num_rest_on <- as.data.frame(table(on_df$city))

num_rest_on <- num_rest_on[order(-num_rest_on$Freq), ]

# Bar chart of the five cities with the most restaurants. The ranking column
# is passed to top_n() explicitly: without `wt` it falls back to the last
# column of the data frame and prints a "Selecting by ..." message.
ggplot(
  data = top_n(num_rest_on, 5, Freq),
  aes(x = reorder(Var1, -Freq), y = Freq)
) +
  geom_bar(stat = "identity", fill = "tomato3") +
  labs(
    x = "City Name", y = "No. of Restaurants",
    title = "Ontario State",
    subtitle = "Top 5 Cities w.r.t. number of Restaurants"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

We see that Phoenix and Toronto are the cities with the highest number of restaurants in Arizona and Ontario respectively. Additionally, American is the top restaurant type in both states.

Based on the above bar plots, we investigate the price distribution for those two cities.

City wise price distribution

# Count restaurants per price bucket for one city's Price_range column.
#
# Raw values are bucketed by the digit they contain, tested in order
# (first match wins): "1" -> "Under $10"; "2" or "3" -> "$11-60";
# "4" -> "Above $60"; anything else -> "Not Available".
#
# Returns a tibble with columns Var1 (bucket label) and sum_price (count),
# with rows in the order Under $10, $11-60, Above $60, Not Available
# (buckets that do not occur are absent).
summarise_price_buckets <- function(price_range) {
  counts <- as.data.frame(table(Var1 = price_range), stringsAsFactors = FALSE)

  # Intermediate codes keep the row order of the result stable: grouping
  # sorts "1" < "2" < "4" < "Not Available".
  counts$Var1 <- case_when(
    grepl("1", counts$Var1) ~ "1",
    grepl("[23]", counts$Var1) ~ "2",
    grepl("4", counts$Var1) ~ "4",
    TRUE ~ "Not Available"
  )

  totals <- counts %>%
    group_by(Var1) %>%
    summarise(sum_price = sum(Freq))

  bucket_labels <- c(
    "1" = "Under $10",
    "2" = "$11-60",
    "4" = "Above $60",
    "Not Available" = "Not Available"
  )
  totals$Var1 <- unname(bucket_labels[totals$Var1])
  totals
}

# Restaurants in Phoenix (AZ) and Toronto (ON). %in% never yields NA, so rows
# with a missing city are dropped instead of turning into all-NA rows.
most_exp_ph <- az_df[az_df[["city"]] %in% "Phoenix", ]
most_exp_tn <- on_df[on_df[["city"]] %in% "Toronto", ]

temp_ph <- summarise_price_buckets(most_exp_ph$Price_range)
temp_tn <- summarise_price_buckets(most_exp_tn$Price_range)

### Plotting the pie charts side-by-side ###
# Grid cells are indexed from 0: with a 1 x 2 grid both pies sit in row 0,
# Phoenix in column 0 and Toronto in column 1.
fig_pie <- plot_ly() %>%
  add_pie(
    data = temp_ph, labels = ~Var1, values = ~sum_price,
    name = "Phoenix \nRestaurant",
    domain = list(row = 0, column = 0)
  ) %>%
  add_pie(
    data = temp_tn, labels = ~Var1, values = ~sum_price,
    name = "Toronto \nRestaurant",
    domain = list(row = 0, column = 1)
  ) %>%
  layout(
    title = "Phoenix & Toronto City - Restaurant Price/Person Distribution",
    showlegend = TRUE,
    legend = list(title = list(text = "<b> Price per Person </b>")),
    grid = list(rows = 1, columns = 2),
    # Pies have no meaningful axes; hide grid lines, zero lines and ticks.
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    # City captions positioned in paper coordinates above each pie.
    annotations = list(
      list(
        x = 0.2, y = 0.85, text = "<b>Phoenix</b>", showarrow = FALSE,
        xref = "paper", yref = "paper"
      ),
      list(
        x = 0.8, y = 0.85, text = "<b>Toronto</b>", showarrow = FALSE,
        xref = "paper", yref = "paper"
      )
    )
  )
fig_pie

When comparing the top two cities of these states, we found that Toronto has more restaurants in the ‘$11-60’ price range than Phoenix. This may also indicate that Toronto has a higher-income population.

Dataset

Preprocessing

Selecting business categories

State selection for GUI

Cleanup of city names