#!/usr/bin/env python # coding: utf-8 # # Adara Challenge Analysis # ## 1. Exploratory Data Analysis # ### 1.1 Data Dictionary # # The data contains events from the months of January 2019 and January 2021; the 2019 can be used as a basis of comparison for the activity in 2021. # # The destinations contained in the data are selected top-tier cities in Germany, UK, France, Italy, and Spain # # The table below shows the definition of attributes: # Attributes |Definition # :-----|:--------------- # **event_date** |The date of the search or booking activity. This is different from the departure_checkin_date, which is the actual date of travel activity: if a customer makes a booking today for check-in on 1 October, today’s date would be the event date, and 1 October would be the “departure_checkin_date” # **activity_group** |An indicator of the type of event on the hotel website. Classified as “Searching” (a general search for a room) or “Booking” (an actual reservation) # **reason_for_travel_detailed** |this is the Reason for Travel, categorized as either “Business”, “Leisure-Non-Family” (leisure travel consisting of 1 or 2 people), and “Leisure-Family” (leisure travel consisting of 3 or more people) # **origin_country_code** |2-letter ISO country code indicating the origin country from which the booking or search activity was made # **hotel_city,hotel_state,hotel_country** |raw information specifying the destination in question.
Note that this data is in an _unclean_ state, provided straight from the source system. # **traveler_value_group** |Indicator of traveler value (overall spend + loyalty status), grouped into “Low”, “Medium”, “High” # **departure_checkin_date** |Start date of hotel stay; see _event_date_ to determine how the date information differs. # **total_number_events** |Aggregated count of total number of events # ### 1.2 Data Exploration # # #### **Import Python packages and raw data** # In[1]: #data manipulation import pandas as pd #data visualization import plotly.express as px import plotly.graph_objects as go import plotly.io as pio pio.templates.default = "plotly_white" # interactive data table import qgrid as dt #read raw data as panda dataframe raw_data = pd.read_csv("./sample_data.csv") #Glimpse of imported raw csv data raw_data.head() # #### **Summary Statistics** # In[2]: raw_data.info() raw_data.describe() # #### **Data Wrangling** # - *Transformed event_date and departure_checkin_date from object to date* # - *Added event_wday, event_year, event_mday, checkin_wday, checkin_year, checkin_month attribues for a better understanding of total_number_events according to dates.* # In[3]: raw_data["event_date"] = pd.to_datetime(raw_data["event_date"]) raw_data["event_wday"] = raw_data["event_date"].dt.day_name().str[:3] raw_data["event_year"] = raw_data["event_date"].dt.year raw_data["event_mday"] = raw_data["event_date"].dt.month_name().str[:3].str.cat(raw_data["event_date"].dt.day.astype(str).str.zfill(2), sep =" ") raw_data["departure_checkin_date"] = pd.to_datetime(raw_data["departure_checkin_date"]) raw_data["checkin_wday"] = raw_data["departure_checkin_date"].dt.day_name().str[:3] raw_data["checkin_year"] = raw_data["departure_checkin_date"].dt.year raw_data["checkin_month"] = raw_data["departure_checkin_date"].dt.month_name().str[:3] #check new columns raw_data.head() # - *Convert datatype from object to categorical to save memory* # In[4]: raw_data.info() # In[5]: raw_data["activity_group"] = raw_data["activity_group"].astype("category") raw_data["reason_for_travel_detailed"] = raw_data["reason_for_travel_detailed"].astype("category") raw_data["origin_country_code"] = raw_data["origin_country_code"].astype("category") raw_data["hotel_city"] = raw_data["hotel_city"].astype("category") raw_data["hotel_state"] = raw_data["hotel_state"].astype("category") raw_data["hotel_country"] = raw_data["hotel_country"].astype("category") raw_data["event_year"] = raw_data["event_year"].astype("category") raw_data["event_mday"] = raw_data["event_mday"].astype("category") raw_data["checkin_year"] = raw_data["checkin_year"].astype("category") # - *Transformed traveler_value_group into ordinal based on ascending order of Low, Med, and High.* # In[6]: raw_data["traveler_value_group"] = raw_data["traveler_value_group"].astype("category") raw_data["traveler_value_group"].value_counts(dropna=False) # In[7]: raw_data["traveler_value_group"] = raw_data["traveler_value_group"].cat.set_categories( new_categories=["Low", "Med", "High"], ordered=True ) #check if the values are the same raw_data["traveler_value_group"].value_counts(dropna=False) # In[8]: #check order raw_data["traveler_value_group"].head() # - *Transformed event_wday, checkin_wday, and checkin_month into ordinal based on calendar weekday and month* # In[9]: raw_data["event_wday"] = raw_data["event_wday"].astype("category") raw_data["event_wday"].value_counts(dropna=False) # In[10]: raw_data["event_wday"] = raw_data["event_wday"].cat.set_categories( new_categories=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"], ordered=True ) #check if the values are the same raw_data["event_wday"].value_counts(dropna=False) # In[11]: #check order raw_data["event_wday"].head() # In[12]: raw_data["checkin_wday"] = raw_data["checkin_wday"].astype("category") raw_data["checkin_wday"].value_counts(dropna=False) # In[13]: raw_data["checkin_wday"] = raw_data["checkin_wday"].cat.set_categories( new_categories=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"], ordered=True ) #check if the values are the same raw_data["checkin_wday"].value_counts(dropna=False) # In[14]: #check order raw_data["checkin_wday"].head() # In[15]: raw_data["checkin_month"] = raw_data["checkin_month"].astype("category") raw_data["checkin_month"].value_counts(dropna=False) # In[16]: raw_data["checkin_month"] = raw_data["checkin_month"].cat.set_categories( new_categories=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], ordered=True ) #check order raw_data["checkin_month"].value_counts(dropna=False) # In[17]: #check order raw_data["checkin_month"].head() # In[18]: #final check of datatype and memory savings raw_data.info() # #### Event Year EDA # In[19]: #generate dataframe df_event_year = pd.DataFrame(raw_data.groupby("event_year")["total_number_events"].sum()) df_event_year.reset_index(inplace=True) df_event_year["pct_total"] = (df_event_year["total_number_events"]/raw_data["total_number_events"].sum())*100 #interactive plotly bar chart fig_event_year = px.bar(data_frame = df_event_year, x = 'event_year', y = 'total_number_events', color_discrete_map={ 2019: 'darkgray', 2021: 'purple'}, color = 'event_year', hover_data = ['pct_total']) fig_event_year.update_xaxes(title_text = 'Event Year', type='category') fig_event_year.update_layout( title = 'Event Year EDA', yaxis_title = 'Total Number of Events', showlegend=False) fig_event_year.show() # - *More than a third of the events were from 2019 and less than a quarter are from 2020* # In[20]: #interactive data table dt.show_grid(df_event_year, precision = 2) # #### Event Month Day EDA # In[21]: #generate dataframe raw_data_2019 = pd.DataFrame(raw_data[raw_data["event_year"] == 2019]) raw_data_2021 = pd.DataFrame(raw_data[raw_data["event_year"] == 2021]) #2019 df_event_mday_2019 = pd.DataFrame(raw_data_2019.groupby("event_mday")["total_number_events"].sum()) df_event_mday_2019.rename(columns = {"total_number_events" : "mday_2019_total"}, inplace = True) #2021 df_event_mday_2021 = pd.DataFrame(raw_data_2021.groupby("event_mday")["total_number_events"].sum()) df_event_mday_2021.rename(columns = {"total_number_events" : "mday_2021_total"}, inplace = True) #join dataframes df_event_mday = df_event_mday_2019.join(df_event_mday_2021) df_event_mday.reset_index(inplace=True) #interactive plotly line chart fig_event_mday = go.Figure() fig_event_mday.add_trace(go.Scatter( x = df_event_mday['event_mday'], y = df_event_mday['mday_2019_total'], name = '2019', mode = 'lines+markers', line=go.scatter.Line(color="darkgray"))) fig_event_mday.add_trace(go.Scatter( x = df_event_mday['event_mday'], y = df_event_mday['mday_2021_total'], name = '2021', mode = 'lines+markers', line=go.scatter.Line(color="purple"))) fig_event_mday.update_layout( title = 'Event Month Day EDA', xaxis_title = 'Date', yaxis_title = 'Total Number of Events') fig_event_mday.show() # - _Based on day on day comparison, 2019 saw some dips on certain dates like Jan 05, Jan 12, etc._ # # - _2020 also exhibits certain dips on total events like Jan 10, Jan 17, etc._ # In[22]: #interactive data table df_event_mday["diff_val"] = df_event_mday["mday_2021_total"] - df_event_mday["mday_2019_total"] df_event_mday["diff_pct"] = (df_event_mday["diff_val"] / df_event_mday["mday_2019_total"])*100 dt.show_grid(df_event_mday, precision = 2) # #### Event Weekday EDA # In[23]: #2019 df_event_wday_2019 = pd.DataFrame(raw_data_2019.groupby("event_wday")["total_number_events"].sum()) df_event_wday_2019.rename(columns = {"total_number_events" : "wday_2019_total"}, inplace = True) #2021 df_event_wday_2021 = pd.DataFrame(raw_data_2021.groupby("event_wday")["total_number_events"].sum()) df_event_wday_2021.rename(columns = {"total_number_events" : "wday_2021_total"}, inplace = True) #join dataframes df_event_wday = df_event_wday_2019.join(df_event_wday_2021) df_event_wday.reset_index(inplace=True) df_event_wday.head(7) #interactive plotly bar chart fig_event_wday = go.Figure() fig_event_wday.add_trace(go.Bar( x = df_event_wday['event_wday'], y = df_event_wday['wday_2019_total'], name = '2019', marker=dict(color="darkgray"))) fig_event_wday.add_trace(go.Bar( x = df_event_wday['event_wday'], y = df_event_wday['wday_2021_total'], name = '2021', marker=dict(color="purple"))) fig_event_wday.update_layout( title = 'Event Weekday EDA', xaxis_title = 'Day', yaxis_title = 'Total Number of Events') fig_event_wday.show() # - _Viewing from a day of the week, there are less total events on Saturday and Sunday for 2019 while it is fairly consistent throughout the week with minimal fluctuations for 2020._ # In[24]: #interactive data table df_event_wday["diff_val"] = df_event_wday["wday_2021_total"] - df_event_wday["wday_2019_total"] df_event_wday["diff_pct"] = (df_event_wday["diff_val"] / df_event_wday["wday_2019_total"])*100 dt.show_grid(df_event_wday, precision = 2) # #### Activity Group EDA # In[25]: #generate dataframe df_activity = pd.DataFrame(raw_data.groupby("activity_group")["total_number_events"].sum()) df_activity.reset_index(inplace=True) df_activity["pct_total"] = (df_activity["total_number_events"]/raw_data["total_number_events"].sum())*100 #interactive plotly bar chart fig_activity = px.bar(data_frame = df_activity, x = 'activity_group', y = 'total_number_events', color_discrete_map={ 'Booking' : 'purple', 'Searching': 'darkgray'}, color = 'activity_group', hover_data = ['pct_total']) fig_activity.update_layout( title = 'Activity Group EDA', xaxis_title = 'Activity Group', yaxis_title = 'Total Number of Events', showlegend=False) fig_activity.show() # - _From the total number of events, only less than 8% are doing actual reservation. It would be interesting to see what the are the factors that affect the conversion from ‘Searching’ to ‘Booking’._ # # - _An ‘conversion’ attribute can be use as a target variable for a prediction model._ # In[26]: #interactive data table dt.show_grid(df_activity, precision = 2) # #### Reason for Travel Detailed EDA # In[27]: #generate dataframe df_reason_travel = pd.DataFrame(raw_data.groupby("reason_for_travel_detailed")["total_number_events"].sum()) df_reason_travel.reset_index(inplace=True) df_reason_travel["pct_total"] = (df_reason_travel["total_number_events"]/raw_data["total_number_events"].sum())*100 df_reason_travel.sort_values(by = "total_number_events", ascending = False, inplace = True) #interactive plotly bar chart fig_reason_travel = px.bar(data_frame = df_reason_travel, x = 'reason_for_travel_detailed', y = 'total_number_events', color_discrete_map={ 'Leisure - Non-Family': 'orchid', 'Business' : 'darkorchid', 'Leisure - Family': 'purple'}, color = 'reason_for_travel_detailed', hover_data = ['pct_total']) fig_reason_travel.update_layout( title = 'Reason for Travel Detailed EDA', xaxis_title = 'Reason for Travel', yaxis_title = 'Total Number of Events', showlegend=False) fig_reason_travel.show() # - _Almost 95% of the total events are from Leisure - Non-Family and Business. Although Leisure - Family accounts for more than 5%, it would be good to know the spending patterns._ # # - _Intuitively, leisure travel consisting of 3 or more people for Leisure-Family category will spend more in the hotel services like rooms and food than Leisure - Non-Family and Business._ # # # In[28]: #interactive data table dt.show_grid(df_reason_travel, precision = 2) # #### Origin Country Code EDA # In[29]: #generate dataframe df_origin_country = pd.DataFrame(raw_data.groupby("origin_country_code", dropna=False)["total_number_events"].sum()) df_origin_country.reset_index(inplace=True) df_origin_country["pct_total"] = (df_origin_country["total_number_events"]/raw_data["total_number_events"].sum())*100 df_origin_country.sort_values(by = "total_number_events",ascending = False,inplace = True) df_origin_country_10 = pd.DataFrame(df_origin_country.head(10)) #interactive plotly bar chart fig_origin_country = px.bar(data_frame = df_origin_country_10, y = 'origin_country_code', x = 'total_number_events', hover_data = ['pct_total']) fig_origin_country.update_traces(marker_color='purple') fig_origin_country.update_yaxes(categoryorder='total ascending') fig_origin_country.update_layout( title = 'Top 10 Origin Country Code EDA', yaxis_title = 'Origin Country', xaxis_title = 'Total Number of Events') fig_origin_country.show() # - _The top 5 origin country contributes to more than 92% of the total events exceeding the 80/20 Pareto principle._ # # # In[30]: #interactive data table dt.show_grid(df_origin_country, precision = 2) # #### Hotel City EDA # In[31]: #generate dataframe df_hotel_city = pd.DataFrame(raw_data.groupby("hotel_city",dropna=False)["total_number_events"].sum()) df_hotel_city.reset_index(inplace=True) df_hotel_city["pct_total"] = (df_hotel_city["total_number_events"]/raw_data["total_number_events"].sum())*100 df_hotel_city.sort_values(by = "total_number_events",ascending = False,inplace = True) df_hotel_city_10 = pd.DataFrame(df_hotel_city.head(10)) #interactive plotly bar chart fig_hotel_city = px.bar(data_frame = df_hotel_city_10, y = 'hotel_city', x = 'total_number_events', hover_data = ['pct_total']) fig_hotel_city.update_traces(marker_color='purple') fig_hotel_city.update_yaxes(categoryorder='total ascending') fig_hotel_city.update_layout( title = 'Top 10 Hotel City EDA', yaxis_title = 'Hotel City', xaxis_title = 'Total Number of Events') fig_hotel_city.show() # In[32]: #interactive data table dt.show_grid(df_hotel_city, precision = 2) # #### Hotel State EDA # In[33]: #generate dataframe df_hotel_state = pd.DataFrame(raw_data.groupby("hotel_state",dropna=False)["total_number_events"].sum()) df_hotel_state.reset_index(inplace=True) #93.37% are mising values df_hotel_state["pct_total"] = (df_hotel_state["total_number_events"]/raw_data["total_number_events"].sum())*100 df_hotel_state.sort_values(by = "total_number_events", ascending = False,inplace = True) df_hotel_state_10 = pd.DataFrame(df_hotel_state.head(10)) #interactive plotly bar chart fig_hotel_state = px.bar(data_frame = df_hotel_state_10, y = 'hotel_state', x = 'total_number_events', hover_data = ['pct_total']) fig_hotel_state.update_traces(marker_color='purple') fig_hotel_state.update_yaxes(categoryorder='total ascending') fig_hotel_state.update_layout( title = 'Top 10 Hotel State EDA', yaxis_title = 'Hotel State', xaxis_title = 'Total Number of Events') fig_hotel_state.show() # In[34]: #interactive data table dt.show_grid(df_hotel_state, precision = 2) # #### Hotel Country EDA # In[35]: #generate dataframe df_hotel_country = pd.DataFrame(raw_data.groupby("hotel_country",dropna=False)["total_number_events"].sum()) df_hotel_country.reset_index(inplace=True) #0.86% are mising values df_hotel_country["pct_total"] = (df_hotel_country["total_number_events"]/raw_data["total_number_events"].sum())*100 df_hotel_country.sort_values(by = "total_number_events", ascending = False,inplace = True) df_hotel_country_10 = pd.DataFrame(df_hotel_country.head(10)) #interactive plotly bar chart fig_hotel_country = px.bar(data_frame = df_hotel_country_10, y = 'hotel_country', x = 'total_number_events', hover_data = ['pct_total']) fig_hotel_country.update_traces(marker_color='purple') fig_hotel_country.update_yaxes(categoryorder='total ascending') fig_hotel_country.update_layout( title = 'Top 10 Hotel Country EDA', yaxis_title = 'Hotel Country', xaxis_title = 'Total Number of Events') fig_hotel_country.show() # In[36]: #interactive data table dt.show_grid(df_hotel_country, precision = 2) # - *London, GB accounts to almost 20% of the total number of events* # # - *As noted in the Data Dictionary, hotel_city, hotel_state, and hotel_country are raw data from the source systems and requires data cleaning to be useable.* # #### Traveler Value Group EDA # In[37]: #generate dataframe df_traveler_value = pd.DataFrame(raw_data.groupby("traveler_value_group")["total_number_events"].sum()) df_traveler_value.reset_index(inplace=True) df_traveler_value["pct_total"] = (df_traveler_value["total_number_events"]/raw_data["total_number_events"].sum())*100 df_traveler_value.sort_values(by = "total_number_events", ascending = False, inplace = True) #interactive plotly bar chart fig_traveler_value = px.bar(data_frame = df_traveler_value, x = 'traveler_value_group', y = 'total_number_events', color_discrete_map={ 'Low - Non-Family': 'orchid', 'Med' : 'darkorchid', 'High - Family': 'purple'}, color = 'traveler_value_group', hover_data = ['pct_total']) fig_reason_travel.update_layout( title = 'Traveler Value Group EDA', xaxis_title = 'Traveler Value', yaxis_title = 'Total Number of Events', showlegend=False) fig_reason_travel.show() # - *Similar to reason_for_travel_detailed, it would be good to know the spending patterns of different Traveler Value Group.* # # - *Intuitively, “High” will spend more in the hotel services like rooms, food, and loyalty. Therefore, they require the best quality of customer care.* # In[38]: #interactive data table dt.show_grid(df_traveler_value, precision = 2) # #### Checkin Year EDA # In[39]: #generate dataframe df_checkin_year = pd.DataFrame(raw_data.groupby("checkin_year")["total_number_events"].sum()) df_checkin_year.reset_index(inplace=True) df_checkin_year["pct_total"] = (df_checkin_year["total_number_events"]/raw_data["total_number_events"].sum())*100 #interactive plotly bar chart fig_checkin_year = px.bar(data_frame = df_checkin_year, x = 'checkin_year', y = 'total_number_events', color_discrete_map={ 2019 : 'darkgray', 2020 : 'darkgray', 2021 : 'purple', 2022 : 'darkgray', 2023 : 'darkgray', 2032 : 'darkgray',}, color = 'checkin_year', hover_data = ['pct_total']) fig_checkin_year.update_xaxes(title_text = 'Checkin Year', type='category') fig_checkin_year.update_layout( title = 'Checkin Year EDA', yaxis_title = 'Total Number of Events', showlegend=False) fig_checkin_year.show() # In[40]: #interactive data table dt.show_grid(df_checkin_year, precision = 2) # #### Checkin Month EDA # In[41]: #generate dataframe df_checkin_month = pd.DataFrame(raw_data.groupby("checkin_month")["total_number_events"].sum()) df_checkin_month.reset_index(inplace=True) df_checkin_month["pct_total"] = (df_checkin_month["total_number_events"]/raw_data["total_number_events"].sum())*100 #interactive plotly bar chart fig_checkin_month = px.bar(data_frame = df_checkin_month, x = 'checkin_month', y = 'total_number_events', hover_data = ['pct_total']) fig_checkin_month.update_traces(marker_color='purple') fig_checkin_month.update_layout( title = 'Checkin Month EDA', xaxis_title = 'Checkin Month', yaxis_title = 'Total Number of Events', showlegend=False) fig_checkin_month.show() # In[42]: #interactive data table dt.show_grid(df_checkin_month, precision = 2) # - _Most of the checkins are from 2019 and January. The data also show that there is a decreasing trend towards the end of the year. Probably, it would be good to have some promotions during the 3rd and 4th quarter to increase the number of events._ # #### Total Number of Events Histogram # In[43]: #generate dataframe df_total_event = pd.DataFrame(raw_data["total_number_events"].value_counts()) df_total_event.reset_index(inplace=True) df_total_event["pct_total"] = (df_total_event["total_number_events"]/raw_data["total_number_events"].sum())*100 df_total_event.rename(columns = {"index": "total_number_events", "total_number_events" : "count"}, inplace = True) df_total_event.sort_values(by = "count", ascending = False, inplace = True) fig_total_event = px.histogram(data_frame = raw_data, x='total_number_events', nbins = 100) fig_total_event.update_traces(marker_color='purple') fig_total_event.update_layout( title = 'Total Number of Events Histogram', yaxis_title = 'Count', xaxis_title = 'Total Number of Events', showlegend=False) fig_total_event.show() # In[44]: #interactive data table dt.show_grid(df_total_event, precision = 2) # ### 1.3 Data Cleaning # # #### **Clean Hotel City** # In[45]: clean_data = raw_data.copy(deep=True) #replace misspelled hotel city replace_map_hotel_city = {"dusseldorf" : "duesseldorf", "firenze" : "florence", "kln" : "koeln", "mnchen" : "munich", "seville" : "sevilla", "venezia": "venice"} clean_data["hotel_city"].replace(replace_map_hotel_city, inplace = True) clean_data["hotel_city"] = clean_data["hotel_city"].astype("category") #generate dataframe df_clean_hotel_city = pd.DataFrame(clean_data.groupby("hotel_city",dropna=False)["total_number_events"].sum()) df_clean_hotel_city.reset_index(inplace=True) df_clean_hotel_city["pct_total"] = (df_clean_hotel_city["total_number_events"]/clean_data["total_number_events"].sum())*100 df_clean_hotel_city.sort_values(by = "total_number_events",ascending = False,inplace = True) #interactive data table dt.show_grid(df_clean_hotel_city, precision = 2) # **Merged the following cities:** # - dusseldorf -> duesseldorf # - firenze -> florence # - kln -> koeln # - mnchen -> munich # - seville -> sevilla # - venezia -> venice # #### **Clean Hotel Country** # In[46]: replace_map_hotel_country = {"alemania" : "de", "allemagne" : "de", "germania" : "de", "espaa" : "es", "catalunya" : "es", "gran bretaa": "gb", "gran bretagna" : "gb", "grande bretagne" : "gb", "grobritannien" : "gb", "regno unito" : "gb", "reino unido" : "gb", "royaume-uni" : "gb", "sct" : "gb", "on" : "gb", #london "ns" : "gb", #liverpool "jp" : "fr", #bordeaux "br" : "es", #barcelona "co" : "es"} #madrid clean_data["hotel_country"].replace(replace_map_hotel_country, inplace = True) clean_data["hotel_country"] = clean_data["hotel_country"].astype("category") #generate dataframe df_clean_hotel_country = pd.DataFrame(clean_data.groupby("hotel_country",dropna=False)["total_number_events"].sum()) df_clean_hotel_country.reset_index(inplace=True) df_clean_hotel_country["pct_total"] = (df_clean_hotel_country["total_number_events"]/clean_data["total_number_events"].sum())*100 df_clean_hotel_country.sort_values(by = "total_number_events",ascending = False,inplace = True) #interactive data table dt.show_grid(df_clean_hotel_country, precision = 2) # **Issues:** # # - Merged and renamed countries. However US, CA, AU, and RU are still incorrect (does not match correctly with hotel_city) # # - There are missing countries and (not set) entries # # - Assuming the hotel_city is correct, a better approach is to map the hotel_country from the hotel_city # # - Additionally, hotel_state is unusable because of 93.37% of the data are missing. If this attribute is needed, hotel_state can be mapped from the hotel_city # #### **Final Clean Data** # In[47]: clean_data.drop(columns = ["hotel_state", "hotel_country"], inplace = True) clean_data.info() # - For the final data to be used for analysis, hotel_state and hotel_country will be excluded since the final report only requires top destination per city. # # - Additionally, the outliers in the total_number_events and missing values from other fields will be kept as is. The analysis assumes that the data inputs are correct. # ## 2. Insights Discovery # # ### 2.1 Booking # In[48]: #generate dataframe clean_data_2019 = pd.DataFrame(clean_data[clean_data["event_year"] == 2019]) clean_data_2019_booking = pd.DataFrame(clean_data_2019[clean_data_2019["activity_group"] == "Booking"]) clean_data_2019_searching = pd.DataFrame(clean_data_2019[clean_data_2019["activity_group"] == "Searching"]) clean_data_2021 = pd.DataFrame(clean_data[clean_data["event_year"] == 2021]) clean_data_2021_booking = pd.DataFrame(clean_data_2021[clean_data_2021["activity_group"] == "Booking"]) clean_data_2021_searching = pd.DataFrame(clean_data_2021[clean_data_2021["activity_group"] == "Searching"]) #2019 df_booking_2019 = pd.DataFrame(clean_data_2019_booking.groupby("hotel_city")["total_number_events"].sum()) df_booking_2019.rename(columns = {"total_number_events" : "booking_2019_total"}, inplace = True) #2021 df_booking_2021 = pd.DataFrame(clean_data_2021_booking.groupby("hotel_city")["total_number_events"].sum()) df_booking_2021.rename(columns = {"total_number_events" : "booking_2021_total"}, inplace = True) #join dataframes df_booking_city = df_booking_2019.join(df_booking_2021) df_booking_city.reset_index(inplace=True) df_booking_city.sort_values(by = "booking_2021_total", ascending = False, inplace = True) #interactive plotly line chart fig_booking_city = go.Figure() fig_booking_city.add_trace(go.Scatter( x = df_booking_city['hotel_city'], y = df_booking_city['booking_2021_total'], name = '2021', mode = 'lines+markers', line=go.scatter.Line(color='purple'))) fig_booking_city.add_trace(go.Scatter( x = df_booking_city['hotel_city'], y = df_booking_city['booking_2019_total'], name = '2019', mode = 'lines+markers', line=go.scatter.Line(color="darkgray"))) fig_booking_city.update_layout( title = 'Booking Top Destinations by City', xaxis_title = 'Booking Destination', yaxis_title = 'Total Number of Events') fig_booking_city.show() # In[49]: #interactive data table df_booking_city["diff_val"] = df_booking_city["booking_2021_total"] - df_booking_city["booking_2019_total"] df_booking_city["diff_pct"] = (df_booking_city["diff_val"] / df_booking_city["booking_2019_total"])*100 df_booking_city.sort_values(by = "diff_pct", ascending = False, inplace = True) dt.show_grid(df_booking_city, precision = 2) # In[50]: fig_booking_city_diff = px.bar(data_frame = df_booking_city, x = 'diff_pct', y = 'hotel_city') fig_booking_city_diff.update_traces(marker_color='purple') fig_booking_city_diff.update_yaxes(categoryorder='total ascending') fig_booking_city_diff.update_layout( title = 'Destinations by City Percentage Change (2021 vs. 2019)', xaxis_title = 'Percentage Change(%)', yaxis_title = 'Booking Destination') fig_booking_city_diff.show() # In[51]: #generate dataframe #2019 df_booking_2019_country = pd.DataFrame(clean_data_2019_booking.groupby("origin_country_code")["total_number_events"].sum()) df_booking_2019_country.rename(columns = {"total_number_events" : "booking_2019_country"}, inplace = True) #2021 df_booking_2021_country = pd.DataFrame(clean_data_2021_booking.groupby("origin_country_code")["total_number_events"].sum()) df_booking_2021_country.rename(columns = {"total_number_events" : "booking_2021_country"}, inplace = True) #join dataframes df_booking_country = df_booking_2019_country.join(df_booking_2021_country) df_booking_country.reset_index(inplace=True) df_booking_country.sort_values(by = "booking_2021_country", ascending = False, inplace = True) df_booking_country_10 = pd.DataFrame(df_booking_country.head(10)) #interactive plotly bar chart fig_booking_country = go.Figure() fig_booking_country.add_trace(go.Bar( x = df_booking_country_10['origin_country_code'], y = df_booking_country_10['booking_2021_country'], name = '2021', marker=dict(color="purple"))) fig_booking_country.add_trace(go.Bar( x = df_booking_country_10['origin_country_code'], y = df_booking_country_10['booking_2019_country'], name = '2019', marker=dict(color="darkgray"))) fig_booking_country.update_layout( title = 'Top 10 Origin Country', xaxis_title = 'Origin Country', yaxis_title = 'Total Number of Events') fig_booking_country.show() # In[52]: #interactive data table df_booking_country["diff_val"] = df_booking_country["booking_2021_country"] - df_booking_country["booking_2019_country"] df_booking_country["diff_pct"] = (df_booking_country["diff_val"] / df_booking_country["booking_2019_country"])*100 df_booking_country.sort_values(by = "booking_2021_country", ascending = False, inplace = True) dt.show_grid(df_booking_country, precision = 2) # - In terms of percent change of activity (2021 vs 2019), only florence has positive growth of 203.92%. # # - The rest of the destinations experienced negative percent change from venice (-29.69%) up to the worst hit cities which are leeds (-93.82%), frankfurt(-93.84%), koeln (-93.85%). # # - In terms of total number of events, birmingham has the highest events with 2,650 followed by london with 2,600. # # - The bottom 3 destination cities have less than 100 events which are malaga with 83, cologne with 52, and palma de mallorca with a mere 38. # # - The top origin countries from which bookings were made are US, FR, GB, DE, and ES. # ### 2.1 Searching # In[53]: #generate dataframe #2019 df_searching_2019 = pd.DataFrame(clean_data_2019_searching.groupby("hotel_city")["total_number_events"].sum()) df_searching_2019.rename(columns = {"total_number_events" : "searching_2019_total"}, inplace = True) #2021 df_searching_2021 = pd.DataFrame(clean_data_2021_searching.groupby("hotel_city")["total_number_events"].sum()) df_searching_2021.rename(columns = {"total_number_events" : "searching_2021_total"}, inplace = True) #join dataframes df_searching_city = df_searching_2019.join(df_searching_2021) df_searching_city.reset_index(inplace=True) df_searching_city.sort_values(by = "searching_2021_total", ascending = False, inplace = True) #interactive plotly line chart fig_searching_city = go.Figure() fig_searching_city.add_trace(go.Scatter( x = df_searching_city['hotel_city'], y = df_searching_city['searching_2021_total'], name = '2021', mode = 'lines+markers', line=go.scatter.Line(color='purple'))) fig_searching_city.add_trace(go.Scatter( x = df_searching_city['hotel_city'], y = df_searching_city['searching_2019_total'], name = '2019', mode = 'lines+markers', line=go.scatter.Line(color="darkgray"))) fig_searching_city.update_layout( title = 'Searching Top Destinations by City', xaxis_title = 'Searching Destination', yaxis_title = 'Total Number of Events') fig_searching_city.show() # In[54]: #interactive data table df_searching_city["diff_val"] = df_searching_city["searching_2021_total"] - df_searching_city["searching_2019_total"] df_searching_city["diff_pct"] = (df_searching_city["diff_val"] / df_searching_city["searching_2019_total"])*100 df_searching_city.sort_values(by = "diff_pct", ascending = False, inplace = True) dt.show_grid(df_searching_city, precision = 2) # In[55]: fig_searching_city_diff = px.bar(data_frame = df_searching_city, x = 'diff_pct', y = 'hotel_city') fig_searching_city_diff.update_traces(marker_color='purple') fig_searching_city_diff.update_yaxes(categoryorder='total ascending') fig_searching_city_diff.update_layout( title = 'Destinations by City Percentage Change (2021 vs. 2019)', xaxis_title = 'Percentage Change(%)', yaxis_title = 'Searching Destination') fig_searching_city_diff.show() # In[56]: #generate dataframe #2019 df_searching_2019_country = pd.DataFrame(clean_data_2019_searching.groupby("origin_country_code")["total_number_events"].sum()) df_searching_2019_country.rename(columns = {"total_number_events" : "searching_2019_country"}, inplace = True) #2021 df_searching_2021_country = pd.DataFrame(clean_data_2021_booking.groupby("origin_country_code")["total_number_events"].sum()) df_searching_2021_country.rename(columns = {"total_number_events" : "searching_2021_country"}, inplace = True) #join dataframes df_searching_country = df_searching_2019_country.join(df_searching_2021_country) df_searching_country.reset_index(inplace=True) df_searching_country.sort_values(by = "searching_2021_country", ascending = False, inplace = True) df_searching_country_10 = pd.DataFrame(df_searching_country.head(10)) #interactive plotly bar chart fig_searching_country = go.Figure() fig_searching_country.add_trace(go.Bar( x = df_searching_country_10['origin_country_code'], y = df_searching_country_10['searching_2021_country'], name = '2021', marker=dict(color="purple"))) fig_searching_country.add_trace(go.Bar( x = df_searching_country_10['origin_country_code'], y = df_searching_country_10['searching_2019_country'], name = '2019', marker=dict(color="darkgray"))) fig_searching_country.update_layout( title = 'Top 10 Origin Country', xaxis_title = 'Origin Country', yaxis_title = 'Total Number of Events') fig_searching_country.show() # In[57]: #interactive data table df_searching_country["diff_val"] = df_searching_country["searching_2021_country"] - df_searching_country["searching_2019_country"] df_searching_country["diff_pct"] = (df_searching_country["diff_val"] / df_searching_country["searching_2019_country"])*100 df_searching_country.sort_values(by = "searching_2021_country", ascending = False, inplace = True) dt.show_grid(df_searching_country, precision = 2) # - In terms of percent change of activity (2021 vs 2019), florence, venince, and birmingham have positive percent change with values of 181.83%, 27.73% and 1.51% respectively. # # - The rest of the destinations experienced negative percent change from palma de mallorca (-33.84%) up to the worst hit cities which are koeln (-92.54%) and leeds (-93.70%) # # - In terms of total number of events, london has the highest events with 54,812 and consistently at the top for 2019 and 2021. # # - The bottom 4 destination cities have less than 2000 events which are malaga (1,836), cologne (1,817), hannover (1,573), and leeds (1,210). # # - The top origin countries from which bookings were made are US, FR, GB, ES, and DE. # ## 3. Actionable Recommendations # # - Add new attribute like a flag to understand the factors that affect the conversion from ‘Searching’ to ‘Booking’. An ‘conversion’ attribute can be use as a target variable for a prediction model. # # - Add spending patterns information to know how ‘Reason for Travel Detailed’ and ‘Traveler Value Group’ contribute to the bottom line. # # - Add user demographics information, time of event, and campaigns used to advertise the hotel to better plan and invest marketing and operational resources. # # - Suggest to focus the marketing and operational resources to the top origin countries from which bookings were made such as US, FR, GB, DE, and ES. Additionally, may need to investigate why there are no bookings or even search from US for 2019. This may be a data source issue. # # - Finally, aside from looking at the City Hotel, perform additional interactive visualization and data analysis to other attributes (reason_for_travel_detailed, traveler_value_group, and departure_checkin_date). # #