#!/usr/bin/env python
# coding: utf-8

# ## Structure of the notebook 

# ### I. Creating the dataset.
# ### II. Sentiment analysis using RoBERTa.
# ### III. Data analysis in terms of type of seat.
# #### 1. Counting how many cases of each sentiment class we have for each type of seat.
# #### 2. Negative and positive word clouds for each type of seat.
# #### 3. The type of seat with the highest level of dissatisfaction.
# #### 4. The type of seat with the highest level of satisfaction.
# ###  IV. Data analysis in terms of type of traveler.
# #### 1. Counting how many cases of each sentiment class we have for each type of traveler.
# #### 2. Negative and positive word clouds for each type of traveler.
# #### 3. The type of traveler with the highest level of dissatisfaction.
# #### 4. The type of traveler with the highest level of satisfaction.
# ###  V. A Cross-tabulation of the variables “type of seat” and “type of traveler” with an aggregation of the ratings.
# 
# 
# 

#  

#  

# In[1]:


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from lxml import html
from scipy.special import softmax
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from wordcloud import WordCloud


# ### I. Creating the dataset 

# In[2]:


# XPath expressions for every field scraped from a review listing page.
rating_xpath = (
    '//div[contains(@itemprop, "reviewRating")]'
    '/span[contains(@itemprop, "ratingValue")]'
)
header_xpath = (
    '//div[contains(@class, "body")]'
    '/h2[contains(@class, "text_header")]'
)
rev_date_xpath = (
    '//div[contains(@class, "body")]'
    '//time[contains(@itemprop, "datePublished")]/@datetime'
)
verfiyed_xpath = (
    '//div[contains(@class, "text_content")]'
    '/strong/a/em'
)
review_cont_xpath = (
    '//article[contains(@itemprop, "review")]'
    '//div[contains(@class, "text_con")]'
)

# The remaining fields are table cells (`tr/td`) that differ only in the class
# of the label cell; the value is read from the sibling cell that follows it.
_DETAIL_CELL_XPATH = (
    '//tr/td[contains(@class, "review-rating-header {} ")]'
    '/following-sibling::td/text()'
)
type_of_traveler_xpath = _DETAIL_CELL_XPATH.format('type_of_traveller')
type_of_seat_xpath = _DETAIL_CELL_XPATH.format('cabin_flown')
route_path_xpath = _DETAIL_CELL_XPATH.format('route')
date_flight_xpath = _DETAIL_CELL_XPATH.format('date_flown')


# In[3]:


# Accumulators for each scraped field, extended one listing page at a time.
ratings = []
headers = []
rev_date = []
verif = []
reviews = []
type_of_trav = []
type_of_seat = []
routes = []
flt_date = []
# Numbers of the pages whose fields all lined up and were therefore kept.
complete_pages = []

for i in range(1, 36):
    URL = "https://www.airlinequality.com/airline-reviews/british-airways/page/{}/?sortby=post_date%3ADesc&pagesize=100".format(i)

    # A timeout keeps a single unresponsive request from hanging the scrape.
    page = requests.get(URL, timeout=30)
    tree = html.fromstring(page.content)

    # Skip a page with no review container rather than failing on `[0]` below.
    review_nodes = tree.xpath(review_cont_xpath)
    if not review_nodes:
        continue

    # Getting the ratings
    rt = tree.xpath(rating_xpath)

    # Getting the headers
    hd = tree.xpath(header_xpath)

    # Getting the dates
    dt_rv = tree.xpath(rev_date_xpath)

    # Getting verifications
    vr_f = tree.xpath(verfiyed_xpath)

    # Getting type of traveler
    trav_tp = tree.xpath(type_of_traveler_xpath)

    # Getting type of seat
    seat_tp = tree.xpath(type_of_seat_xpath)

    # Getting the routes
    route = tree.xpath(route_path_xpath)

    # Getting flight date
    fl_dt = tree.xpath(date_flight_xpath)

    # Code for cleaning the reviews.
    # NOTE: '//text()' is an absolute path, so every text node of the page is
    # visited; the filter keeps the strings containing the ' | ' separator and
    # drops the known non-review strings.
    rev = [
        j
        for j in review_nodes[0].xpath('//text()')
        if ' | ' in j and '{"@context' not in j and '[]' not in j and 'Air Travel Ratings &' not in j
    ]

    # Keep a page only when every field that later becomes a DataFrame column
    # has the same number of values; a partial check would let misaligned
    # columns through. `vr_f` is deliberately not part of the check: it is
    # collected but never used as a column.
    page_columns = (rt, hd, dt_rv, rev, trav_tp, seat_tp, route, fl_dt)
    if len({len(column) for column in page_columns}) == 1:
        ratings.extend(rt)
        headers.extend(hd)
        rev_date.extend(dt_rv)
        verif.extend(vr_f)
        reviews.extend(rev)
        type_of_trav.extend(trav_tp)
        type_of_seat.extend(seat_tp)
        routes.extend(route)
        flt_date.extend(fl_dt)
        complete_pages.append(i)


# In[253]:


# Number of flight dates collected across the kept pages.
len(flt_date)


# In[254]:


# Page numbers that passed the length check in the scraping loop.
complete_pages


# In[263]:


# Convert each scraped rating element to an integer via its text content.
ratings_ct = [int(node.text) for node in ratings]


# In[418]:


ratings_ct[0]


# In[271]:


# Header text of every review, with the surrounding double quotes stripped.
headers_ct = [str(node.text).strip('""') for node in headers]


# In[419]:


headers_ct[0]


# In[296]:


# Review bodies with leading/trailing spaces and '|' characters removed.
reviews_ct = [text.strip(' | ') for text in reviews]


# In[420]:


reviews_ct[0]


# In[311]:


import pandas as pd

# One row per review; each column comes from one of the scraped/cleaned lists.
df_BA = pd.DataFrame(
    {
        'ratings': ratings_ct,
        'headers': headers_ct,
        'review_date': rev_date,
        'review': reviews_ct,
        'traveler_type': type_of_trav,
        'seat_type': type_of_seat,
        'routes': routes,
        'flight_date': flt_date,
    }
)
df_BA


# In[312]:


# Persist the dataset (the index is written as the first CSV column).
df_BA.to_csv('BA_dataset.csv')


# In[324]:


# Getting all the reviews: fetch every listing page again and keep the first
# review container of each page for the text extraction that follows.
all_reviews = []
htmls = []
for i in range(1, 36):
    URL = "https://www.airlinequality.com/airline-reviews/british-airways/page/{}/?sortby=post_date%3ADesc&pagesize=100".format(i)

    # A timeout keeps a single unresponsive request from hanging the scrape.
    page = requests.get(URL, timeout=30)
    tree_reviews = html.fromstring(page.content)

    html_obj = tree_reviews.xpath(review_cont_xpath)
    # A page without any review container is skipped instead of raising
    # IndexError on `html_obj[0]`.
    if html_obj:
        htmls.append(html_obj[0])


# In[416]:


# Getting the reviews: keep every text node that contains no '{' and is longer
# than 238 characters.
# NOTE(review): the 238-character cutoff presumably separates review bodies
# from shorter page text -- confirm against the scraped pages.
all_rev = []

for container in htmls:
    # '//text()' is an absolute path, so this visits every text node of the
    # page the container belongs to, not only the container's own text.
    for text in container.xpath('//text()'):
        if '{' not in text and len(text) > 238:
            # Appending in place avoids rebuilding the whole list for every
            # text node, as `all_rev = all_rev + [...]` would.
            all_rev.append(text)


# In[417]:


len(all_rev)


# In[421]:


# Remove leading/trailing spaces and '|' characters from every review.
all_rev_str = [text.strip(' | ') for text in all_rev]


# In[422]:


all_rev_str[0]


# In[423]:


# Single-column dataframe holding every cleaned review text.
df_reviews_ba = pd.DataFrame(dict(reviews=all_rev_str))
df_reviews_ba


# In[424]:


# Write the reviews to disk (the index is written as the first CSV column).
df_reviews_ba.to_csv('df_all_reviews_BA.csv')


# ### II. Sentiment analysis using RoBERTa

# In[ ]:


# Loading the model.
weights = "cardiffnlp/twitter-roberta-base-sentiment"
tok = AutoTokenizer.from_pretrained(weights)
model = TFAutoModelForSequenceClassification.from_pretrained(weights)
# Save the tokenizer together with the model: the target directory has the
# same name as the hub id, so a later `from_pretrained(weights)` resolves to
# this local directory and needs the tokenizer files to be there as well.
tok.save_pretrained(weights)
model.save_pretrained(weights)


# In[548]:


def predict_class(text, model_output):
    """
    Return a sentiment probability, or the predicted sentiment class, for a text.

    Inputs:
    ---------

     - text : The text for which we want to compute the sentiment score or the class.
     - model_output : One of 'positive', 'negative' or 'neutral' to get the
                      probability of that sentiment, or 'class' to get the
                      index of the most probable sentiment.

    Outputs :
    ---------

    - The probability of the requested sentiment, or the index of the most
      probable class (position 0 is read as negative, 1 as neutral and the
      last position as positive).

    Raises :
    ---------

    - ValueError if `model_output` is not one of the four supported strings.

    """
    # Position of each sentiment in the probability vector.
    prob_index = {'negative': 0, 'neutral': 1, 'positive': -1}

    # Reject an unsupported request before running the model, instead of
    # paying for a forward pass and silently returning None.
    if model_output != 'class' and model_output not in prob_index:
        raise ValueError(
            "model_output must be 'positive', 'neutral', 'negative' or 'class', "
            "got {!r}".format(model_output)
        )

    enc = tok(text, return_tensors='tf', padding='max_length')
    output = model(**enc)
    # First element of the model output, first item of the batch, turned into
    # probabilities.
    prob = softmax(output[0][0])

    if model_output == 'class':
        return np.argmax(prob)
    return prob[prob_index[model_output]]
   

# In[ ]:


# Score every review: one pass for the probability of the positive class and a
# second pass for the predicted class. Only the first 510 characters of each
# review are passed to the model.
review_snippets = [text[0:510] for text in df_BA['review'].values]
df_BA['prob_positive'] = [predict_class(snippet, 'positive') for snippet in review_snippets]
df_BA['sent_class'] = [predict_class(snippet, 'class') for snippet in review_snippets]


# In[4]:


# Evaluating the performance of the model: mean user rating per predicted class.
pd.DataFrame(df_BA.groupby(['sent_class'])['ratings'].mean())


# We can see that the model did a decent job of classifying a large number of positive and negative reviews, but not a good job of distinguishing neutral reviews.
# 

# In[556]:


# Summary statistics of the ratings given by reviews predicted as neutral (1).
df_BA.loc[df_BA['sent_class'] == 1, 'ratings'].describe()


# We can see that at least two examples were incorrectly classified as neutral: one with a rating of 10 (positive) and another with a rating of 1 (negative).
# 

# In[569]:


# Overriding the model wherever its prediction contradicts the user's rating.
# The rating masks are computed once; the class comparison is re-evaluated in
# every statement because each assignment changes `sent_class`.
low_rating = df_BA["ratings"] < 5
high_rating = df_BA["ratings"] > 5

# Rating below 5 but predicted neutral (1) -> negative (0).
df_BA.loc[low_rating & df_BA['sent_class'].eq(1), 'sent_class'] = 0


# In[572]:


# Verify the execution of the correction (no row should match any more).

df_BA.loc[low_rating & df_BA['sent_class'].eq(1), 'sent_class']


# In[577]:


# Rating above 5 but predicted neutral (1) -> positive (2).

df_BA.loc[high_rating & df_BA['sent_class'].eq(1), 'sent_class'] = 2


# In[578]:


# Verify the execution of the correction (no row should match any more).

df_BA.loc[high_rating & df_BA['sent_class'].eq(1), 'sent_class']


# In[800]:


# Rating above 5 but predicted negative (0) -> positive (2).

df_BA.loc[high_rating & df_BA['sent_class'].eq(0), 'sent_class'] = 2


# In[801]:


# Verify the execution of the correction (no row should match any more).

df_BA.loc[high_rating & df_BA['sent_class'].eq(0), 'sent_class']


# In[802]:


# Rating below 5 but predicted positive (2) -> negative (0).

df_BA.loc[low_rating & df_BA['sent_class'].eq(2), 'sent_class'] = 0


# In[803]:


# Verify the execution of the correction (no row should match any more).

df_BA.loc[low_rating & df_BA['sent_class'].eq(2), 'sent_class']


# In[804]:


# Mean rating per sentiment class once the rating-based corrections are applied.

df_BA['ratings'].groupby(df_BA['sent_class']).mean()


# Now, the average rating for the neutral class is 5 which is the most representative rating for a neutral review.

# In[805]:


# Translate the numeric class (0, 1, 2) into a readable label.
dict_ = dict(enumerate(['negative', 'neutral', 'positive']))
df_BA['sentiment'] = df_BA['sent_class'].replace(dict_)
df_BA['sentiment'].head()


# In[806]:


# Number of reviews per sentiment label.
sns.countplot(data=df_BA, x='sentiment')
plt.show()


# ### III. Data analysis in terms of type of seat.

# In[807]:


seats = list(df_BA.seat_type.unique())
seats


# #### 1. Counting how many cases of each sentiment class we have for each type of seat.
# 

# In[808]:


# One count plot per seat type, always with the same bar order so the figures
# can be compared with each other.
sentiment_order = ['negative', 'positive', 'neutral']
for seat in ['Business Class', 'Economy Class', 'Premium Economy', 'First Class']:
    sns.countplot(x='sentiment', data=df_BA[df_BA['seat_type'] == seat], order=sentiment_order)
    if seat == 'First Class':
        # Save before show(): once the figure has been shown it may no longer
        # be the current figure, and savefig() would then write an empty image.
        plt.savefig('num_pos_rev.png')
    plt.show()


# #### 2. Negative and positive wordclouds for each type of seat

# In[812]:


# Word clouds per seat type: for each seat type, first the reviews labelled
# negative, then the reviews labelled positive.
for seat in ('Business Class', 'Economy Class', 'Premium Economy', 'First Class'):
    for mood in ('negative', 'positive'):
        selected = (df_BA['seat_type'] == seat) & (df_BA['sentiment'] == mood)
        bc_df = df_BA.loc[selected, 'review']
        # All selected reviews joined into one text for the word cloud.
        words = ' '.join(bc_df)
        wordcloud = WordCloud(background_color='white').generate(words)
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()


# #### 3. The type of seat with the highest level of dissatisfaction
# 

# In[820]:


# Number of negative reviews per seat type, as a dataframe with the columns
# 'seat_type' and 'sentiment' (the count).
negative_reviews = df_BA[df_BA.sentiment == 'negative']
neg_df = negative_reviews.groupby(['seat_type'])['sentiment'].count().reset_index()

# Total number of reviews per seat type. The counts are divided by these so
# that seat types with many reviews do not dominate the comparison.
num_econ = int((df_BA['seat_type'] == 'Economy Class').sum())
num_bus = int((df_BA['seat_type'] == 'Business Class').sum())
num_first = int((df_BA['seat_type'] == 'First Class').sum())
num_prem = int((df_BA['seat_type'] == 'Premium Economy').sum())

# Normalized count = negative reviews of the seat type / all of its reviews.
seat_totals = {
    'Business Class': num_bus,
    'Economy Class': num_econ,
    'First Class': num_first,
    'Premium Economy': num_prem,
}
for seat, total in seat_totals.items():
    rows = neg_df["seat_type"] == seat
    neg_df.loc[rows, 'normalized_count'] = neg_df.loc[rows, 'sentiment'] / total


# In[821]:


# The raw count is no longer needed once it has been normalized.
neg_df.drop(columns='sentiment', inplace=True)


# In[822]:


sns.catplot(data=neg_df, x='seat_type', y='normalized_count', kind='bar')
plt.title('The level of unsatisfaction per type of seat')
plt.xlabel('The type of the seat')
plt.ylabel('The normalized count')
plt.show()


# The visual shows us that the class with the highest level of dissatisfaction is the "economy class".

# #### 4. The type of seat with the highest level of satisfaction
# 

# In[823]:


# Number of positive reviews per seat type, as a dataframe with the columns
# 'seat_type' and 'sentiment' (the count).
positive_reviews = df_BA[df_BA.sentiment == 'positive']
pos_df = positive_reviews.groupby(['seat_type'])['sentiment'].count().reset_index()

# Normalized count = positive reviews of the seat type / all of its reviews.
# The totals (num_bus, num_econ, num_first, num_prem) are the per-seat-type
# review counts computed for the dissatisfaction analysis.
seat_totals = {
    'Business Class': num_bus,
    'Economy Class': num_econ,
    'First Class': num_first,
    'Premium Economy': num_prem,
}
for seat, total in seat_totals.items():
    rows = pos_df["seat_type"] == seat
    pos_df.loc[rows, 'normalized_count'] = pos_df.loc[rows, 'sentiment'] / total


# The raw count is no longer needed once it has been normalized.
pos_df.drop(columns='sentiment', inplace=True)


# In[824]:


sns.catplot(data=pos_df, x='seat_type', y='normalized_count', kind='bar')
plt.title('The level of satisfaction per type of seat')
plt.xlabel('The type of the seat')
plt.ylabel('The normalized count')
plt.show()


# The two visuals show us that the most appreciated class is the "first class", it has the lowest level of dissatisfaction and the highest level of satisfaction.
# 

# ###  IV. Data analysis in terms of type of traveler 
# 

# #### 1. Counting how many cases of each sentiment class we have for each type of traveler.
# 

# In[825]:


df_BA['traveler_type'].unique()


# In[826]:


# One count plot per traveler type, always with the same bar order so the
# figures can be compared with each other.
sentiment_order = ['negative', 'positive', 'neutral']
for traveler in ('Solo Leisure', 'Couple Leisure', 'Business', 'Family Leisure'):
    sns.countplot(data=df_BA[df_BA['traveler_type'] == traveler], x='sentiment', order=sentiment_order)
    plt.show()


# #### 2. Negative and positive wordclouds for each type of traveler

# In[830]:


# Word clouds per traveler type: for each traveler type, first the reviews
# labelled negative, then the reviews labelled positive.
for traveler in ('Solo Leisure', 'Family Leisure', 'Couple Leisure', 'Business'):
    for mood in ('negative', 'positive'):
        selected = (df_BA['traveler_type'] == traveler) & (df_BA['sentiment'] == mood)
        traveler_reviews = df_BA.loc[selected, 'review']
        # All selected reviews joined into one text for the word cloud.
        words = ' '.join(traveler_reviews)
        wordcloud = WordCloud(background_color='white').generate(words)
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.show()


# #### 3. The type of traveler with the highest level of dissatisfaction
# 

# In[838]:


# Number of negative reviews per traveler type, as a dataframe with the
# columns 'traveler_type' and 'sentiment' (the count).
negative_reviews = df_BA[df_BA.sentiment == 'negative']
neg_df_tr = negative_reviews.groupby(['traveler_type'])['sentiment'].count().reset_index()

# Total number of reviews per traveler type, used as the normalizing divisor.
# NOTE: `num_bus` is rebound here to the 'Business' traveler count; the
# seat-type analysis uses the same name for the 'Business Class' count.
num_solo_l = int((df_BA['traveler_type'] == 'Solo Leisure').sum())
num_couple_l = int((df_BA['traveler_type'] == 'Couple Leisure').sum())
num_bus = int((df_BA['traveler_type'] == 'Business').sum())
num_fam_l = int((df_BA['traveler_type'] == 'Family Leisure').sum())

# Normalized count = negative reviews of the traveler type / all of its reviews.
traveler_totals = {
    'Solo Leisure': num_solo_l,
    'Couple Leisure': num_couple_l,
    'Business': num_bus,
    'Family Leisure': num_fam_l,
}
for traveler, total in traveler_totals.items():
    rows = neg_df_tr["traveler_type"] == traveler
    neg_df_tr.loc[rows, 'normalized_count_trav'] = neg_df_tr.loc[rows, 'sentiment'] / total

neg_df_tr


# In[839]:


sns.catplot(data=neg_df_tr, x='traveler_type', y='normalized_count_trav', kind='bar')
plt.title('The level of unsatisfaction per type of traveler')
plt.xlabel('The type of the traveler')
plt.ylabel('The normalized count')
plt.show()


# #### 4. The type of traveler with the highest level of satisfaction
# 

# In[840]:


# Number of positive reviews per traveler type, as a dataframe with the
# columns 'traveler_type' and 'sentiment' (the count).
positive_reviews = df_BA[df_BA.sentiment == 'positive']
pos_df_tr = positive_reviews.groupby(['traveler_type'])['sentiment'].count().reset_index()

# Normalized count = positive reviews of the traveler type / all of its reviews.
# The totals are the per-traveler-type review counts computed for the
# dissatisfaction analysis.
traveler_totals = {
    'Solo Leisure': num_solo_l,
    'Couple Leisure': num_couple_l,
    'Business': num_bus,
    'Family Leisure': num_fam_l,
}
for traveler, total in traveler_totals.items():
    rows = pos_df_tr["traveler_type"] == traveler
    pos_df_tr.loc[rows, 'normalized_count_trav'] = pos_df_tr.loc[rows, 'sentiment'] / total

pos_df_tr


# In[787]:


sns.catplot(data=pos_df_tr, x='traveler_type', y='normalized_count_trav', kind='bar')
plt.title('The level of satisfaction per type of traveler')
plt.xlabel('The type of the traveler')
plt.ylabel('The normalized count')
plt.show()


# From the two previous visuals, we can see that the most satisfied type of traveler is the "Family Leisure" traveler.
# They have the lowest level of dissatisfaction and the highest level of satisfaction.
# 

# ###  V. A Cross-tabulation of the variables type of seat and type of traveler to aggregate the ratings.

# In[684]:


# Median rating for every (traveler type, seat type) combination.
cross_tb = pd.crosstab(
    index=df_BA['traveler_type'],
    columns=df_BA['seat_type'],
    values=df_BA['ratings'],
    aggfunc='median',
)
cross_tb


# In[687]:


# The labels name the statistic that is actually aggregated above (the median).
cross_tb.plot(kind='bar')
plt.title('Median rating of the different classes by the different types of travelers')
plt.xlabel('Type of traveler')
plt.ylabel('Median rating')
plt.show()


# From the cross table and the visual we can see that :
# 
# - Travelers of type "business" are very dissatisfied with all types of seats.
# - The first class is poorly rated by travelers of type "family leisure" and highly rated by travelers of types "solo leisure" and "couple leisure".
# - The "family leisure" type of travelers appreciate the "business class".
# - The "solo leisure" travelers give a decent median rating for all types of seats, except the "economy class".
# 
# 

# In[ ]: