# Flattened notebook cell: explore hotel-review ratings and chart their shares.
# Notebook shell magics kept as comments (run manually outside this script):
#   !pip install kaggle
#   !mkdir ~/.kaggle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()


def normalize_to_five_scale(ratings):
    """Return a copy of *ratings* mapped onto a 0-5 scale.

    NaNs become 0.  Values above 5 are assumed to come from a 10-point
    scale and are converted via (x / 10) * 5, i.e. simply halved.  This
    replaces the original row-by-row ``.loc`` loop with one vectorized
    expression (same result, no per-row Python overhead).
    """
    filled = ratings.fillna(0)
    return filled.where(filled <= 5, filled / 2)


def plot_rating_donut(rating_counts):
    """Draw a donut chart showing the share of each rounded rating value.

    NOTE(review): only 5 colors are listed but up to 6 distinct rounded
    ratings (0-5) can occur; matplotlib cycles colors when the list is
    shorter than the number of wedges — confirm this is acceptable.
    """
    custom_colors = ["tan", "grey", 'silver', "black", "yellow"]
    plt.figure(figsize=(5, 5))
    plt.pie(rating_counts.values, labels=rating_counts.index,
            colors=custom_colors)
    # White circle over the center turns the pie into a donut.
    central_circle = plt.Circle((0, 0), 0.5, color='white')
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.rc('font', size=12)
    plt.title("Hotel Reviews Ratings", fontsize=20)
    plt.show()


if __name__ == "__main__":
    data = pd.read_csv("Hotel_Reviews.csv")
    print(data.head())

    # Quick look at the categorical columns present in the file.
    print(data["country"].unique())
    print(data["name"].unique())
    print(data["reviews.rating"].unique())

    # Fill missing ratings with 0 and clamp everything onto a 5-point scale.
    data['reviews.rating'] = normalize_to_five_scale(data['reviews.rating'])

    # Round to whole stars and chart the distribution.
    ratings = data['reviews.rating'].round(0).value_counts()
    plot_rating_donut(ratings)

# Sentiment analysis - analyzer, transformers, textblob, ...
# Add sentiment analysis columns + scrape hotel data (flattened notebook cells).
# Notebook shell magics kept as comments (run manually outside this script):
#   pip install -q transformers
#   pip install TextBlob
#   !pip install beautifulsoup4 requests pandas
import csv
import re

import requests
from bs4 import BeautifulSoup
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")
# NOTE(review): `data` must already be loaded by the earlier cell that reads
# Hotel_Reviews.csv — this cell depends on notebook execution order.
rt = data["reviews.text"]
sid = SentimentIntensityAnalyzer()

# --- TripAdvisor: scrape the Hawaii hotels listing page ---------------------
url = ('https://www.tripadvisor.in/Hotels-g28932-Hawaii-Hotels.html')
user_agent = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/90.0.4430.212 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}


def get_page_contents(url):
    """Fetch *url* with a browser-like User-Agent and return the parsed soup."""
    page = requests.get(url, headers=user_agent)
    return BeautifulSoup(page.text, 'html.parser')


soup = get_page_contents(url)

# Extract the listing fields (hotel name, bubble rating, review count, price).
hotels = [tag.text.strip() for tag in soup.find_all('div', {'class': 'listing_title'})]
ratings = [tag['alt'] for tag in soup.find_all('a', {'class': 'ui_bubble_rating'})]
reviews = [tag.text.strip() for tag in soup.find_all('a', {'class': 'review_count'})]
prices = [tag.text.replace('₹', '').strip() for tag in soup.find_all('div', {'class': 'price-wrap'})]

# Renamed from `dict`, which shadowed the builtin.
hotel_columns = {
    'Hotel Names': hotels,
    'Ratings': ratings,
    'Number of Reviews': reviews,
    'Prices': prices,
}
information = pd.DataFrame.from_dict(hotel_columns)
print(information.head(10))
information.to_csv('hotels.csv', index=False, header=True)

# --- Yelp: paginate through Fairmont San Francisco review pages -------------
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
page = "https://www.yelp.com/biz/fairmont-san-francisco-san-francisco?sort_by=rating_desc"
page_num = 0
session = requests.Session()
review_comments = []
while True:
    pageTree = session.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    for post in pageSoup.select('p[lang="en"]'):
        review_comments.append(post.get_text().replace('\n', '').strip())
    # BUG FIX: the original formatted the next-page URL *before* advancing
    # page_num, so the first "Next" request re-fetched start=0 and duplicated
    # the first page's reviews.  Advance the offset first.
    # (`string=` replaces the deprecated `text=` keyword in bs4.)
    if pageSoup.find("span", string=re.compile("Next")):
        page_num += 20
        page = ("https://www.yelp.com/biz/fairmont-san-francisco-san-francisco"
                "?start={}&sort_by=rating_desc".format(page_num))
    else:
        break

df = pd.DataFrame({"Review_Comments": review_comments})
print(df)
# index=False avoids a spurious "Unnamed: 0" column when the CSV is re-read.
df.to_csv('filename.csv', index=False)
df = pd.read_csv('filename.csv')
print(df.to_string())

# NOTE(review): the loop below re-scans the TripAdvisor `soup` from above —
# it appears to be a stray duplicate of the Metacritic cell that follows
# (which rebuilds `names` from its own soup); kept for fidelity.
names = []
for links in soup.find_all('div', class_='name'):
    names.append(links.get_text().strip())

# Extract the HTML and create a BeautifulSoup object.
# --- Metacritic: scrape user reviews for Super Mario 3D World + Bowser's Fury
url = 'https://www.metacritic.com/game/switch/super-mario-3d-world-+-bowsers-fury/user-reviews'
user_agent = {'User-agent': 'Mozilla/5.0'}


def get_page_contents(url):
    """Fetch *url* with a minimal User-Agent and return the parsed soup."""
    page = requests.get(url, headers=user_agent)
    return BeautifulSoup(page.text, 'html.parser')


soup = get_page_contents(url)

names = [tag.get_text().strip() for tag in soup.find_all('div', class_='name')]
dates = [tag.get_text() for tag in soup.find_all('div', class_='date')]
ratings = [tag.get_text() for tag in
           soup.find_all('div', class_='metascore_w user medium game positive indiv')]

# BUG FIX: the original collected all "blurb_expanded" spans first and all
# "blurb_collapsed" spans second, so reviews no longer lined up row-wise with
# the name/date/rating lists.  A single CSS selector matches both classes
# while preserving document order.
reviews = [tag.get_text() for tag in
           soup.select('span.blurb.blurb_expanded, span.blurb.blurb_collapsed')]

games_dict = {'Name': names, 'Date': dates, 'Rating': ratings, 'Review': reviews}

# The four lists can legitimately differ in length (e.g. not every review has
# a blurb), so report the lengths, then build the frame row-wise and
# transpose: short columns pad with NaN instead of raising.
print(len(names), len(dates), len(ratings), len(reviews))
game = pd.DataFrame.from_dict(games_dict, orient='index')
games = game.transpose()
print(games.head(4))
games.to_csv('reviews.csv', index=False, header=True)
reviews = pd.read_csv('reviews.csv', lineterminator='\n')
print(reviews)