# Flattened notebook cell: explore hotel-review ratings and chart their shares.
# Notebook shell magics kept as comments (run manually outside this script):
#   !pip install kaggle
#   !mkdir ~/.kaggle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()


def normalize_to_five_scale(ratings):
    """Return a copy of *ratings* mapped onto a 0-5 scale.

    NaNs become 0.  Values above 5 are assumed to come from a 10-point
    scale and are converted via (x / 10) * 5, i.e. simply halved.  This
    replaces the original row-by-row ``.loc`` loop with one vectorized
    expression (same result, no per-row Python overhead).
    """
    filled = ratings.fillna(0)
    return filled.where(filled <= 5, filled / 2)


def plot_rating_donut(rating_counts):
    """Draw a donut chart showing the share of each rounded rating value.

    NOTE(review): only 5 colors are listed but up to 6 distinct rounded
    ratings (0-5) can occur; matplotlib cycles colors when the list is
    shorter than the number of wedges — confirm this is acceptable.
    """
    custom_colors = ["tan", "grey", 'silver', "black", "yellow"]
    plt.figure(figsize=(5, 5))
    plt.pie(rating_counts.values, labels=rating_counts.index,
            colors=custom_colors)
    # White circle over the center turns the pie into a donut.
    central_circle = plt.Circle((0, 0), 0.5, color='white')
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.rc('font', size=12)
    plt.title("Hotel Reviews Ratings", fontsize=20)
    plt.show()


if __name__ == "__main__":
    data = pd.read_csv("Hotel_Reviews.csv")
    print(data.head())

    # Quick look at the categorical columns present in the file.
    print(data["country"].unique())
    print(data["name"].unique())
    print(data["reviews.rating"].unique())

    # Fill missing ratings with 0 and clamp everything onto a 5-point scale.
    data['reviews.rating'] = normalize_to_five_scale(data['reviews.rating'])

    # Round to whole stars and chart the distribution.
    ratings = data['reviews.rating'].round(0).value_counts()
    plot_rating_donut(ratings)

# Sentiment analysis - analyzer, transformers, textblob, ...
# Add sentiment analysis columns + scrape hotel data (flattened notebook cells).
# Notebook shell magics kept as comments (run manually outside this script):
#   pip install -q transformers
#   pip install TextBlob
#   !pip install beautifulsoup4 requests pandas
import csv
import re

import requests
from bs4 import BeautifulSoup
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")
# NOTE(review): `data` must already be loaded by the earlier cell that reads
# Hotel_Reviews.csv — this cell depends on notebook execution order.
rt = data["reviews.text"]
sid = SentimentIntensityAnalyzer()

# --- TripAdvisor: scrape the Hawaii hotels listing page ---------------------
url = ('https://www.tripadvisor.in/Hotels-g28932-Hawaii-Hotels.html')
user_agent = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/90.0.4430.212 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}


def get_page_contents(url):
    """Fetch *url* with a browser-like User-Agent and return the parsed soup."""
    page = requests.get(url, headers=user_agent)
    return BeautifulSoup(page.text, 'html.parser')


soup = get_page_contents(url)

# Extract the listing fields (hotel name, bubble rating, review count, price).
hotels = [tag.text.strip() for tag in soup.find_all('div', {'class': 'listing_title'})]
ratings = [tag['alt'] for tag in soup.find_all('a', {'class': 'ui_bubble_rating'})]
reviews = [tag.text.strip() for tag in soup.find_all('a', {'class': 'review_count'})]
prices = [tag.text.replace('₹', '').strip() for tag in soup.find_all('div', {'class': 'price-wrap'})]

# Renamed from `dict`, which shadowed the builtin.
hotel_columns = {
    'Hotel Names': hotels,
    'Ratings': ratings,
    'Number of Reviews': reviews,
    'Prices': prices,
}
information = pd.DataFrame.from_dict(hotel_columns)
print(information.head(10))
information.to_csv('hotels.csv', index=False, header=True)

# --- Yelp: paginate through Fairmont San Francisco review pages -------------
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
page = "https://www.yelp.com/biz/fairmont-san-francisco-san-francisco?sort_by=rating_desc"
page_num = 0
session = requests.Session()
review_comments = []
while True:
    pageTree = session.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    for post in pageSoup.select('p[lang="en"]'):
        review_comments.append(post.get_text().replace('\n', '').strip())
    # BUG FIX: the original formatted the next-page URL *before* advancing
    # page_num, so the first "Next" request re-fetched start=0 and duplicated
    # the first page's reviews.  Advance the offset first.
    # (`string=` replaces the deprecated `text=` keyword in bs4.)
    if pageSoup.find("span", string=re.compile("Next")):
        page_num += 20
        page = ("https://www.yelp.com/biz/fairmont-san-francisco-san-francisco"
                "?start={}&sort_by=rating_desc".format(page_num))
    else:
        break

df = pd.DataFrame({"Review_Comments": review_comments})
print(df)
# index=False avoids a spurious "Unnamed: 0" column when the CSV is re-read.
df.to_csv('filename.csv', index=False)
df = pd.read_csv('filename.csv')
print(df.to_string())

# NOTE(review): the loop below re-scans the TripAdvisor `soup` from above —
# it appears to be a stray duplicate of the Metacritic cell that follows
# (which rebuilds `names` from its own soup); kept for fidelity.
names = []
for links in soup.find_all('div', class_='name'):
    names.append(links.get_text().strip())

# Extract the HTML and create a BeautifulSoup object.
# --- Metacritic: scrape user reviews for Super Mario 3D World + Bowser's Fury
url = 'https://www.metacritic.com/game/switch/super-mario-3d-world-+-bowsers-fury/user-reviews'
user_agent = {'User-agent': 'Mozilla/5.0'}


def get_page_contents(url):
    """Fetch *url* with a minimal User-Agent and return the parsed soup."""
    page = requests.get(url, headers=user_agent)
    return BeautifulSoup(page.text, 'html.parser')


soup = get_page_contents(url)

names = [tag.get_text().strip() for tag in soup.find_all('div', class_='name')]
dates = [tag.get_text() for tag in soup.find_all('div', class_='date')]
ratings = [tag.get_text() for tag in
           soup.find_all('div', class_='metascore_w user medium game positive indiv')]

# BUG FIX: the original collected all "blurb_expanded" spans first and all
# "blurb_collapsed" spans second, so reviews no longer lined up row-wise with
# the name/date/rating lists.  A single CSS selector matches both classes
# while preserving document order.
reviews = [tag.get_text() for tag in
           soup.select('span.blurb.blurb_expanded, span.blurb.blurb_collapsed')]

games_dict = {'Name': names, 'Date': dates, 'Rating': ratings, 'Review': reviews}

# The four lists can legitimately differ in length (e.g. not every review has
# a blurb), so report the lengths, then build the frame row-wise and
# transpose: short columns pad with NaN instead of raising.
print(len(names), len(dates), len(ratings), len(reviews))
game = pd.DataFrame.from_dict(games_dict, orient='index')
games = game.transpose()
print(games.head(4))
games.to_csv('reviews.csv', index=False, header=True)
reviews = pd.read_csv('reviews.csv', lineterminator='\n')
print(reviews)