!pip install -q yahoo_fin pandas_datareader gensim textblob import nltk nltk.download('stopwords') nltk.download('punkt') import requests import pandas as pd from yahoo_fin import stock_info as info from yahoo_fin import news from pandas_datareader import DataReader import numpy as np import warnings warnings.filterwarnings('ignore') from gensim import corpora, models from nltk.corpus import stopwords from nltk.tokenize import word_tokenize import string # Get the list of tickers that comprise the Dow Jones Industrial Average tickers = info.tickers_dow() tickers # Initialize an empty DataFrame to store the summaries dow_news_df = pd.DataFrame(columns=['Ticker', 'Summaries']) # Iterate through the list of Dow tickers and fetch news summaries for ticker in tickers: ticker_news = news.get_yf_rss(ticker) summaries = [article['summary'] for article in ticker_news] dow_news_df = dow_news_df.append({'Ticker': ticker, 'Summaries': summaries}, ignore_index=True) dow_news_df.head() dow_news_df from textblob import TextBlob # Function to calculate sentiment polarity def calculate_sentiment(text): return TextBlob(text).sentiment.polarity # Initialize an empty DataFrame to store the sentiment scores dow_sentiment_df = pd.DataFrame(columns=['Ticker', 'Average Sentiment']) # Iterate through the DataFrame and calculate the average sentiment for each ticker for index, row in dow_news_df.iterrows(): ticker = row['Ticker'] summaries = row['Summaries'] if summaries: avg_sentiment = np.mean([calculate_sentiment(summary) for summary in summaries]) dow_sentiment_df = dow_sentiment_df.append({'Ticker': ticker, 'Average Sentiment': avg_sentiment}, ignore_index=True) dow_sentiment_df.head() dow_sentiment_df # Initialize an empty DataFrame to store the top 20 summaries for each ticker dow_top20_summaries_df = pd.DataFrame(columns=['Ticker', 'Summary']) # Iterate through the list of Dow tickers and fetch the top 20 news summaries for ticker in tickers: ticker_news = news.get_yf_rss(ticker)[:20] for article in ticker_news: summary = article['summary'] dow_top20_summaries_df = dow_top20_summaries_df.append({'Ticker': ticker, 'Summary': summary}, ignore_index=True) dow_top20_summaries_df.head(40) dow_top20_summaries_df # Function to calculate sentiment polarity def calculate_sentiment(text): return TextBlob(text).sentiment.polarity # Initialize an empty DataFrame to store the sentiment scores for the top 20 summaries dow_top20_sentiment_df = pd.DataFrame(columns=['Ticker', 'Summary', 'Sentiment']) # Iterate through the DataFrame and calculate the sentiment for each summary for index, row in dow_top20_summaries_df.iterrows(): ticker = row['Ticker'] summary = row['Summary'] sentiment = calculate_sentiment(summary) dow_top20_sentiment_df = dow_top20_sentiment_df.append({'Ticker': ticker, 'Summary': summary, 'Sentiment': sentiment}, ignore_index=True) dow_top20_sentiment_df.head(40) # Function to clean and tokenize text def clean_tokenize(text): stop_words = set(stopwords.words('english')) tokens = word_tokenize(text.lower()) tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation] return tokens # Tokenize the summaries tokenized_summaries = dow_top20_summaries_df['Summary'].apply(clean_tokenize) # Create a dictionary and corpus from the tokenized summaries dictionary = corpora.Dictionary(tokenized_summaries) corpus = [dictionary.doc2bow(text) for text in tokenized_summaries] # Apply LDA model lda_model = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15) topics = lda_model.print_topics(num_words=4) topics # Re-run the LDA topic modeling code after downloading the required NLTK resources from gensim import corpora, models from nltk.corpus import stopwords from nltk.tokenize import word_tokenize import string # Function to clean and tokenize text def clean_tokenize(text): stop_words = set(stopwords.words('english')) tokens = word_tokenize(text.lower()) tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation] return tokens # Tokenize the summaries tokenized_summaries = dow_top20_summaries_df['Summary'].apply(clean_tokenize) # Create a dictionary and corpus from the tokenized summaries dictionary = corpora.Dictionary(tokenized_summaries) corpus = [dictionary.doc2bow(text) for text in tokenized_summaries] # Apply LDA model lda_model = models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15) topics = lda_model.print_topics(num_words=4) topics # Function to assign topics to summaries based on LDA model def assign_topic_to_summary(summary): bow = dictionary.doc2bow(clean_tokenize(summary)) topic_scores = lda_model[bow] dominant_topic = max(topic_scores, key=lambda x: x[1])[0] return dominant_topic # Assign topics to each summary dow_top20_summaries_df['Topic'] = dow_top20_summaries_df['Summary'].apply(assign_topic_to_summary) # Perform sentiment analysis on each summary dow_top20_summaries_df['Sentiment'] = dow_top20_summaries_df['Summary'].apply(calculate_sentiment) # Group by topic and calculate average sentiment topic_sentiment_df = dow_top20_summaries_df.groupby('Topic')['Sentiment'].mean().reset_index() topic_sentiment_df # Function to calculate weighted sentiment based on topic sentiment def calculate_weighted_sentiment(row): topic = row['Topic'] sentiment = row['Sentiment'] topic_weight = topic_sentiment_df[topic_sentiment_df['Topic'] == topic]['Sentiment'].values[0] return sentiment * topic_weight # Calculate weighted sentiment for each summary dow_top20_summaries_df['Weighted_Sentiment'] = dow_top20_summaries_df.apply(calculate_weighted_sentiment, axis=1) # Calculate new average sentiment for each company based on weighted sentiment new_dow_sentiment_df = dow_top20_summaries_df.groupby('Ticker')['Weighted_Sentiment'].mean().reset_index() # Merge with original dow_sentiment_df to compare comparison_df = pd.merge(dow_sentiment_df, new_dow_sentiment_df, on='Ticker', how='inner') comparison_df.columns = ['Ticker', 'Original_Sentiment', 'New_Weighted_Sentiment'] comparison_df