!pip install -q yahoo_fin pandas_datareader gensim textblob

import nltk
nltk.download('stopwords')
nltk.download('punkt')

import requests
import pandas as pd
from yahoo_fin import stock_info as info
from yahoo_fin import news
from pandas_datareader import DataReader
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


# Get the list of tickers that comprise the Dow Jones Industrial Average
# (returned by yahoo_fin's stock_info.tickers_dow()).
tickers = info.tickers_dow()
# Bare expression: presumably left over from a notebook cell, where it displays
# the list; it has no effect when the file runs as a plain script.
tickers

# Fetch the Yahoo Finance RSS news summaries for every Dow ticker.
# Rows are collected in a plain list and the DataFrame is built once at the
# end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it inside a loop re-copied the whole frame on every iteration.
news_rows = []
for ticker in tickers:
    ticker_news = news.get_yf_rss(ticker)
    summaries = [article['summary'] for article in ticker_news]
    # One row per ticker; 'Summaries' holds the list of that ticker's summaries
    news_rows.append({'Ticker': ticker, 'Summaries': summaries})
# Passing columns= keeps both columns present even if no rows were collected
dow_news_df = pd.DataFrame(news_rows, columns=['Ticker', 'Summaries'])
dow_news_df.head()

dow_news_df

from textblob import TextBlob
def calculate_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
# Compute the mean sentiment polarity of each ticker's news summaries.
# Tickers whose summary list is empty are skipped, so they do not appear in
# the result at all.
# The rows are gathered first and the DataFrame is built once, because
# DataFrame.append (deprecated in pandas 1.4, removed in 2.0) is no longer
# available and was quadratic when called per row.
sentiment_rows = [
    {
        'Ticker': ticker,
        'Average Sentiment': np.mean([calculate_sentiment(summary) for summary in summaries]),
    }
    for ticker, summaries in zip(dow_news_df['Ticker'], dow_news_df['Summaries'])
    if summaries
]
# Passing columns= keeps both columns present even if every ticker was skipped
dow_sentiment_df = pd.DataFrame(sentiment_rows, columns=['Ticker', 'Average Sentiment'])
dow_sentiment_df.head()

dow_sentiment_df


# Build a long-format table with one row per (ticker, summary) pair, keeping
# at most the first 20 feed entries returned for each ticker.
# Rows are accumulated in a list and converted once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending inside the nested
# loop re-copied the growing frame for every article.
top20_rows = []
for ticker in tickers:
    ticker_news = news.get_yf_rss(ticker)[:20]
    top20_rows.extend(
        {'Ticker': ticker, 'Summary': article['summary']} for article in ticker_news
    )
# Passing columns= keeps both columns present even if no articles were returned
dow_top20_summaries_df = pd.DataFrame(top20_rows, columns=['Ticker', 'Summary'])
dow_top20_summaries_df.head(40)

dow_top20_summaries_df

# NOTE: this redefines the identical calculate_sentiment declared earlier in this file.
def calculate_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    analysed_text = TextBlob(text)
    return analysed_text.sentiment.polarity
# Score each of the per-ticker summaries individually.
# The result is derived directly from the 'Ticker' and 'Summary' columns and
# the sentiment is added as one new column, instead of appending a row per
# iteration: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# and the iterrows/append loop re-copied the frame for every summary.
# reset_index(drop=True) yields a fresh frame with a 0..n-1 index, matching the
# ignore_index=True behaviour of the previous loop.
dow_top20_sentiment_df = dow_top20_summaries_df[['Ticker', 'Summary']].reset_index(drop=True)
dow_top20_sentiment_df['Sentiment'] = dow_top20_sentiment_df['Summary'].apply(calculate_sentiment)
dow_top20_sentiment_df.head(40)

# Helpers and function to clean and tokenize text
from functools import lru_cache

# Individual punctuation characters, used to recognise punctuation-only tokens
_PUNCTUATION_CHARS = frozenset(string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words('english'))


def clean_tokenize(text):
    """Lower-case and tokenize ``text``, dropping stop words and punctuation.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the lower-cased ``word_tokenize`` tokens of ``text`` with
        English stop words and punctuation-only tokens removed.
    """
    # The stop-word list is cached so it is not reloaded for every summary
    stop_words = _english_stop_words()
    # A token is treated as punctuation when every one of its characters is a
    # punctuation character. The previous `word not in string.punctuation`
    # check was a substring test against one long string, so multi-character
    # tokens such as "--" or "..." were kept as if they were words.
    return [
        word
        for word in word_tokenize(text.lower())
        if word not in stop_words and not _PUNCTUATION_CHARS.issuperset(word)
    ]

# Tokenize the summaries (one token list per summary)
tokenized_summaries = dow_top20_summaries_df['Summary'].apply(clean_tokenize)

# Create a dictionary (token -> id mapping) and a bag-of-words corpus from the
# tokenized summaries
dictionary = corpora.Dictionary(tokenized_summaries)
corpus = [dictionary.doc2bow(text) for text in tokenized_summaries]

# Apply LDA model.
# random_state pins the model's random initialisation. Without it every run
# produced different topics and topic ids, so the topic assignments and any
# sentiment figures derived from them could not be reproduced.
lda_model = models.ldamodel.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# Keep the 4 highest-weighted words of each topic for inspection
topics = lda_model.print_topics(num_words=4)

topics

# Re-run the LDA topic modeling code after downloading the required NLTK resources
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Helpers and function to clean and tokenize text
from functools import lru_cache

# Individual punctuation characters, used to recognise punctuation-only tokens
_PUNCTUATION_CHARS = frozenset(string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words('english'))


def clean_tokenize(text):
    """Lower-case and tokenize ``text``, dropping stop words and punctuation.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the lower-cased ``word_tokenize`` tokens of ``text`` with
        English stop words and punctuation-only tokens removed.
    """
    # The stop-word list is cached so it is not reloaded for every summary
    stop_words = _english_stop_words()
    # A token is treated as punctuation when every one of its characters is a
    # punctuation character. The previous `word not in string.punctuation`
    # check was a substring test against one long string, so multi-character
    # tokens such as "--" or "..." were kept as if they were words.
    return [
        word
        for word in word_tokenize(text.lower())
        if word not in stop_words and not _PUNCTUATION_CHARS.issuperset(word)
    ]

# Tokenize the summaries (one token list per summary)
tokenized_summaries = dow_top20_summaries_df['Summary'].apply(clean_tokenize)

# Create a dictionary (token -> id mapping) and a bag-of-words corpus from the
# tokenized summaries
dictionary = corpora.Dictionary(tokenized_summaries)
corpus = [dictionary.doc2bow(text) for text in tokenized_summaries]

# Apply LDA model.
# random_state pins the model's random initialisation. Without it every run
# produced different topics and topic ids, so the topic assignments and the
# per-topic sentiment computed further down could not be reproduced.
lda_model = models.ldamodel.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# Keep the 4 highest-weighted words of each topic for inspection
topics = lda_model.print_topics(num_words=4)

topics

def assign_topic_to_summary(summary):
    """Return the id of the LDA topic that scores highest for ``summary``.

    The summary is tokenized with ``clean_tokenize``, converted to a
    bag-of-words with the module-level ``dictionary`` and scored by the
    module-level ``lda_model``.
    """
    tokens = clean_tokenize(summary)
    topic_distribution = lda_model[dictionary.doc2bow(tokens)]
    # Entries are indexed as (topic id, score); pick the one with the top score
    best_entry = max(topic_distribution, key=lambda entry: entry[1])
    return best_entry[0]

# Label every summary with its dominant LDA topic
dow_top20_summaries_df['Topic'] = dow_top20_summaries_df['Summary'].map(assign_topic_to_summary)

# Score every summary's sentiment polarity
dow_top20_summaries_df['Sentiment'] = dow_top20_summaries_df['Summary'].map(calculate_sentiment)

# Mean sentiment per topic, as a two-column frame: 'Topic', 'Sentiment'
topic_sentiment_df = dow_top20_summaries_df.groupby('Topic', as_index=False)['Sentiment'].mean()

topic_sentiment_df

def calculate_weighted_sentiment(row):
    """Return the row's sentiment multiplied by its topic's mean sentiment.

    ``row`` must provide 'Topic' and 'Sentiment'; the topic's mean is looked
    up in the module-level ``topic_sentiment_df``.
    """
    row_topic = row['Topic']
    row_sentiment = row['Sentiment']
    # Select the 'Sentiment' value of the first row matching this topic
    matches_topic = topic_sentiment_df['Topic'] == row_topic
    topic_mean = topic_sentiment_df.loc[matches_topic, 'Sentiment'].values[0]
    return row_sentiment * topic_mean

# Weight each summary's sentiment by the mean sentiment of its topic
dow_top20_summaries_df['Weighted_Sentiment'] = dow_top20_summaries_df.apply(
    calculate_weighted_sentiment,
    axis='columns',
)

# Per-ticker mean of the weighted sentiment
new_dow_sentiment_df = dow_top20_summaries_df.groupby('Ticker', as_index=False)['Weighted_Sentiment'].mean()

# Put the original and the weighted averages side by side, keeping only
# tickers present in both tables
comparison_df = dow_sentiment_df.merge(new_dow_sentiment_df, on='Ticker', how='inner')
comparison_df.columns = ['Ticker', 'Original_Sentiment', 'New_Weighted_Sentiment']

comparison_df