import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Load the vaccination dataset. The columns used below are 'Entity' and
# 'total_vaccinations_per_hundred'.
data = pd.read_csv('/Users/alksnk/Downloads/covid-vaccination-doses-per-capita.csv')

# For each entity, keep the single row where the per-hundred figure peaks.
# Rows with a missing value are excluded before grouping: a group whose values
# are all NaN has no valid idxmax label (pandas returns NaN or raises,
# depending on the version), and handing that to .loc fails.
valid_rows = data.dropna(subset=['total_vaccinations_per_hundred'])
peak_index = valid_rows.groupby('Entity')['total_vaccinations_per_hundred'].idxmax()
max_rows = data.loc[peak_index]

# Rank the per-entity peaks from highest to lowest.
sorted_data = max_rows.sort_values('total_vaccinations_per_hundred', ascending=False)

# Keep the 11 highest-ranked entities and renumber the rows from 0 so the
# printed index reads as a ranking position rather than the original row label.
subset = sorted_data.head(11).reset_index(drop=True)

# Display the resulting table.
print(subset)
# Output of print(subset):
#        Entity Code         Day  total_vaccinations_per_hundred
# 0   Gibraltar  GIB  2022-12-16                          406.43
# 1        Cuba  CUB  2023-02-25                          389.33
# 2       Chile  CHL  2023-02-08                          319.78
# 3       Japan  JPN  2023-02-26                          308.08
# 4      Brunei  BRN  2023-02-10                          287.09
# 5     Tokelau  TKL  2022-12-05                          283.89
# 6       Qatar  QAT  2023-02-19                          282.24
# 7    Guernsey  GGY  2022-12-19                          281.90
# 8      Taiwan  TWN  2023-02-23                          280.39
# 9   Hong Kong  HKG  2023-02-19                          276.16
# 10   Cambodia  KHM  2023-02-17                          275.04
import matplotlib.pyplot as plt
import pandas as pd
# Reload the CSV so `data` is a fresh copy of the file. The chart below is
# drawn from `subset` only and does not read `data`.
data = pd.read_csv('/Users/alksnk/Downloads/covid-vaccination-doses-per-capita.csv')

# Bar chart: one bar per entity in `subset`, bar height is that entity's
# 'total_vaccinations_per_hundred' value.
plt.bar(subset['Entity'], subset['total_vaccinations_per_hundred'])

# Entity names are long relative to the bar width, so draw them vertically.
plt.xticks(rotation='vertical')

# Descriptive title and axis labels instead of template placeholders.
plt.title('Highest Total Vaccinations per Hundred People by Entity')
plt.xlabel('Entity')
plt.ylabel('Total vaccinations per hundred')

# Make room for the vertical tick labels so they are not clipped.
plt.tight_layout()

# Display the plot.
plt.show()
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
# Label-encode the 'Entity' column: every distinct name is replaced by an
# integer code (this overwrites the names in `data`).
# NOTE(review): these codes are arbitrary identifiers, not a quantity, so a
# straight-line fit against them has no natural interpretation. Confirm that
# this is the intended feature.
le = LabelEncoder()
data['Entity'] = le.fit_transform(data['Entity'])

# Extract feature and target as 2D column arrays, skipping rows whose target
# is missing because LinearRegression.fit rejects NaN values.
has_target = data['total_vaccinations_per_hundred'].notna().values
X = data['Entity'].values.reshape(-1, 1)[has_target]
y = data['total_vaccinations_per_hundred'].values.reshape(-1, 1)[has_target]

# Split into training and testing sets (80% / 20%).
# The rows are assigned at random rather than by position: a positional split
# makes the test set the tail of the file, which is not a representative
# sample when the file is ordered (for example, grouped by entity).
# The fixed seed keeps the split, and therefore the score, reproducible.
split_ratio = 0.8  # 80% training, 20% testing
split_index = int(len(X) * split_ratio)
rng = np.random.default_rng(0)
order = rng.permutation(len(X))
train_idx, test_idx = order[:split_index], order[split_index:]
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Fit ordinary least squares on the training rows.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = regressor.predict(X_test)

# Score the predictions with R-squared. Despite the variable name and the
# printed label, this is not a classification accuracy: 1.0 is a perfect fit,
# 0.0 matches always predicting the mean, and negative values are worse than
# that baseline.
accuracy = r2_score(y_test, y_pred)
print("Accuracy: ", accuracy)

# Plot the held-out points and the fitted line.
import matplotlib.pyplot as plt
plt.scatter(X_test, y_test, color='blue')
# Draw the line through x-sorted points so it is traced once, left to right.
plot_order = np.argsort(X_test.ravel())
plt.plot(X_test[plot_order], y_pred[plot_order], color='red', linewidth=2)
plt.title('Linear Regression')
plt.xlabel('Country')
plt.ylabel('Vaccine Doses per Capita')
plt.show()
# Recorded output: Accuracy: -0.01940350583714734