#!pip install pandas
import pandas as pd
# Feature engineering for a per-country daily cases/deaths table.
#
# Reads 'cases.csv', derives per-row features (day-over-day change, lagged
# values and trailing averages of cases/deaths), and writes the result to
# 'cases_features.csv'.

DATE_COLUMNS = ["year", "month", "day"]
COUNTRY_COLUMN = "countriesAndTerritories"
# Columns for which lagged copies and trailing averages are generated.
LAGGED_COLUMNS = ("cases", "percentage_died", "deaths")


def build_features(df_raw, window_size=10, cases_per_death=200):
    """Return a date-sorted copy of *df_raw* with derived feature columns.

    The input must provide the columns ``year``, ``month``, ``day``,
    ``cases``, ``deaths``, ``popData2019`` and ``countriesAndTerritories``.
    *df_raw* itself is not modified.

    Added columns, in order:

    * ``ts`` - timestamp assembled from year/month/day.
    * ``increase_cases`` / ``increase_ratio`` - difference / quotient of
      ``cases`` against the previous row of the same country.
    * ``cases_estimated`` - ``deaths * cases_per_death``.
    * ``percentage_infected`` / ``percentage_died`` - ``cases_estimated`` /
      ``deaths`` as a percentage of ``popData2019``.
    * ``<col>_<i>_days_before`` for ``i`` in ``1..window_size`` and each
      column in ``LAGGED_COLUMNS``.
    * ``<col>_<window_size>_day_average`` - mean of those lagged values
      (NaN until ``window_size`` earlier rows exist for the country).

    All lags are taken per country.  Lags are row-based: "i days before"
    means "i rows earlier for that country", so gaps in the daily series
    are not compensated for.

    Args:
        df_raw: Input frame as loaded from the cases CSV.
        window_size: Number of lagged rows to expose and average over.
        cases_per_death: Multiplier applied to ``deaths`` to obtain
            ``cases_estimated``.
            NOTE(review): presumably encodes an assumed fatality rate -
            confirm the intended value.

    Raises:
        ValueError: If *window_size* is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # sort_values returns a new frame, so the caller's data stays untouched.
    df = df_raw.sort_values(DATE_COLUMNS, ascending=True)
    df["ts"] = pd.to_datetime(df[DATE_COLUMNS])

    # Shift within each country: the frame is ordered by date only, so a
    # plain shift() would pick up the neighbouring row of another country.
    previous_cases = df["cases"].groupby(df[COUNTRY_COLUMN], sort=False).shift(1)
    df["increase_cases"] = df["cases"] - previous_cases
    df["increase_ratio"] = df["cases"] / previous_cases

    df["cases_estimated"] = df["deaths"] * cases_per_death
    df["percentage_infected"] = 100 / df["popData2019"] * df["cases_estimated"]
    df["percentage_died"] = 100 / df["popData2019"] * df["deaths"]

    # Compute every lag once and reuse it for both the lag columns and the
    # trailing averages.
    lags = {}
    for column in LAGGED_COLUMNS:
        per_country = df[column].groupby(df[COUNTRY_COLUMN], sort=False)
        lags[column] = [
            per_country.shift(offset) for offset in range(1, window_size + 1)
        ]
        for offset, lagged in enumerate(lags[column], start=1):
            df[f"{column}_{offset}_days_before"] = lagged

    # Averages are appended after all lag columns to keep the column order
    # of the output file stable.
    for column in LAGGED_COLUMNS:
        df[f"{column}_{window_size}_day_average"] = (
            sum(lags[column]) / window_size
        )

    return df


def find_bad_countries(df, threshold=0.001):
    """Return countries whose summed ``percentage_died`` exceeds *threshold*.

    The result is a list of ``countriesAndTerritories`` values in the sorted
    order produced by ``groupby``.
    """
    # Select the column before aggregating so unrelated columns (strings,
    # the datetime ``ts``) are never summed.
    total_died = df.groupby(COUNTRY_COLUMN)["percentage_died"].sum()
    return list(total_died[total_died > threshold].index)


if __name__ == "__main__":
    window_size = 10
    df_raw = pd.read_csv('cases.csv')

    # Optional exploratory filters from the original notebook (disabled):
    # df_raw = df_raw[df_raw['countriesAndTerritories'] == 'Italy']
    df = build_features(df_raw, window_size)
    # for i in range(1, window_size + 1):
    #     df = df[df['deaths_' + str(i) + '_days_before'] > 0]
    # df = df[df['cases'] > 100]
    # df = df[df['ts'] > '2020-04-01']

    bad_countries = find_bad_countries(df)
    # df = df[df['countriesAndTerritories'].isin(bad_countries)]
    print(bad_countries)

    # Notes kept from the original notebook (meaning of the figures is not
    # recorded there):
    # Germany 1607
    # Italy 15253
    # Netherlands 1538
    # Spain 11570
    # United_Kingdom 4972
    # United_States_of_America 10973
    #
    # List that followed in the original file (appears to be the pasted
    # value of bad_countries from an earlier run):
    # ['Andorra', 'Antigua_and_Barbuda', 'Aruba', 'Austria', 'Bahamas',
    #  'Barbados', 'Belgium', 'Bermuda', 'Bosnia_and_Herzegovina', 'Brazil',
    #  'British_Virgin_Islands', 'Canada',
    #  'Cases_on_an_international_conveyance_Japan', 'Cayman_Islands',
    #  'Croatia', 'Cyprus', 'Czechia', 'Denmark', 'Dominican_Republic',
    #  'Ecuador', 'Estonia', 'Finland', 'France', 'Germany', 'Gibraltar',
    #  'Greece', 'Guam', 'Guernsey', 'Hungary', 'Iceland', 'Iran', 'Ireland',
    #  'Isle_of_Man', 'Israel', 'Italy', 'Jersey', 'Kosovo', 'Liechtenstein',
    #  'Lithuania', 'Luxembourg', 'Moldova', 'Monaco', 'Netherlands',
    #  'North_Macedonia', 'Northern_Mariana_Islands', 'Norway', 'Panama',
    #  'Peru', 'Poland', 'Portugal', 'Puerto_Rico', 'Romania', 'San_Marino',
    #  'Serbia', 'Sint_Maarten', 'Slovenia', 'Spain', 'Sweden',
    #  'Switzerland', 'Turkey', 'Turks_and_Caicos_islands', 'United_Kingdom',
    #  'United_States_Virgin_Islands', 'United_States_of_America']

    df.to_csv('cases_features.csv')