import pylab as plt
import pandas as pd
import seaborn as sns
import numpy as np
import json
# Use a font with CJK glyphs so the Chinese axis labels render correctly.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Draw the minus sign correctly on axes. Original author's note: this setting
# does not work when the font is ['SimHei'].
plt.rcParams['axes.unicode_minus'] = False
#j = json.load(open('../data/zhiwei_line.json'))
# Read the records through a context manager so the file handle is closed
# deterministically; JSON text is UTF-8, so do not rely on the locale default.
with open('./data/zhiwei_line0417.json', encoding='utf-8') as fh:
    j = json.load(fh)
df = pd.DataFrame(j)
df.tail()
time | voice | heat | case | allCase | |
---|---|---|---|---|---|
100 | 2020-04-08 | 225533 | 63 | 81865 | |
101 | 2020-04-09 | 197162 | 42 | 81907 | |
102 | 2020-04-10 | 212046 | 31 | 81953 | |
103 | 2020-04-11 | 164010 | 31 | 82052 | |
104 | 2020-04-12 | 157726 | 31 | 82160 |
# Print the column dtypes and non-null counts of the loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 105 entries, 0 to 104 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 time 105 non-null object 1 voice 105 non-null object 2 heat 105 non-null object 3 case 105 non-null object 4 allCase 105 non-null object dtypes: object(5) memory usage: 4.2+ KB
# Peek at the first three entries of the 'time' column.
df['time'][:3]
0 2019-12-30 1 2019-12-31 2 2020-01-01 Name: time, dtype: object
# Cast the two analysed columns to numeric dtypes in one vectorized step each
# instead of looping over the values in Python. Target dtypes match the
# original per-element conversions (float64 for heat, int32 for case).
df['heat'] = df['heat'].astype(np.float64)
df['case'] = df['case'].astype(np.int32)
df['heat']
0 2.00000 1 62978.12912 2 19890.45538 3 13725.50326 4 16509.66362 ... 100 225533.00000 101 197162.00000 102 212046.00000 103 164010.00000 104 157726.00000 Name: heat, Length: 105, dtype: float64
# Scatter of daily new cases (log y-axis) against opinion heat, restricted to
# heat values between 1e5 and 8e5.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
ax.plot(df['heat'], df['case'], 'bo')
ax.set_yscale('log')
#ax.set_xscale('log')
ax.set_xlabel('舆论热度', fontsize=16)
ax.set_ylabel('新增确诊', fontsize=16)
ax.set_xlim(100000, 800000)
plt.show()
# Histogram of 'heat' on log-log axes, using 20 bin edges spaced evenly in
# log10 between the smallest and largest observed value.
MIN = df['heat'].min()
MAX = df['heat'].max()
bins = np.logspace(np.log10(MIN), np.log10(MAX), 20)
plt.hist(df['heat'], bins=bins)
plt.xscale('log')
plt.yscale('log')
plt.xlabel('舆论热度', fontsize=20)
plt.ylabel('频数', fontsize=20)
plt.show()
# Histogram of 'case' on log-log axes with 20 log-spaced bin edges.
# log10(0) is undefined, so the lowest edge is clamped to 1. Clamping (rather
# than always adding 1) keeps the smallest observations inside the first bin
# when the minimum is already >= 1; for a minimum of 0 the edge is still 1.
MIN = max(df['case'].min(), 1)
MAX = df['case'].max()
bins = np.logspace(np.log10(MIN), np.log10(MAX), 20)
plt.hist(df['case'], bins=bins)
plt.xlabel('新增确诊', fontsize=20)
plt.ylabel('频数', fontsize=20)
plt.xscale('log')
plt.yscale('log')
plt.show()
# Same two histograms, but with 50 equal-width bins shown on log-log axes:
# first 'heat', then 'case', each in its own figure.
for column in ('heat', 'case'):
    plt.hist(df[column], bins=50)
    plt.yscale('log')
    plt.xscale('log')
    plt.show()
# Dual-axis time series: opinion heat on the left y-axis (red squares) and
# daily new confirmed cases on the right y-axis (green circles).
# The style sheet is selected BEFORE the figure is created so that its
# figure-level settings apply to this figure as well as to the axes.
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(30, 10), dpi=200)
#plt.tick_params(labelsize = 20)
ax1 = fig.add_subplot(111)
ax1.plot(df['time'], df['heat'], 'r-s', label='舆论热度')
ax1.set_ylabel('舆论热度', fontsize=26)
ax1.tick_params(axis='x', rotation=60)  # date labels are long; tilt them
ax1.legend(loc='upper left', fontsize=26)
#ax1.set_yscale('log')
# Second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
ax2.plot(df['time'], df['case'], 'g-o', label='新增确诊')
ax2.set_ylabel('新增确诊', fontsize=26)
ax2.legend(loc='upper right', fontsize=26)
#ax2.set_yscale('log')
plt.show()
# Both series on one axis; heat is divided by 100 so the two curves are
# visible at the same scale.
plt.figure(figsize=(12, 6), dpi=200)
plt.style.use('fivethirtyeight')
dates = df['time']
scaled_heat = df['heat'].astype(float) / 100
daily_cases = df['case'].astype(int)
plt.plot(dates, scaled_heat, 'r-s', label='舆论热度/100')
plt.plot(dates, daily_cases, 'g-o', label='新增确诊')
plt.xticks(rotation=60)
plt.ylabel('数量', fontsize=20)
plt.legend()
plt.show()
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
import numpy as np
# Show the docstring of DataFrame.pct_change, which is used further down to
# turn the raw series into period-over-period changes.
help(df.pct_change)
Percentage change between the current and a prior element.
Computes the percentage change from the immediately previous row by
default. This is useful in comparing the percentage of change in a time
series of elements.
H0: the time series in the second column, x2, does NOT Granger cause the time series in the first column, x1.
Granger causality means that past values of x2 have a statistically significant effect on the current value of x1, taking past values of x1 into account as regressors. We reject the null hypothesis that x2 does not Granger cause x1 if the p-values are below the chosen significance level of the test.
# Show the docstring of grangercausalitytests. The original `name?` form is
# IPython-only syntax and is a SyntaxError in plain Python.
help(grangercausalitytests)
# Input for grangercausalitytests: it tests whether the series in the SECOND
# column Granger-causes the series in the FIRST column, and it does not
# support missing values. With columns ['case', 'heat'] the question is
# therefore "does heat Granger-cause case?".
columns = ['case', 'heat']
# Rows from position 21 onward, converted to period-over-period changes; the
# first row of the result is NaN by construction and is dropped.
data = df.iloc[21:][columns].pct_change().dropna()
data.head()
case | heat | |
---|---|---|
22 | 0.935065 | 1.442401 |
23 | -0.120805 | 0.299869 |
24 | 0.977099 | 0.310704 |
25 | 0.714286 | -0.081957 |
26 | 0.549550 | -0.044222 |
# Plot the two change series, then run the Granger tests for lags 1..4.
data.plot()
gc_res = grangercausalitytests(data, maxlag=4)
Granger Causality number of lags (no zero) 1 ssr based F test: F=0.2522 , p=0.6169 , df_denom=79, df_num=1 ssr based chi2 test: chi2=0.2618 , p=0.6089 , df=1 likelihood ratio test: chi2=0.2613 , p=0.6092 , df=1 parameter F test: F=0.2522 , p=0.6169 , df_denom=79, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=12.3885 , p=0.0000 , df_denom=76, df_num=2 ssr based chi2 test: chi2=26.4070 , p=0.0000 , df=2 likelihood ratio test: chi2=22.8563 , p=0.0000 , df=2 parameter F test: F=12.3885 , p=0.0000 , df_denom=76, df_num=2 Granger Causality number of lags (no zero) 3 ssr based F test: F=8.8247 , p=0.0000 , df_denom=73, df_num=3 ssr based chi2 test: chi2=29.0127 , p=0.0000 , df=3 likelihood ratio test: chi2=24.7550 , p=0.0000 , df=3 parameter F test: F=8.8247 , p=0.0000 , df_denom=73, df_num=3 Granger Causality number of lags (no zero) 4 ssr based F test: F=6.1783 , p=0.0003 , df_denom=70, df_num=4 ssr based chi2 test: chi2=27.8906 , p=0.0000 , df=4 likelihood ratio test: chi2=23.8863 , p=0.0001 , df=4 parameter F test: F=6.1783 , p=0.0003 , df_denom=70, df_num=4
# Reverse direction: with 'heat' first and 'case' second, the test asks
# whether case Granger-causes heat (lags 1..4).
reversed_columns = ['heat', 'case']
data1 = df.iloc[21:][reversed_columns].pct_change().dropna()
gc_res1 = grangercausalitytests(data1, maxlag=4)
Granger Causality number of lags (no zero) 1 ssr based F test: F=0.0044 , p=0.9473 , df_denom=79, df_num=1 ssr based chi2 test: chi2=0.0046 , p=0.9461 , df=1 likelihood ratio test: chi2=0.0046 , p=0.9461 , df=1 parameter F test: F=0.0044 , p=0.9473 , df_denom=79, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=0.0877 , p=0.9162 , df_denom=76, df_num=2 ssr based chi2 test: chi2=0.1869 , p=0.9108 , df=2 likelihood ratio test: chi2=0.1866 , p=0.9109 , df=2 parameter F test: F=0.0877 , p=0.9162 , df_denom=76, df_num=2 Granger Causality number of lags (no zero) 3 ssr based F test: F=0.7988 , p=0.4985 , df_denom=73, df_num=3 ssr based chi2 test: chi2=2.6262 , p=0.4529 , df=3 likelihood ratio test: chi2=2.5840 , p=0.4603 , df=3 parameter F test: F=0.7988 , p=0.4985 , df_denom=73, df_num=3 Granger Causality number of lags (no zero) 4 ssr based F test: F=0.8460 , p=0.5008 , df_denom=70, df_num=4 ssr based chi2 test: chi2=3.8190 , p=0.4311 , df=4 likelihood ratio test: chi2=3.7296 , p=0.4438 , df=4 parameter F test: F=0.8460 , p=0.5008 , df_denom=70, df_num=4
# Blank out the 'case' value on 2020-02-12 and 2020-02-13 and replace it with
# the last preceding value (forward fill). Presumably this removes the
# reporting peak on those days (cf. the commented-out
# 'zhiwei_line_no_peak.xlsx' below) -- TODO confirm.
#
# Series.mask replaces the selected rows with NaN in a single assignment to
# the frame itself. The original chained form `df['case'][cond] = ...` writes
# through an intermediate object, triggers SettingWithCopyWarning, and is not
# guaranteed to modify `df`.
peak_days = ['2020-02-12', '2020-02-13']
df['case'] = df['case'].mask(df['time'].isin(peak_days))
# DataFrame.ffill() is the non-deprecated spelling of fillna(method='ffill').
df = df.ffill()
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """Entry point for launching an IPython kernel.
# df = pd.read_excel('zhiwei_line_no_peak.xlsx')
# Restore numeric dtypes with vectorized casts instead of per-element loops.
# 'case' is truncated back to integers (it may hold floats after NaN
# filling); int64 matches what the original list of Python ints produced.
df['heat'] = df['heat'].astype('float64')
df['case'] = df['case'].astype('int64')
df.tail()
time | voice | heat | case | allCase | |
---|---|---|---|---|---|
100 | 2020-04-08 | 225533.0 | 63 | 81865 | |
101 | 2020-04-09 | 197162.0 | 42 | 81907 | |
102 | 2020-04-10 | 212046.0 | 31 | 81953 | |
103 | 2020-04-11 | 164010.0 | 31 | 82052 | |
104 | 2020-04-12 | 157726.0 | 31 | 82160 |
# Dual-axis time series of the current frame: heat on the left axis (red
# squares), new confirmed cases on the right axis (green circles).
fig = plt.figure(figsize=(30, 10), dpi=200)
plt.style.use('fivethirtyeight')
ax1 = fig.add_subplot(1, 1, 1)
ax1.plot(df['time'], df['heat'], 'r-s', label='舆论热度')
ax1.set_ylabel('舆论热度', fontsize=26)
ax1.tick_params(axis='x', rotation=60)
ax1.legend(loc='upper left', fontsize=26)
# Right-hand y-axis sharing the x-axis with ax1.
ax2 = ax1.twinx()
ax2.plot(df['time'], df['case'], 'g-o', label='新增确诊')
ax2.set_ylabel('新增确诊', fontsize=26)
ax2.legend(loc='upper right', fontsize=26)
plt.show()
# Rebuild the change series from position 21 onward and test whether heat
# (second column) Granger-causes case (first column) for lags 1..4.
window = df.iloc[21:]
data = window[['case', 'heat']].pct_change().dropna()
data.plot()
gc_res = grangercausalitytests(data, maxlag=4)
Granger Causality number of lags (no zero) 1 ssr based F test: F=0.0104 , p=0.9189 , df_denom=79, df_num=1 ssr based chi2 test: chi2=0.0108 , p=0.9172 , df=1 likelihood ratio test: chi2=0.0108 , p=0.9172 , df=1 parameter F test: F=0.0104 , p=0.9189 , df_denom=79, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=3.7056 , p=0.0291 , df_denom=76, df_num=2 ssr based chi2 test: chi2=7.8988 , p=0.0193 , df=2 likelihood ratio test: chi2=7.5370 , p=0.0231 , df=2 parameter F test: F=3.7056 , p=0.0291 , df_denom=76, df_num=2 Granger Causality number of lags (no zero) 3 ssr based F test: F=1.6880 , p=0.1771 , df_denom=73, df_num=3 ssr based chi2 test: chi2=5.5496 , p=0.1357 , df=3 likelihood ratio test: chi2=5.3656 , p=0.1469 , df=3 parameter F test: F=1.6880 , p=0.1771 , df_denom=73, df_num=3 Granger Causality number of lags (no zero) 4 ssr based F test: F=1.1838 , p=0.3255 , df_denom=70, df_num=4 ssr based chi2 test: chi2=5.3439 , p=0.2538 , df=4 likelihood ratio test: chi2=5.1709 , p=0.2702 , df=4 parameter F test: F=1.1838 , p=0.3255 , df_denom=70, df_num=4
# Reverse direction: does case (second column) Granger-cause heat (first
# column)? Lags 1..4.
window = df.iloc[21:]
data = window[['heat', 'case']].pct_change().dropna()
gc_res = grangercausalitytests(data, maxlag=4)
Granger Causality number of lags (no zero) 1 ssr based F test: F=0.3389 , p=0.5621 , df_denom=79, df_num=1 ssr based chi2 test: chi2=0.3518 , p=0.5531 , df=1 likelihood ratio test: chi2=0.3511 , p=0.5535 , df=1 parameter F test: F=0.3389 , p=0.5621 , df_denom=79, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=0.4490 , p=0.6400 , df_denom=76, df_num=2 ssr based chi2 test: chi2=0.9571 , p=0.6197 , df=2 likelihood ratio test: chi2=0.9514 , p=0.6214 , df=2 parameter F test: F=0.4490 , p=0.6400 , df_denom=76, df_num=2 Granger Causality number of lags (no zero) 3 ssr based F test: F=0.2301 , p=0.8751 , df_denom=73, df_num=3 ssr based chi2 test: chi2=0.7565 , p=0.8598 , df=3 likelihood ratio test: chi2=0.7530 , p=0.8607 , df=3 parameter F test: F=0.2301 , p=0.8751 , df_denom=73, df_num=3 Granger Causality number of lags (no zero) 4 ssr based F test: F=0.3046 , p=0.8740 , df_denom=70, df_num=4 ssr based chi2 test: chi2=1.3749 , p=0.8485 , df=4 likelihood ratio test: chi2=1.3631 , p=0.8506 , df=4 parameter F test: F=0.3046 , p=0.8740 , df_denom=70, df_num=4
#df = pd.read_excel('zhiwei_line_no_peak.xlsx')
# Ensure numeric dtypes with vectorized casts (float64 / int64, matching the
# original per-element float()/int() conversions), then show rows from
# position 40 onward.
df['heat'] = df['heat'].astype('float64')
df['case'] = df['case'].astype('int64')
df[40:]
time | voice | heat | case | allCase | |
---|---|---|---|---|---|
40 | 2020-02-08 | 198955.9754 | 2656 | 37198 | |
41 | 2020-02-09 | 178865.4643 | 3062 | 40171 | |
42 | 2020-02-10 | 366537.5655 | 2478 | 42638 | |
43 | 2020-02-11 | 463839.3934 | 2015 | 44653 | |
44 | 2020-02-12 | 548711.0169 | 2015 | 59804 | |
... | ... | ... | ... | ... | ... |
100 | 2020-04-08 | 225533.0000 | 63 | 81865 | |
101 | 2020-04-09 | 197162.0000 | 42 | 81907 | |
102 | 2020-04-10 | 212046.0000 | 31 | 81953 | |
103 | 2020-04-11 | 164010.0000 | 31 | 82052 | |
104 | 2020-04-12 | 157726.0000 | 31 | 82160 |
65 rows × 5 columns
# Later sub-period only (rows from position 40 onward): does case (second
# column) Granger-cause heat (first column)? Lags 1..3.
late_window = df.iloc[40:]
data = late_window[['heat', 'case']].pct_change().dropna()
gc_res = grangercausalitytests(data, maxlag=3)
Granger Causality number of lags (no zero) 1 ssr based F test: F=0.7319 , p=0.3957 , df_denom=60, df_num=1 ssr based chi2 test: chi2=0.7685 , p=0.3807 , df=1 likelihood ratio test: chi2=0.7638 , p=0.3821 , df=1 parameter F test: F=0.7319 , p=0.3957 , df_denom=60, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=0.8160 , p=0.4473 , df_denom=57, df_num=2 ssr based chi2 test: chi2=1.7752 , p=0.4117 , df=2 likelihood ratio test: chi2=1.7502 , p=0.4168 , df=2 parameter F test: F=0.8160 , p=0.4473 , df_denom=57, df_num=2 Granger Causality number of lags (no zero) 3 ssr based F test: F=0.5435 , p=0.6546 , df_denom=54, df_num=3 ssr based chi2 test: chi2=1.8419 , p=0.6059 , df=3 likelihood ratio test: chi2=1.8146 , p=0.6118 , df=3 parameter F test: F=0.5435 , p=0.6546 , df_denom=54, df_num=3
# Later sub-period only (rows from position 40 onward): does heat (second
# column) Granger-cause case (first column)? Lags 1..3.
late_window = df.iloc[40:]
data = late_window[['case', 'heat']].pct_change().dropna()
gc_res = grangercausalitytests(data, maxlag=3)
Granger Causality number of lags (no zero) 1 ssr based F test: F=0.4303 , p=0.5143 , df_denom=60, df_num=1 ssr based chi2 test: chi2=0.4518 , p=0.5015 , df=1 likelihood ratio test: chi2=0.4502 , p=0.5022 , df=1 parameter F test: F=0.4303 , p=0.5143 , df_denom=60, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=1.7140 , p=0.1893 , df_denom=57, df_num=2 ssr based chi2 test: chi2=3.7286 , p=0.1550 , df=2 likelihood ratio test: chi2=3.6208 , p=0.1636 , df=2 parameter F test: F=1.7140 , p=0.1893 , df_denom=57, df_num=2 Granger Causality number of lags (no zero) 3 ssr based F test: F=1.0282 , p=0.3875 , df_denom=54, df_num=3 ssr based chi2 test: chi2=3.4843 , p=0.3228 , df=3 likelihood ratio test: chi2=3.3884 , p=0.3355 , df=3 parameter F test: F=1.0282 , p=0.3875 , df_denom=54, df_num=3
import numpy as np
# Small two-series example: eleven observations each of 'suicide' and
# 'spending'. NOTE: this rebinds `df`, replacing the frame used above.
suicide = [5427, 5688, 6198, 6462, 6635, 7336, 7248, 7491, 8161, 8578, 9000]
spending = [18.079, 18.594, 19.753, 20.734, 20.831, 23.029,
            23.597, 23.584, 25.525, 27.731, 29.449]
# Stack as rows: shape (2, 11); the integer row is promoted to float.
d = np.vstack((suicide, spending))
df = pd.DataFrame({'suicide': d[0], 'spending': d[1]})  # .pct_change().dropna()
# Period-over-period changes; the leading NaN row is dropped.
data = df.loc[:, ['suicide', 'spending']].pct_change().dropna()
# Does the second column of `data` Granger-cause the first? Lags 1..2.
gc_res = grangercausalitytests(data, maxlag=2)
Granger Causality number of lags (no zero) 1 ssr based F test: F=7.6049 , p=0.0330 , df_denom=6, df_num=1 ssr based chi2 test: chi2=11.4073 , p=0.0007 , df=1 likelihood ratio test: chi2=7.3680 , p=0.0066 , df=1 parameter F test: F=7.6049 , p=0.0330 , df_denom=6, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=0.5180 , p=0.6408 , df_denom=3, df_num=2 ssr based chi2 test: chi2=2.7627 , p=0.2512 , df=2 likelihood ratio test: chi2=2.3731 , p=0.3053 , df=2 parameter F test: F=0.5180 , p=0.6408 , df_denom=3, df_num=2
# Reverse direction: does suicide (second column) Granger-cause spending
# (first column)? Lags 1..2.
reversed_columns = ['spending', 'suicide']
data = df.loc[:, reversed_columns].pct_change().dropna()
gc_res = grangercausalitytests(data, maxlag=2)
Granger Causality number of lags (no zero) 1 ssr based F test: F=2.5229 , p=0.1633 , df_denom=6, df_num=1 ssr based chi2 test: chi2=3.7844 , p=0.0517 , df=1 likelihood ratio test: chi2=3.1590 , p=0.0755 , df=1 parameter F test: F=2.5229 , p=0.1633 , df_denom=6, df_num=1 Granger Causality number of lags (no zero) 2 ssr based F test: F=0.1465 , p=0.8695 , df_denom=3, df_num=2 ssr based chi2 test: chi2=0.7814 , p=0.6766 , df=2 likelihood ratio test: chi2=0.7455 , p=0.6888 , df=2 parameter F test: F=0.1465 , p=0.8695 , df_denom=3, df_num=2