This notebook presents a quick-and-dirty analysis of the association between Internet use and religion in Europe, using data from the European Social Survey (http://www.europeansocialsurvey.org).
Copyright 2015 Allen Downey
MIT License: http://opensource.org/licenses/MIT
from __future__ import print_function, division
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
%matplotlib inline
The following function selects the columns I need.
def select_cols(df):
cols = ['cntry', 'tvtot', 'tvpol', 'rdtot', 'rdpol', 'nwsptot', 'nwsppol', 'netuse',
'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn', 'eisced', 'pspwght', 'pweight']
df = df[cols]
return df
Read data from Cycle 1.
TODO: investigate the difference between hinctnt and hinctnta; is there a recode that reconciles them?
df1 = pd.read_stata('ESS1e06_4.dta', convert_categoricals=False)
df1['hinctnta'] = df1.hinctnt
df1 = select_cols(df1)
df1.head()
cntry | tvtot | tvpol | rdtot | rdpol | nwsptot | nwsppol | netuse | rlgblg | rlgdgr | eduyrs | hinctnta | yrbrn | eisced | pspwght | pweight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT | 1 | 1 | 1 | 1 | 1 | 1 | 5 | 1 | 8 | 11 | 77 | 1949 | 0 | 0.940933 | 0.271488 |
1 | AT | 3 | 2 | 4 | 1 | 2 | 2 | 6 | 2 | 5 | 14 | 2 | 1953 | 0 | 0.470466 | 0.271488 |
2 | AT | 7 | 3 | 0 | 66 | 0 | 66 | 0 | 1 | 7 | 9 | 77 | 1940 | 0 | 1.392155 | 0.271488 |
3 | AT | 1 | 1 | 1 | 1 | 2 | 2 | 4 | 1 | 7 | 18 | 9 | 1959 | 0 | 1.382163 | 0.271488 |
4 | AT | 0 | 66 | 1 | 1 | 0 | 66 | 7 | 1 | 10 | 15 | 9 | 1962 | 0 | 1.437766 | 0.271488 |
Read data from Cycle 2.
df2 = pd.read_stata('ESS2e03_4.dta', convert_categoricals=False)
df2['hinctnta'] = df2.hinctnt
df2 = select_cols(df2)
df2.head()
cntry | tvtot | tvpol | rdtot | rdpol | nwsptot | nwsppol | netuse | rlgblg | rlgdgr | eduyrs | hinctnta | yrbrn | eisced | pspwght | pweight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT | 3 | 2 | 0 | 66 | 4 | 2 | 7 | 1 | 7 | 12 | 8 | 1971 | 0 | 0.682185 | 0.302006 |
1 | AT | 7 | 2 | 3 | 1 | 5 | 2 | 0 | 1 | 7 | 8 | 4 | 1925 | 0 | 0.565038 | 0.302006 |
2 | AT | 6 | 2 | 1 | 0 | 1 | 0 | 6 | 2 | 4 | 13 | 6 | 1977 | 0 | 0.341133 | 0.302006 |
3 | AT | 3 | 1 | 2 | 1 | 2 | 1 | 7 | 1 | 5 | 8 | 9 | 1989 | 0 | 0.804050 | 0.302006 |
4 | AT | 2 | 1 | 1 | 1 | 1 | 1 | 4 | 2 | 1 | 11 | 88 | 1988 | 0 | 1.125251 | 0.302006 |
Read data from Cycle 3.
df3 = pd.read_stata('ESS3e03_5.dta', convert_categoricals=False)
df3['hinctnta'] = df3.hinctnt
df3 = select_cols(df3)
df3.head()
cntry | tvtot | tvpol | rdtot | rdpol | nwsptot | nwsppol | netuse | rlgblg | rlgdgr | eduyrs | hinctnta | yrbrn | eisced | pspwght | pweight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT | 5 | 1 | 7 | 2 | 0 | 66 | 6 | 1 | 3 | 11 | 7 | 1980 | 0 | 0.949578 | 0.289116 |
1 | AT | 3 | 1 | 2 | 1 | 2 | 1 | 7 | 1 | 9 | 20 | 5 | 1974 | 0 | 1.412180 | 0.289116 |
2 | AT | 1 | 1 | 7 | 2 | 2 | 1 | 7 | 1 | 6 | 16 | 77 | 1954 | 0 | 0.723276 | 0.289116 |
3 | AT | 4 | 1 | 7 | 2 | 3 | 2 | 6 | 1 | 5 | 12 | 88 | 1967 | 0 | 0.625744 | 0.289116 |
4 | AT | 6 | 2 | 6 | 2 | 3 | 1 | 0 | 1 | 7 | 11 | 88 | 1971 | 0 | 0.417162 | 0.289116 |
Read data from Cycle 4.
df4 = pd.read_stata('ESS4e04_3.dta', convert_categoricals=False)
df4 = select_cols(df4)
df4.head()
cntry | tvtot | tvpol | rdtot | rdpol | nwsptot | nwsppol | netuse | rlgblg | rlgdgr | eduyrs | hinctnta | yrbrn | eisced | pspwght | pweight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BE | 7 | 1 | 0 | 66 | 0 | 66 | 0 | 2 | 1 | 18 | 4 | 1972 | 6 | 0.823223 | 0.503773 |
1 | BE | 7 | 2 | 7 | 3 | 0 | 66 | 0 | 2 | 0 | 15 | 7 | 1982 | 6 | 0.798610 | 0.503773 |
2 | BE | 2 | 2 | 3 | 3 | 3 | 3 | 7 | 1 | 6 | 18 | 10 | 1940 | 7 | 0.778020 | 0.503773 |
3 | BE | 7 | 2 | 2 | 2 | 2 | 2 | 0 | 1 | 6 | 15 | 7 | 1931 | 6 | 0.777735 | 0.503773 |
4 | BE | 0 | 66 | 3 | 0 | 1 | 1 | 7 | 2 | 0 | 13 | 7 | 1982 | 4 | 0.960960 | 0.503773 |
Read data from Cycle 5.
df5 = pd.read_stata('ESS5e03_2.dta', convert_categoricals=False)
df5 = select_cols(df5)
df5.head()
cntry | tvtot | tvpol | rdtot | rdpol | nwsptot | nwsppol | netuse | rlgblg | rlgdgr | eduyrs | hinctnta | yrbrn | eisced | pspwght | pweight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BE | 5 | 1 | 1 | 0 | 1 | 1 | 2 | 1 | 5 | 15 | 88 | 1988 | 4 | 0.792865 | 0.528619 |
1 | BE | 4 | 3 | 2 | 2 | 1 | 2 | 6 | 2 | 7 | 15 | 5 | 1967 | 6 | 0.871107 | 0.528619 |
2 | BE | 4 | 0 | 0 | 66 | 0 | 66 | 7 | 2 | 0 | 13 | 1 | 1991 | 3 | 0.799453 | 0.528619 |
3 | BE | 2 | 2 | 7 | 2 | 0 | 66 | 7 | 1 | 5 | 15 | 10 | 1987 | 6 | 0.816030 | 0.528619 |
4 | BE | 7 | 3 | 7 | 4 | 0 | 66 | 0 | 2 | 5 | 15 | 6 | 1952 | 6 | 0.764902 | 0.528619 |
Concatenate the cycles.
TODO: Have to resample each cycle before concatenating.
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
df.head()
cntry | tvtot | tvpol | rdtot | rdpol | nwsptot | nwsppol | netuse | rlgblg | rlgdgr | eduyrs | hinctnta | yrbrn | eisced | pspwght | pweight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AT | 1 | 1 | 1 | 1 | 1 | 1 | 5 | 1 | 8 | 11 | 77 | 1949 | 0 | 0.940933 | 0.271488 |
1 | AT | 3 | 2 | 4 | 1 | 2 | 2 | 6 | 2 | 5 | 14 | 2 | 1953 | 0 | 0.470466 | 0.271488 |
2 | AT | 7 | 3 | 0 | 66 | 0 | 66 | 0 | 1 | 7 | 9 | 77 | 1940 | 0 | 1.392155 | 0.271488 |
3 | AT | 1 | 1 | 1 | 1 | 2 | 2 | 4 | 1 | 7 | 18 | 9 | 1959 | 0 | 1.382163 | 0.271488 |
4 | AT | 0 | 66 | 1 | 1 | 0 | 66 | 7 | 1 | 10 | 15 | 9 | 1962 | 0 | 1.437766 | 0.271488 |
TV watching time on average weekday
df.tvtot.replace([77, 88, 99], np.nan, inplace=True)
df.tvtot.value_counts().sort_index()
0 8601 1 12822 2 33235 3 32962 4 40128 5 30598 6 29477 7 53602 Name: tvtot, dtype: int64
Radio listening, total time on average weekday.
df.rdtot.replace([77, 88, 99], np.nan, inplace=True)
df.rdtot.value_counts().sort_index()
0 58733 1 36957 2 38032 3 19077 4 16108 5 10644 6 9737 7 51596 Name: rdtot, dtype: int64
Newspaper reading, total time on average weekday.
df.nwsptot.replace([77, 88, 99], np.nan, inplace=True)
df.nwsptot.value_counts().sort_index()
0 70283 1 73250 2 65009 3 18684 4 7635 5 2890 6 1440 7 2004 Name: nwsptot, dtype: int64
TV watching: news, politics, current affairs
df.tvpol.replace([66, 77, 88, 99], np.nan, inplace=True)
df.tvpol.value_counts().sort_index()
0 17050 1 74427 2 87036 3 29333 4 12751 5 5137 6 2895 7 3822 Name: tvpol, dtype: int64
Radio listening: news, politics, current affairs
df.rdpol.replace([66, 77, 88, 99], np.nan, inplace=True)
df.rdpol.value_counts().sort_index()
0 32696 1 77746 2 40234 3 13091 4 6583 5 3196 6 2268 7 5418 Name: rdpol, dtype: int64
Newspaper reading: politics, current affairs
df.nwsppol.replace([66, 77, 88, 99], np.nan, inplace=True)
df.nwsppol.value_counts().sort_index()
0 24865 1 102663 2 32425 3 6342 4 2126 5 745 6 409 7 548 Name: nwsppol, dtype: int64
Personal use of Internet, email, www
df.netuse.replace([77, 88, 99], np.nan, inplace=True)
df.netuse.value_counts().sort_index()
0 76194 1 38069 2 4603 3 3817 4 8159 5 9623 6 27492 7 67203 Name: netuse, dtype: int64
Belong to a particular religion or denomination
df.rlgblg.replace([7, 8, 9], np.nan, inplace=True)
df.rlgblg.value_counts().sort_index()
1 152296 2 85882 Name: rlgblg, dtype: int64
How religious
df.rlgdgr.replace([77, 88, 99], np.nan, inplace=True)
df.rlgdgr.value_counts().sort_index()
0 30804 1 13613 2 16672 3 19127 4 15676 5 41533 6 23765 7 28024 8 24844 9 11268 10 14492 Name: rlgdgr, dtype: int64
Total household net income, all sources
TODO: It looks like one cycle measured HINCTNT on a 12 point scale. Might need to reconcile
df.hinctnta.replace([77, 88, 99], np.nan, inplace=True)
df.hinctnta.value_counts().sort_index()
1 11124 2 14462 3 16930 4 22156 5 21217 6 18625 7 17126 8 15892 9 20462 10 12154 11 1810 12 1177 Name: hinctnta, dtype: int64
Shift income to mean near 0.
df['hinctnta5'] = df.hinctnta - 5
df.hinctnta5.describe()
count 173135.000000 mean 0.683698 std 2.741579 min -4.000000 25% -1.000000 50% 1.000000 75% 3.000000 max 7.000000 Name: hinctnta5, dtype: float64
Year born
df.yrbrn.replace([7777, 8888, 9999], np.nan, inplace=True)
df.yrbrn.describe()
count 240948.000000 mean 1959.308378 std 18.662873 min 1885.000000 25% 1945.000000 50% 1960.000000 75% 1974.000000 max 1996.000000 Name: yrbrn, dtype: float64
Shifted to mean near 0
df['yrbrn60'] = df.yrbrn - 1960
df.yrbrn60.describe()
count 240948.000000 mean -0.691622 std 18.662873 min -75.000000 25% -15.000000 50% 0.000000 75% 14.000000 max 36.000000 Name: yrbrn60, dtype: float64
Number of years of education
df.eduyrs.replace([77, 88, 99], np.nan, inplace=True)
df.loc[df.eduyrs > 25, 'eduyrs'] = 25
df.eduyrs.value_counts().sort_index()
0.0 2178 1.0 516 2.0 811 3.0 1754 4.0 5548 5.0 3736 6.0 6954 7.0 6299 8.0 15688 9.0 16027 10.0 17956 11.0 25579 11.5 4 12.0 38669 13.0 21018 14.0 16129 14.5 1 15.0 15718 16.0 14073 17.0 10671 18.0 8208 18.5 1 19.0 3937 20.0 3810 21.0 1235 22.0 986 23.0 518 24.0 372 25.0 796 Name: eduyrs, dtype: int64
There are a bunch of really high values for eduyrs, need to investigate.
df.eduyrs.describe()
count 239192.000000 mean 11.945939 std 4.057455 min 0.000000 25% 10.000000 50% 12.000000 75% 15.000000 max 25.000000 Name: eduyrs, dtype: float64
Shift to mean near 0
df['eduyrs12'] = df.eduyrs - 12
df.eduyrs12.describe()
count 239192.000000 mean -0.054061 std 4.057455 min -12.000000 25% -2.000000 50% 0.000000 75% 3.000000 max 13.000000 Name: eduyrs12, dtype: float64
Country codes
df.cntry.value_counts().sort_index()
AT 6918 BE 8939 BG 6064 CH 9310 CY 3293 CZ 8790 DE 14487 DK 7684 EE 6960 ES 9729 FI 9991 FR 9096 GB 11117 GR 9759 HR 3133 HU 7806 IE 10472 IL 7283 IS 579 IT 1207 LT 1677 LU 3187 LV 1980 NL 9741 NO 8643 PL 8917 PT 10302 RO 2146 RU 7544 SE 9201 SI 7126 SK 6944 TR 4272 UA 7809 Name: cntry, dtype: int64
Make a binary dependent variable
df['hasrelig'] = (df.rlgblg==1).astype(int)
Run the model
def run_model(df, formula):
model = smf.logit(formula, data=df)
results = model.fit(disp=False)
return results
Here's the model with all control variables and all media variables:
formula = ('hasrelig ~ yrbrn60 + eduyrs12 + hinctnta5 +'
'tvtot + tvpol + rdtot + rdpol + nwsptot + nwsppol + netuse')
res = run_model(df, formula)
res.summary()
Dep. Variable: | hasrelig | No. Observations: | 93549 |
---|---|---|---|
Model: | Logit | Df Residuals: | 93538 |
Method: | MLE | Df Model: | 10 |
Date: | Mon, 16 Nov 2015 | Pseudo R-squ.: | 0.01844 |
Time: | 09:35:26 | Log-Likelihood: | -62073. |
converged: | True | LL-Null: | -63239. |
LLR p-value: | 0.000 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 0.7241 | 0.026 | 27.825 | 0.000 | 0.673 0.775 |
yrbrn60 | -0.0064 | 0.000 | -13.662 | 0.000 | -0.007 -0.005 |
eduyrs12 | -0.0133 | 0.002 | -6.450 | 0.000 | -0.017 -0.009 |
hinctnta5 | -0.0269 | 0.003 | -9.669 | 0.000 | -0.032 -0.021 |
tvtot | -0.0260 | 0.004 | -6.054 | 0.000 | -0.034 -0.018 |
tvpol | 0.0031 | 0.007 | 0.461 | 0.645 | -0.010 0.016 |
rdtot | -0.0030 | 0.003 | -0.869 | 0.385 | -0.010 0.004 |
rdpol | 0.0115 | 0.006 | 2.054 | 0.040 | 0.001 0.023 |
nwsptot | 0.0129 | 0.008 | 1.610 | 0.107 | -0.003 0.029 |
nwsppol | 0.0227 | 0.011 | 2.129 | 0.033 | 0.002 0.044 |
netuse | -0.0690 | 0.003 | -24.256 | 0.000 | -0.075 -0.063 |
Most of the media variables are not statistically significant. If we drop the politial media variables, we get a cleaner model:
formula = ('hasrelig ~ yrbrn60 + eduyrs12 + hinctnta5 +'
'tvtot + rdtot + nwsptot + netuse')
res = run_model(df, formula)
res.summary()
Dep. Variable: | hasrelig | No. Observations: | 166035 |
---|---|---|---|
Model: | Logit | Df Residuals: | 166027 |
Method: | MLE | Df Model: | 7 |
Date: | Mon, 16 Nov 2015 | Pseudo R-squ.: | 0.03241 |
Time: | 09:35:27 | Log-Likelihood: | -1.0707e+05 |
converged: | True | LL-Null: | -1.1066e+05 |
LLR p-value: | 0.000 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 0.9297 | 0.017 | 56.089 | 0.000 | 0.897 0.962 |
yrbrn60 | -0.0077 | 0.000 | -22.772 | 0.000 | -0.008 -0.007 |
eduyrs12 | -0.0339 | 0.002 | -22.558 | 0.000 | -0.037 -0.031 |
hinctnta5 | -0.0327 | 0.002 | -15.633 | 0.000 | -0.037 -0.029 |
tvtot | -0.0225 | 0.003 | -8.498 | 0.000 | -0.028 -0.017 |
rdtot | -0.0122 | 0.002 | -6.187 | 0.000 | -0.016 -0.008 |
nwsptot | -0.0220 | 0.004 | -5.248 | 0.000 | -0.030 -0.014 |
netuse | -0.0753 | 0.002 | -35.148 | 0.000 | -0.079 -0.071 |
And if we fill missing values for income, cleaner still.
def fill_var(df, var):
fill = df[var].dropna().sample(len(df), replace=True)
fill.index = df.index
df[var].fillna(fill, inplace=True)
fill_var(df, var='hinctnta5')
formula = ('hasrelig ~ yrbrn60 + eduyrs12 + hinctnta5 +'
'tvtot + rdtot + nwsptot + netuse')
res = run_model(df, formula)
res.summary()
Dep. Variable: | hasrelig | No. Observations: | 229307 |
---|---|---|---|
Model: | Logit | Df Residuals: | 229299 |
Method: | MLE | Df Model: | 7 |
Date: | Mon, 16 Nov 2015 | Pseudo R-squ.: | 0.03200 |
Time: | 09:35:27 | Log-Likelihood: | -1.4610e+05 |
converged: | True | LL-Null: | -1.5093e+05 |
LLR p-value: | 0.000 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 0.9819 | 0.014 | 70.060 | 0.000 | 0.954 1.009 |
yrbrn60 | -0.0078 | 0.000 | -27.847 | 0.000 | -0.008 -0.007 |
eduyrs12 | -0.0375 | 0.001 | -29.599 | 0.000 | -0.040 -0.035 |
hinctnta5 | -0.0226 | 0.002 | -13.297 | 0.000 | -0.026 -0.019 |
tvtot | -0.0163 | 0.002 | -7.254 | 0.000 | -0.021 -0.012 |
rdtot | -0.0149 | 0.002 | -8.839 | 0.000 | -0.018 -0.012 |
nwsptot | -0.0319 | 0.004 | -8.908 | 0.000 | -0.039 -0.025 |
netuse | -0.0757 | 0.002 | -42.036 | 0.000 | -0.079 -0.072 |
Now all variables have small p-values. All parameters have the expected signs:
The strength of the Internet effect is stronger than for other media.
These results are consistent in each cycle of the data, and across a few changes I've made in the cleaning process.
However, these results should be considered preliminary:
Nevertheless, I'll run a breakdown by country.
Here's a function to extract the parameter associated with netuse:
def extract_res(res, var='netuse'):
param = res.params[var]
pvalue = res.pvalues[var]
stars = '**' if pvalue < 0.01 else '*' if pvalue < 0.05 else ''
return res.nobs, param, stars
extract_res(res)
(229307, -0.075724674567100483, '**')
Running a similar model with degree of religiosity.
formula = ('rlgdgr ~ yrbrn60 + eduyrs12 + hinctnta5 +'
'tvtot + rdtot + nwsptot + netuse')
model = smf.ols(formula, data=df)
res = model.fit(disp=False)
res.summary()
Dep. Variable: | rlgdgr | R-squared: | 0.060 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.060 |
Method: | Least Squares | F-statistic: | 2067. |
Date: | Mon, 16 Nov 2015 | Prob (F-statistic): | 0.00 |
Time: | 09:35:28 | Log-Likelihood: | -5.6310e+05 |
No. Observations: | 227366 | AIC: | 1.126e+06 |
Df Residuals: | 227358 | BIC: | 1.126e+06 |
Df Model: | 7 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | 5.6668 | 0.019 | 300.071 | 0.000 | 5.630 5.704 |
yrbrn60 | -0.0180 | 0.000 | -47.352 | 0.000 | -0.019 -0.017 |
eduyrs12 | -0.0688 | 0.002 | -40.159 | 0.000 | -0.072 -0.065 |
hinctnta5 | -0.0266 | 0.002 | -11.397 | 0.000 | -0.031 -0.022 |
tvtot | -0.0801 | 0.003 | -26.334 | 0.000 | -0.086 -0.074 |
rdtot | -0.0179 | 0.002 | -7.791 | 0.000 | -0.022 -0.013 |
nwsptot | -0.0531 | 0.005 | -10.873 | 0.000 | -0.063 -0.044 |
netuse | -0.1020 | 0.002 | -40.942 | 0.000 | -0.107 -0.097 |
Omnibus: | 29438.962 | Durbin-Watson: | 1.629 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 8137.927 |
Skew: | -0.142 | Prob(JB): | 0.00 |
Kurtosis: | 2.118 | Cond. No. | 59.2 |
Group by country:
grouped = df.groupby('cntry')
for name, group in grouped:
print(name, len(group))
AT 6918 BE 8939 BG 6064 CH 9310 CY 3293 CZ 8790 DE 14487 DK 7684 EE 6960 ES 9729 FI 9991 FR 9096 GB 11117 GR 9759 HR 3133 HU 7806 IE 10472 IL 7283 IS 579 IT 1207 LT 1677 LU 3187 LV 1980 NL 9741 NO 8643 PL 8917 PT 10302 RO 2146 RU 7544 SE 9201 SI 7126 SK 6944 TR 4272 UA 7809
Run a sample country
gb = grouped.get_group('DK')
run_model(gb, formula).summary()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-37-4ff722b3b80b> in <module>() 1 gb = grouped.get_group('DK') ----> 2 run_model(gb, formula).summary() <ipython-input-28-b3458574c116> in run_model(df, formula) 1 def run_model(df, formula): ----> 2 model = smf.logit(formula, data=df) 3 results = model.fit(disp=False) 4 return results /home/downey/anaconda/lib/python2.7/site-packages/statsmodels/base/model.pyc in from_formula(cls, formula, data, subset, *args, **kwargs) 148 kwargs.update({'missing_idx': missing_idx, 149 'missing': missing}) --> 150 mod = cls(endog, exog, *args, **kwargs) 151 mod.formula = formula 152 /home/downey/anaconda/lib/python2.7/site-packages/statsmodels/discrete/discrete_model.pyc in __init__(self, endog, exog, **kwargs) 402 if (self.__class__.__name__ != 'MNLogit' and 403 not np.all((self.endog >= 0) & (self.endog <= 1))): --> 404 raise ValueError("endog must be in the unit interval.") 405 406 ValueError: endog must be in the unit interval.
Run all countries
for name, group in grouped:
try:
fill_var(group, var='hinctnta5')
res = run_model(group, formula)
nobs, param, stars = extract_res(res)
arrow = '<--' if stars and param > 0 else ''
print(name, len(group), nobs, '%0.3g'%param, stars, arrow, sep='\t')
except:
print(name, len(group), ' ', 'NA', sep='\t')
In more than half of the countries, the association between Internet use and religious affiliation is statistically significant. In all except two, the association is negative.
In many countries we've lost a substantial number of observations due to missing data. Really need to fill that in!
group = grouped.get_group('FR')
len(group)
for col in group.columns:
print(col, sum(group[col].isnull()))
fill_var(group, 'hinctnta5')
formula
res = run_model(group, formula)
res.summary()