#!/usr/bin/env python # coding: utf-8 # In[2]: import pandas as pd import seaborn as sns import numpy as np import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') import statsmodels.api as sm from statsmodels.formula.api import ols # In[3]: yield_data = pd.read_excel("2021 Yield.xlsx") # In[4]: yield_data.head() # In[5]: ax = sns.boxplot(x = "Treatment", y = "Yield (avg; bu/ac)", data = yield_data) plt.savefig('all_yield_box.png') # Including all of the yield data on the same plot doesn't make sense because some treatments are soybeans and some are corn # In[6]: yield_data.dtypes # In[7]: yield_data['Treatment'] = yield_data['Treatment'].astype('string') yield_data.dtypes # Filtering out the soybean treatment data # In[8]: corn_only = yield_data[yield_data['Treatment'].str.contains('1C|2C|3.1|3.2|4.1|4.2|5C|6C')] corn_only.head() # In[9]: len(corn_only) # Making sure all of the treatments were included. There are 8 treatments with corn growing and each treatment has three reps, so 24 is correct. # In[10]: ax = sns.boxplot(x = "Treatment", y = "Yield (avg; bu/ac)", data = corn_only, hue = 'Short Trt Description', dodge = False) plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) plt.savefig('corn_yield_box.jpg', bbox_inches='tight') # In[11]: soy_only = yield_data[yield_data['Treatment'].str.contains('1S|2S|5S|6S')] soy_only.head() # In[12]: ax = sns.boxplot(x = "Treatment", y = "Yield (avg; bu/ac)", data = soy_only, hue = 'Short Trt Description', dodge = False) plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) plt.savefig('soy_yield_box.jpg', bbox_inches='tight') # In[13]: corn_only = corn_only.rename(columns={"Yield (avg; bu/ac)": "Yield"}) # In[14]: corn_only.to_excel("Corn_Yield_2021.xlsx") # Saving the corn data as its own spreadsheet in case I want to come back to it # ### Bar Plots # In[15]: corn_only.groupby('Treatment')['Yield'].describe()['mean'].plot(kind='bar', title = 'Mean Yield of Corn Treatments') # In[16]: corn_means = pd.DataFrame(corn_only.groupby('Treatment')['Yield'].describe()['mean']) corn_means = corn_means.reset_index() corn_means.head() # In[17]: print(corn_means.columns) # In[18]: ax = sns.barplot(x = 'Treatment', y = 'mean', data = corn_means) ax.set_title('2021 Yield of Corn Treatments') ax.set(xlabel='Treatment', ylabel='Mean Yield (bu/ac)') #for annotating for p in ax.patches: ax.annotate("%.0f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points') plt.savefig('corn_bar.jpg', bbox_inches='tight') # In[19]: corn_block_means = pd.DataFrame(corn_only.groupby('Block')['Yield'].describe()['mean']) corn_block_means = corn_block_means.reset_index() corn_block_means.head() # In[20]: ax = sns.barplot(x = 'Block', y = 'mean', data = corn_block_means) ax.set_title('2021 Corn Yield of Blocks') ax.set(xlabel='Block', ylabel='Mean Yield (bu/ac)') #for annotating for p in ax.patches: ax.annotate("%.1f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points') plt.savefig('corn_block_bar.jpg', bbox_inches='tight') # In[21]: soy_only.head() # In[22]: soy_only = soy_only.rename(columns={"Yield (avg; bu/ac)": "Yield"}) # In[23]: soy_means = pd.DataFrame(soy_only.groupby('Treatment')['Yield'].describe()['mean']) soy_means = soy_means.reset_index() ax = sns.barplot(x = 'Treatment', y = 'mean', data = soy_means) ax.set_title('2021 Yield of Soy Treatments') ax.set(xlabel='Treatment', ylabel='Mean Yield (bu/ac)') #for annotating for p in ax.patches: ax.annotate("%.0f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points') plt.savefig('soy_bar.jpg', bbox_inches='tight') # In[24]: soy_block_means = pd.DataFrame(soy_only.groupby('Block')['Yield'].describe()['mean']) soy_block_means = soy_block_means.reset_index() ax = sns.barplot(x = 'Block', y = 'mean', data = soy_block_means) ax.set_title('2021 Soy Yield of Blocks') ax.set(xlabel='Block', ylabel='Mean Yield (bu/ac)') #for annotating for p in ax.patches: ax.annotate("%.1f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points') plt.savefig('soy_block_bar.jpg', bbox_inches='tight') # ## ANOVA # In[25]: corn_only = corn_only.rename(columns={"Simp. Treatment": "simple_treatment"}) corn_only.head() # In[26]: corn_only # In[27]: corn_only_stats = pd.DataFrame(corn_only.groupby('Treatment')['Yield'].describe()) corn_only_stats # In[28]: plt.hist(x='Yield', bins = 'auto', data = corn_only) plt.title("Corn Yields 2021") plt.xlabel('Yield (bu/ac)') plt.ylabel('Frequency') plt.savefig('Corn Histogram.jpg', bbox_inches='tight') # This is not looking very normally distributed. # In[42]: sns.kdeplot(data=corn_only, x="Yield") plt.title('Corn Yield Kernel Density Plot') plt.savefig('corn_yield_kde.jpg', bbox_inches='tight') # In[29]: import scipy.stats as stats stats.probplot(corn_only['Yield'], dist="norm", plot=plt) plt.show() plt.savefig('corn_yield_probplot', bbox_inches='tight') # In[30]: import scipy.stats as stats unique_treatments = corn_only['Treatment'].unique() for Treatment in unique_treatments: stats.probplot(corn_only[corn_only['Treatment'] == Treatment]['Yield'], dist="norm", plot=plt) plt.title("Probability Plot - " + Treatment) plt.show() # ### Bartlett’s Test of Homogeneity of Variances # Following this tutorial: https://www.marsja.se/levenes-bartletts-test-of-equality-homogeneity-of-variance-in-python/ # Null Hypothesis: the variances are equal across all samples/groups # Alternative Hypothesis: the variances are not equal across all samples/groups # In[31]: from scipy.stats import bartlett # subsetting the data: trt1 = corn_only.query('Treatment == "1C"')['Yield'] trt2 = corn_only.query('Treatment == "2C"')['Yield'] trt31 = corn_only.query('Treatment == "3.1"')['Yield'] trt32 = corn_only.query('Treatment == "3.2"')['Yield'] trt41 = corn_only.query('Treatment == "4.1"')['Yield'] trt42 = corn_only.query('Treatment == "4.2"')['Yield'] trt5 = corn_only.query('Treatment == "5C"')['Yield'] trt6 = corn_only.query('Treatment == "6C"')['Yield'] # Bartlett's test in Python with SciPy: stat, p = bartlett(trt1, trt2, trt31, trt32, trt41, trt42, trt5, trt6) # Get the results: print(stat, p) # p-value is high, so assume variances are the same. # ### Levene’s Test of Equality of Variances # In[32]: from scipy.stats import levene # Create three arrays for each sample: trt1 = corn_only.query('Treatment == "1C"')['Yield'] trt2 = corn_only.query('Treatment == "2C"')['Yield'] trt31 = corn_only.query('Treatment == "3.1"')['Yield'] trt32 = corn_only.query('Treatment == "3.2"')['Yield'] trt41 = corn_only.query('Treatment == "4.1"')['Yield'] trt42 = corn_only.query('Treatment == "4.2"')['Yield'] trt5 = corn_only.query('Treatment == "5C"')['Yield'] trt6 = corn_only.query('Treatment == "6C"')['Yield'] # Levene's Test in Python with Scipy: stat, p = levene(trt1, trt2, trt31, trt32, trt41, trt42, trt5, trt6) print(stat, p) # Again, p-value is high, so assume variances are the same. # In[33]: unique_treatments = corn_only['Treatment'].unique() for Treatment in unique_treatments: plt.hist(corn_only[corn_only['Treatment'] == Treatment]['Yield'], bins = 'auto') plt.title("Histogram - " + Treatment) plt.show() # ### Corn Analysis of Variance # In[34]: #corn_formula = 'Yield ~C(Block)+C(Treatment)+C(Block):C(Treatment)' model = ols('Yield ~ C(Block) + C(simple_treatment)', data = corn_only).fit() aov_table = sm.stats.anova_lm(model) print(aov_table) # It looks like block does not have a significant effect (p = 0.164), but treatment does (p < 0.05). # In[35]: fig = plt.figure(figsize = (16, 9)) ax = sns.distplot(model.resid, hist = False, kde_kws = {"shade" : True, "lw": 1}, fit = stats.norm) ax.set_title("KDE Plot of Model Residuals (Blue) and Normal Distribution (Black)") ax.set_xlabel("Residuals") # In[35]: print(model.summary()) # In[36]: model = ols('Yield ~ C(Block) + C(simple_treatment) + C(Block):C(simple_treatment)', data = corn_only).fit() aov_table = sm.stats.anova_lm(model) print(aov_table) # ## Mean Comparisons Attempt # In[38]: import statsmodels.api as sa import statsmodels.formula.api as sfa import scikit_posthocs as sp sp.posthoc_ttest(corn_only, val_col='Yield', group_col='Treatment', p_adjust='holm') # In[39]: sp.posthoc_conover(corn_only, val_col='Yield', group_col='Treatment', p_adjust='holm') # In[43]: corn_only_no60 = corn_only[corn_only['Treatment'].str.contains('1C|2C|3.1|3.2|4.1|5C|6C')] model = ols('Yield ~ C(Block) + C(simple_treatment)', data = corn_only_no60).fit() aov_table = sm.stats.anova_lm(model) print(aov_table) # In[44]: sp.posthoc_ttest(corn_only_no60, val_col='Yield', group_col='Treatment', p_adjust='holm') # ### Soy ANOVA # In[43]: soy_only = soy_only.rename(columns={"Simp. Treatment": "simple_treatment"}) soy_only.head() # In[48]: soy_only.to_excel("Soy_Yield_2021.xlsx") # In[44]: soy_only_stats = pd.DataFrame(soy_only.groupby('Treatment')['Yield'].describe()) soy_only_stats # In[45]: plt.hist(x='Yield', bins = 'auto', data = soy_only) plt.title("Soy Yields 2021") plt.xlabel('Yield (bu/ac)') plt.ylabel('Frequency') plt.savefig('Soy Histogram.jpg', bbox_inches='tight') # In[46]: sns.kdeplot(data=soy_only, x="Yield") plt.title('Soy Yield KDE Plot') plt.savefig('soy_yield_kde.jpg', bbox_inches='tight') # In[47]: import scipy.stats as stats stats.probplot(soy_only['Yield'], dist="norm", plot=plt) plt.show() plt.savefig('soy_yield_probplot', bbox_inches='tight') # In[42]: import scipy.stats as stats unique_treatments = soy_only['Treatment'].unique() for Treatment in unique_treatments: stats.probplot(soy_only[soy_only['Treatment'] == Treatment]['Yield'], dist="norm", plot=plt) plt.title("Probability Plot - " + Treatment) plt.show() # In[43]: from scipy.stats import bartlett # subsetting the data: trt2 = soy_only.query('Treatment == "2S"')['Yield'] trt1 = soy_only.query('Treatment == "1S"')['Yield'] trt5 = soy_only.query('Treatment == "5S"')['Yield'] trt6 = soy_only.query('Treatment == "6S"')['Yield'] # Bartlett's test in Python with SciPy: stat, p = bartlett(trt1, trt2, trt5, trt6) # Get the results: print(stat, p) # In[44]: model = ols('Yield ~ C(Block) + C(simple_treatment)', data = soy_only).fit() aov_table = sm.stats.anova_lm(model) print(aov_table) # Again, no effect from block, but treatments do have an effect. # https://medium.com/budding-data-scientist/data-analytics-using-python-part-6-9e740a1dd681 # In[ ]: from patsy.contrasts import Treatment levels = [1, 2, 3, 4] contrast = Treatment(reference=0).code_without_intercept(levels) print(contrast.matrix) # https://www.pythonfordatascience.org/mixed-effects-regression-python/ # https://pbgworks.org/sites/pbgworks.org/files/RandomizedCompleteBlockDesignTutorial.pdf # In[ ]: