#!/usr/bin/env python
# coding: utf-8

# In[2]:


import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import statsmodels.api as sm
from statsmodels.formula.api import ols


# In[3]:


yield_data = pd.read_excel("2021 Yield.xlsx")


# In[4]:


yield_data.head()


# In[5]:


ax = sns.boxplot(x = "Treatment", y = "Yield (avg; bu/ac)", data = yield_data)
plt.savefig('all_yield_box.png')


# Including all of the yield data on the same plot doesn't make sense because some treatments are soybeans and some are corn

# In[6]:


yield_data.dtypes


# In[7]:


yield_data['Treatment'] = yield_data['Treatment'].astype('string')
yield_data.dtypes


# Filtering out the soybean treatment data

# In[8]:


corn_only = yield_data[yield_data['Treatment'].str.contains('1C|2C|3.1|3.2|4.1|4.2|5C|6C')]
corn_only.head()


# In[9]:


len(corn_only)


# Making sure all of the treatments were included. There are 8 treatments with corn growing and each treatment has three reps, so 24 is correct. 

# In[10]:


ax = sns.boxplot(x = "Treatment", y = "Yield (avg; bu/ac)", data = corn_only, hue = 'Short Trt Description', dodge = False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.savefig('corn_yield_box.jpg', bbox_inches='tight')


# In[11]:


soy_only = yield_data[yield_data['Treatment'].str.contains('1S|2S|5S|6S')]
soy_only.head()


# In[12]:


ax = sns.boxplot(x = "Treatment", y = "Yield (avg; bu/ac)", data = soy_only, hue = 'Short Trt Description', dodge = False)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.savefig('soy_yield_box.jpg', bbox_inches='tight')


# In[13]:


corn_only = corn_only.rename(columns={"Yield (avg; bu/ac)": "Yield"})


# In[14]:


corn_only.to_excel("Corn_Yield_2021.xlsx")


# Saving the corn data as its own spreadsheet in case I want to come back to it 

# ### Bar Plots

# In[15]:


corn_only.groupby('Treatment')['Yield'].describe()['mean'].plot(kind='bar', title = 'Mean Yield of Corn Treatments')


# In[16]:


corn_means = pd.DataFrame(corn_only.groupby('Treatment')['Yield'].describe()['mean'])
corn_means = corn_means.reset_index()
corn_means.head()


# In[17]:


print(corn_means.columns)


# In[18]:


ax = sns.barplot(x = 'Treatment', y = 'mean', data = corn_means)
ax.set_title('2021 Yield of Corn Treatments')
ax.set(xlabel='Treatment', ylabel='Mean Yield (bu/ac)')

#for annotating 
for p in ax.patches:
             ax.annotate("%.0f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                 textcoords='offset points')
        
plt.savefig('corn_bar.jpg', bbox_inches='tight')


# In[19]:


corn_block_means = pd.DataFrame(corn_only.groupby('Block')['Yield'].describe()['mean'])
corn_block_means = corn_block_means.reset_index()
corn_block_means.head()


# In[20]:


ax = sns.barplot(x = 'Block', y = 'mean', data = corn_block_means)
ax.set_title('2021 Corn Yield of Blocks')
ax.set(xlabel='Block', ylabel='Mean Yield (bu/ac)')

#for annotating 
for p in ax.patches:
             ax.annotate("%.1f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                 textcoords='offset points')
        
plt.savefig('corn_block_bar.jpg', bbox_inches='tight')


# In[21]:


soy_only.head()


# In[22]:


soy_only = soy_only.rename(columns={"Yield (avg; bu/ac)": "Yield"})


# In[23]:


soy_means = pd.DataFrame(soy_only.groupby('Treatment')['Yield'].describe()['mean'])
soy_means = soy_means.reset_index()

ax = sns.barplot(x = 'Treatment', y = 'mean', data = soy_means)
ax.set_title('2021 Yield of Soy Treatments')
ax.set(xlabel='Treatment', ylabel='Mean Yield (bu/ac)')

#for annotating 
for p in ax.patches:
             ax.annotate("%.0f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                 textcoords='offset points')
        
plt.savefig('soy_bar.jpg', bbox_inches='tight')


# In[24]:


soy_block_means = pd.DataFrame(soy_only.groupby('Block')['Yield'].describe()['mean'])
soy_block_means = soy_block_means.reset_index()

ax = sns.barplot(x = 'Block', y = 'mean', data = soy_block_means)
ax.set_title('2021 Soy Yield of Blocks')
ax.set(xlabel='Block', ylabel='Mean Yield (bu/ac)')

#for annotating 
for p in ax.patches:
             ax.annotate("%.1f" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),
                 textcoords='offset points')
        
plt.savefig('soy_block_bar.jpg', bbox_inches='tight')


# ## ANOVA

# In[25]:


corn_only = corn_only.rename(columns={"Simp. Treatment": "simple_treatment"})
corn_only.head()


# In[26]:


corn_only


# In[27]:


corn_only_stats = pd.DataFrame(corn_only.groupby('Treatment')['Yield'].describe())
corn_only_stats


# In[28]:


plt.hist(x='Yield', bins = 'auto', data = corn_only)
plt.title("Corn Yields 2021")
plt.xlabel('Yield (bu/ac)')
plt.ylabel('Frequency')
plt.savefig('Corn Histogram.jpg', bbox_inches='tight')


# This is not looking very normally distributed. 

# In[42]:


sns.kdeplot(data=corn_only, x="Yield")
plt.title('Corn Yield Kernel Density Plot')
plt.savefig('corn_yield_kde.jpg', bbox_inches='tight')


# In[29]:


import scipy.stats as stats
stats.probplot(corn_only['Yield'], dist="norm", plot=plt)
plt.show()
plt.savefig('corn_yield_probplot', bbox_inches='tight')


# In[30]:


import scipy.stats as stats
unique_treatments = corn_only['Treatment'].unique()
for Treatment in unique_treatments:
    stats.probplot(corn_only[corn_only['Treatment'] == Treatment]['Yield'], dist="norm", plot=plt)
    plt.title("Probability Plot - " +  Treatment)
    plt.show()


# ### Bartlett’s Test of Homogeneity of Variances

# Following this tutorial: https://www.marsja.se/levenes-bartletts-test-of-equality-homogeneity-of-variance-in-python/

# Null Hypothesis: the variances are equal across all samples/groups     
# Alternative Hypothesis:  the variances are not equal across all samples/groups    

# In[31]:


from scipy.stats import bartlett

# subsetting the data:
trt1 = corn_only.query('Treatment == "1C"')['Yield']
trt2 = corn_only.query('Treatment == "2C"')['Yield']
trt31 = corn_only.query('Treatment == "3.1"')['Yield']
trt32 = corn_only.query('Treatment == "3.2"')['Yield']
trt41 = corn_only.query('Treatment == "4.1"')['Yield']
trt42 = corn_only.query('Treatment == "4.2"')['Yield']
trt5 = corn_only.query('Treatment == "5C"')['Yield']
trt6 = corn_only.query('Treatment == "6C"')['Yield']

# Bartlett's test in Python with SciPy:
stat, p = bartlett(trt1, trt2, trt31, trt32, trt41, trt42, trt5, trt6)

# Get the results:
print(stat, p)


# p-value is high, so assume variances are the same. 

# ### Levene’s Test of Equality of Variances

# In[32]:


from scipy.stats import levene

# Create three arrays for each sample:
trt1 = corn_only.query('Treatment == "1C"')['Yield']
trt2 = corn_only.query('Treatment == "2C"')['Yield']
trt31 = corn_only.query('Treatment == "3.1"')['Yield']
trt32 = corn_only.query('Treatment == "3.2"')['Yield']
trt41 = corn_only.query('Treatment == "4.1"')['Yield']
trt42 = corn_only.query('Treatment == "4.2"')['Yield']
trt5 = corn_only.query('Treatment == "5C"')['Yield']
trt6 = corn_only.query('Treatment == "6C"')['Yield']

# Levene's Test in Python with Scipy:
stat, p = levene(trt1, trt2, trt31, trt32, trt41, trt42, trt5, trt6)

print(stat, p)


# Again, p-value is high, so assume variances are the same. 

# In[33]:


unique_treatments = corn_only['Treatment'].unique()
for Treatment in unique_treatments:
    plt.hist(corn_only[corn_only['Treatment'] == Treatment]['Yield'], bins = 'auto')
    plt.title("Histogram - " +  Treatment)
    plt.show()


# ### Corn Analysis of Variance

# In[34]:


#corn_formula = 'Yield ~C(Block)+C(Treatment)+C(Block):C(Treatment)'

model = ols('Yield ~ C(Block) + C(simple_treatment)', data = corn_only).fit()

aov_table = sm.stats.anova_lm(model)

print(aov_table)


# It looks like block does not have a significant effect (p = 0.164), but treatment does (p < 0.05).

# In[35]:


fig = plt.figure(figsize = (16, 9))

ax = sns.distplot(model.resid, hist = False, kde_kws = {"shade" : True, "lw": 1}, fit = stats.norm)

ax.set_title("KDE Plot of Model Residuals (Blue) and Normal Distribution (Black)")
ax.set_xlabel("Residuals")


# In[35]:


print(model.summary())


# In[36]:


model = ols('Yield ~ C(Block) + C(simple_treatment) + C(Block):C(simple_treatment)', data = corn_only).fit()

aov_table = sm.stats.anova_lm(model)

print(aov_table)


# ## Mean Comparisons Attempt

# In[38]:


import statsmodels.api as sa
import statsmodels.formula.api as sfa
import scikit_posthocs as sp

sp.posthoc_ttest(corn_only, val_col='Yield', group_col='Treatment', p_adjust='holm')


# In[39]:


sp.posthoc_conover(corn_only, val_col='Yield', group_col='Treatment', p_adjust='holm')


# In[43]:


corn_only_no60 = corn_only[corn_only['Treatment'].str.contains('1C|2C|3.1|3.2|4.1|5C|6C')]

model = ols('Yield ~ C(Block) + C(simple_treatment)', data = corn_only_no60).fit()

aov_table = sm.stats.anova_lm(model)

print(aov_table)


# In[44]:


sp.posthoc_ttest(corn_only_no60, val_col='Yield', group_col='Treatment', p_adjust='holm')


# ### Soy ANOVA

# In[43]:


soy_only = soy_only.rename(columns={"Simp. Treatment": "simple_treatment"})
soy_only.head()


# In[48]:


soy_only.to_excel("Soy_Yield_2021.xlsx")


# In[44]:


soy_only_stats = pd.DataFrame(soy_only.groupby('Treatment')['Yield'].describe())
soy_only_stats


# In[45]:


plt.hist(x='Yield', bins = 'auto', data = soy_only)
plt.title("Soy Yields 2021")
plt.xlabel('Yield (bu/ac)')
plt.ylabel('Frequency')
plt.savefig('Soy Histogram.jpg', bbox_inches='tight')


# In[46]:


sns.kdeplot(data=soy_only, x="Yield")
plt.title('Soy Yield KDE Plot')
plt.savefig('soy_yield_kde.jpg', bbox_inches='tight')


# In[47]:


import scipy.stats as stats
stats.probplot(soy_only['Yield'], dist="norm", plot=plt)
plt.show()

plt.savefig('soy_yield_probplot', bbox_inches='tight')


# In[42]:


import scipy.stats as stats
unique_treatments = soy_only['Treatment'].unique()
for Treatment in unique_treatments:
    stats.probplot(soy_only[soy_only['Treatment'] == Treatment]['Yield'], dist="norm", plot=plt)
    plt.title("Probability Plot - " +  Treatment)
    plt.show()


# In[43]:


from scipy.stats import bartlett

# subsetting the data:
trt2 = soy_only.query('Treatment == "2S"')['Yield']
trt1 = soy_only.query('Treatment == "1S"')['Yield']
trt5 = soy_only.query('Treatment == "5S"')['Yield']
trt6 = soy_only.query('Treatment == "6S"')['Yield']


# Bartlett's test in Python with SciPy:
stat, p = bartlett(trt1, trt2, trt5, trt6)

# Get the results:
print(stat, p)


# In[44]:


model = ols('Yield ~ C(Block) + C(simple_treatment)', data = soy_only).fit()

aov_table = sm.stats.anova_lm(model)

print(aov_table)


# Again, no effect from block, but treatments do have an effect. 

# https://medium.com/budding-data-scientist/data-analytics-using-python-part-6-9e740a1dd681

# In[ ]:


from patsy.contrasts import Treatment

levels = [1, 2, 3, 4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print(contrast.matrix)


# https://www.pythonfordatascience.org/mixed-effects-regression-python/

# https://pbgworks.org/sites/pbgworks.org/files/RandomizedCompleteBlockDesignTutorial.pdf

# In[ ]: