#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
import numpy as np
import csv

# show which csv / numpy versions are in use
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print(csv.__version__)
print(np.__version__)


# In[2]:


# Load the wine-quality CSV with the csv module.
# Fields are separated by ';'. Every row of the file (header included)
# ends up in "wines" as a list of strings.

DATA_DIR = '../data'

csv_path = os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv'))
with open(csv_path, 'r') as datafile:
    reader = csv.reader(datafile, delimiter=";")
    wines = []
    wines.extend(reader)


# In[3]:


# first row of the file (the header row); printed explicitly because a bare
# expression shows nothing when this file runs as a script
print(wines[0])


# In[4]:


# the first row is the header, so subtract 1 to count data records only
print('Total records: {}'.format(len(wines) - 1))


# In[5]:


# average of the quality variable: take the last field of every data row
# (header row skipped) and convert it from string to float
qualities = [float(item[-1]) for item in wines[1:]]
print(sum(qualities) / len(qualities))


# In[6]:


# converting the python list of rows to a 2-D numpy array and typecasting
# the cell values to float
# NOTE: "wines" is rebound without its header row here, so re-running this
# cell on its own drops one more row each time
wines = wines[1:]
wines_np = np.array(wines, dtype='float')
# printed explicitly: a bare expression shows nothing outside a notebook
print(wines_np.shape)


# In[7]:


# printing out the wines 2-D array (explicit print so it also shows up when
# the file runs as a script)
print(wines_np)


# In[8]:


# creating a 2x2 numpy array of floats with all zero elements
print(np.zeros((2, 2), dtype='float'))


# In[9]:


# creating a 2x2 numpy array filled with random numbers
print(np.random.rand(2, 2))


# In[10]:


# read the same dataset directly with numpy;
# skip_header=1 leaves the first (header) line out of the resulting array
wines = np.genfromtxt(
    os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv')),
    skip_header=1,
    delimiter=";",
)


# In[11]:


# element-wise comparison of the genfromtxt array with the csv-module array;
# the result is a boolean array, not a single True/False
print(wines == wines_np)


# In[12]:


# accessing the value at the 3rd row, 2nd column of wines (indices 2 and 1),
# and checking it against the same cell of wines_np
print(wines[2, 1])
print(wines[2, 1] == wines_np[2, 1])


# In[13]:


# access the first 4 rows of the 3rd column
# wines[start:end, column_index]
# indexing starts from zero and the slice end is exclusive, so :4 selects
# rows 0, 1, 2, 3
print(wines[:4, 2])


# In[14]:


# overwrite the 3rd column (index 2) with 10.0 for all the rows,
# then show the first 4 rows of that column to confirm the change
wines[:, 2] = 10.0
print(wines[:4, 2])


# In[15]:


# creating a 1-D numpy array of 5 random numbers
random_1d = np.random.rand(5)
# printed explicitly: a bare expression shows nothing outside a notebook
print(random_1d)


# In[16]:


# creating a 3-D numpy array of random numbers
random_3d = np.random.rand(2, 4, 3)


# In[17]:


# one way to read this shape: 2 years x 4 quarters x 3 months per quarter
# 2 x 4 x 3 = 24 months
print(random_3d.shape)


# In[18]:


# Data types in numpy
# astype returns a converted copy of wines as int; the result is only
# printed here, it is not stored
print(wines.astype('int'))


# In[19]:


# adding a scalar to one column across all rows (12th column, index 11);
# the other arithmetic operations can be applied in the same way
print(wines[:, 11])
print(wines[:, 11] + 1)


# In[20]:


# multiplying 2 columns element-wise
# this example multiplies the 12th column (index 11) by itself, i.e. squares it
print(wines[:, 11] * wines[:, 11])


# In[21]:


# sum of one column (index 11) across all rows
print(wines[:, 11].sum(axis=0))


# In[22]:


# mean of one column (index 11); std, min, max are other methods for fast
# stats computation
print(wines[:, 11].mean())