#!/usr/bin/env python
# coding: utf-8

# Notebook export: a NumPy walkthrough built around the white-wine quality CSV.
# The file is loaded twice (once with the csv module, once with
# numpy.genfromtxt) and then used to demonstrate indexing, slicing, in-place
# assignment, dtype conversion and simple per-column statistics.
#
# NOTE: every print below uses the single-argument call form `print(x)`.
# Under Python 2 that parses as a print statement with a parenthesised
# expression, under Python 3 as a function call; the output is the same.
# Keep prints to exactly one argument unless
# `from __future__ import print_function` is added at the top of the file.
#
# Bare expressions (e.g. `wines[0]`) are notebook cell outputs; they are kept
# for fidelity with the notebook and display nothing when run as a script.

# In[1]:

import csv
import os

import numpy as np

print(csv.__version__)
print(np.__version__)


# In[2]:

# Open the file, parse it with ';' as the delimiter, and materialise the
# reader into a list stored in the variable "wines" (header row included).
DATA_DIR = '../data'
# Absolute path of the dataset; shared by the csv and genfromtxt loaders.
WINES_CSV = os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv'))

with open(WINES_CSV, 'r') as datafile:
    reader = csv.reader(datafile, delimiter=";")
    wines = list(reader)


# In[3]:

# First row of the file (the header).
wines[0]


# In[4]:

# -1 so the header row is not counted as a record.
print('Total records: {}'.format(len(wines) - 1))


# In[5]:

# Average of the quality variable (last field of each row), header skipped.
qualities = [float(item[-1]) for item in wines[1:]]
print(sum(qualities) / len(qualities))


# In[6]:

# Drop the header, then convert the list of string rows to a NumPy array,
# casting every cell to float.
wines = wines[1:]
wines_np = np.array(wines, dtype='float')
wines_np.shape


# In[7]:

# The resulting 2-D array.
wines_np


# In[8]:

# A NumPy array filled with zeros.
print(np.zeros((2, 2), dtype='float'))


# In[9]:

# A NumPy array filled with random numbers.
print(np.random.rand(2, 2))


# In[10]:

# Read the same dataset directly with NumPy; skip_header=1 drops the header.
# This rebinds `wines` from the list of rows to a NumPy array.
wines = np.genfromtxt(
    WINES_CSV,
    delimiter=";",
    skip_header=1
)


# In[11]:

# Element-wise comparison of the two loading approaches.
wines == wines_np


# In[12]:

# Value at the 3rd row, 2nd column (indices are zero-based).
print(wines[2, 1])
print(wines[2, 1] == wines_np[2, 1])


# In[13]:

# First 4 rows of the 3rd column: wines[start:end, column_index].
# Indexing starts at zero and the slice end is exclusive, so rows 0, 1, 2, 3
# are returned.
wines[:4, 2]


# In[14]:

# Overwrite the 3rd column (index 2) with 10.0 for every row.
# This mutates `wines` in place; `wines_np` is a separate array and is not
# affected.
wines[:, 2] = 10.0
wines[:4, 2]


# In[15]:

# A 1-D NumPy array.
random_1d = np.random.rand(5)
random_1d


# In[16]:

# A 3-D NumPy array.
random_3d = np.random.rand(2, 4, 3)


# In[17]:

# One way to read this shape: 2 years x 4 quarters x 3 months per quarter,
# i.e. 2 x 4 x 3 = 24 months.
random_3d.shape


# In[18]:

# Data types in NumPy:
# astype returns a converted copy (here to int); `wines` itself is unchanged.
wines.astype('int')


# In[19]:

# Add a scalar to a column across all rows; the other arithmetic operators
# work the same way.
print(wines[:, 11])
print(wines[:, 11] + 1)


# In[20]:

# Multiply two columns element-wise; here the 12th column by itself,
# i.e. its square.
wines[:, 11] * wines[:, 11]


# In[21]:

# Sum of a column across all rows.
wines[:, 11].sum(axis=0)


# In[22]:

# Mean of a column; std, min and max are among the other methods available
# for fast statistics.
wines[:, 11].mean()