3 * 4
12
x = [1, 2, 3]
print(x)
[1, 2, 3]
x.append(4)
print(x)
[1, 2, 3, 4]
measurements = {'height': [1.70, 1.80, 1.50], 'weight': [60, 120, 50]}
measurements
{'height': [1.7, 1.8, 1.5], 'weight': [60, 120, 50]}
measurements['height']
[1.7, 1.8, 1.5]
x = [1, 2, 3, 4]
[i**2 for i in x]
[1, 4, 9, 16]
def calc_bmi(weight, height):
return weight / height**2
[calc_bmi(w, h) for w, h in zip(measurements['weight'], measurements['height'])]
[20.761245674740486, 37.03703703703704, 22.22222222222222]
import pandas as pd
import numpy as np
s = pd.Series([1,3,5,np.nan,6,8])
s
0 1 1 3 2 5 3 NaN 4 6 5 8 dtype: float64
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | -1.825304 | 0.016285 | -0.306601 | 0.004272 |
2013-01-02 | 0.192045 | -0.291267 | 1.951792 | 1.708918 |
2013-01-03 | -0.672820 | -2.051800 | 0.113415 | -0.594967 |
2013-01-04 | -1.453007 | 1.282767 | -0.113163 | -1.824503 |
2013-01-05 | 0.824425 | 2.156586 | -0.047000 | -1.042518 |
2013-01-06 | 0.359821 | -0.896814 | 0.617897 | -0.723848 |
df[df.A > 0]
A | B | C | D | |
---|---|---|---|---|
2013-01-02 | 0.192045 | -0.291267 | 1.951792 | 1.708918 |
2013-01-05 | 0.824425 | 2.156586 | -0.047000 | -1.042518 |
2013-01-06 | 0.359821 | -0.896814 | 0.617897 | -0.723848 |
df.mean()
A -0.429140 B 0.035960 C 0.369390 D -0.412108 dtype: float64
df.mean(axis='columns')
2013-01-01 -0.527837 2013-01-02 0.890372 2013-01-03 -0.801543 2013-01-04 -0.526977 2013-01-05 0.472873 2013-01-06 -0.160736 Freq: D, dtype: float64
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
df2
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1 | 2013-01-02 | 1 | 3 | test | foo |
1 | 1 | 2013-01-02 | 1 | 3 | train | foo |
2 | 1 | 2013-01-02 | 1 | 3 | test | foo |
3 | 1 | 2013-01-02 | 1 | 3 | train | foo |
df2.dtypes
A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | 0.343733 | -0.387430 |
1 | bar | one | 2.061690 | 2.454234 |
2 | foo | two | 0.626621 | -1.106039 |
3 | bar | three | 1.511600 | 0.963582 |
4 | foo | two | -1.187094 | -1.310974 |
5 | bar | two | -1.065222 | -0.983320 |
6 | foo | one | -0.924636 | 0.349164 |
7 | foo | three | 2.093810 | -0.913885 |
df.groupby('A').sum()
C | D | |
---|---|---|
A | ||
bar | 2.508068 | 2.434496 |
foo | 0.952434 | -3.369164 |
df.groupby(['A', 'B']).sum()
C | D | ||
---|---|---|---|
A | B | ||
bar | one | 2.061690 | 2.454234 |
three | 1.511600 | 0.963582 | |
two | -1.065222 | -0.983320 | |
foo | one | -0.580903 | -0.038266 |
three | 2.093810 | -0.913885 | |
two | -0.560473 | -2.417013 |
%matplotlib inline
import seaborn as sns
x = np.random.normal(size=100)
sns.distplot(x);
mean, cov = [0, 1], [(1, .5), (.5, 1)]
data = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data, columns=["x", "y"])
df
x | y | |
---|---|---|
0 | 1.086599 | 0.232294 |
1 | -0.443143 | -0.354076 |
2 | 0.652826 | 1.165967 |
3 | -0.903281 | 0.170521 |
4 | -1.148413 | 2.050707 |
5 | -0.176674 | 2.375160 |
6 | 0.845552 | 1.879006 |
7 | -0.962104 | 0.340105 |
8 | 0.038327 | 0.011257 |
9 | -0.704808 | -0.132214 |
10 | -0.265136 | 0.426827 |
11 | 1.327651 | 3.319972 |
12 | -0.044100 | 0.249657 |
13 | 0.451112 | 0.740770 |
14 | 0.596272 | 1.360725 |
15 | -0.137805 | 1.041698 |
16 | 1.168201 | 2.713771 |
17 | -1.319453 | -0.503621 |
18 | 0.461970 | 2.389124 |
19 | -0.334954 | 1.328563 |
20 | -0.649478 | 1.537904 |
21 | 0.310419 | 1.523426 |
22 | 0.710832 | 0.943226 |
23 | -0.823799 | 1.775166 |
24 | 0.992667 | 1.022176 |
25 | 0.568753 | 0.557493 |
26 | -0.301274 | 1.079018 |
27 | -1.733951 | 0.567187 |
28 | 0.952436 | 1.852690 |
29 | -1.174882 | 0.873883 |
... | ... | ... |
170 | 0.563523 | 2.138955 |
171 | 1.033686 | 4.141070 |
172 | 0.292181 | 1.408222 |
173 | -2.301801 | -0.042128 |
174 | 0.104059 | 0.542621 |
175 | 0.588103 | 0.567352 |
176 | -1.073328 | -0.120820 |
177 | 1.853767 | 0.099758 |
178 | 1.075379 | 2.611514 |
179 | 0.559673 | 3.366675 |
180 | 0.747945 | 2.053162 |
181 | -2.927040 | -0.621391 |
182 | 0.134816 | 0.727113 |
183 | 0.015388 | 2.829295 |
184 | 0.742623 | 2.058462 |
185 | -0.229863 | 1.139025 |
186 | -1.033777 | -0.642099 |
187 | 0.428791 | 0.762164 |
188 | -0.596019 | 2.079460 |
189 | 1.819033 | 1.434940 |
190 | 0.254380 | 1.494345 |
191 | 2.276169 | 0.408099 |
192 | -0.076959 | -1.321194 |
193 | 0.496929 | -0.174779 |
194 | 0.723700 | 1.561495 |
195 | 1.097657 | 2.438591 |
196 | 0.460952 | 2.506192 |
197 | 0.458422 | 1.522599 |
198 | 1.411305 | 1.632619 |
199 | -1.533196 | 2.016695 |
200 rows × 2 columns
sns.jointplot(x="x", y="y", data=df, kind="kde");
iris = sns.load_dataset("iris")
sns.pairplot(iris);
tips = sns.load_dataset("tips")
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips);
sns.lmplot(x="total_bill", y="tip", col="day", data=tips,
col_wrap=2, size=3);
sns.factorplot(x="time", y="total_bill", hue="smoker",
col="day", data=tips, kind="box", size=4, aspect=.5);
Taken from http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn import svm
X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC()
clf.fit(X, y)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
clf.predict([[0, .5]])
array([0])
Taken from http://scikit-learn.org/stable/auto_examples/grid_search_digits.html and http://scikit-learn.org/stable/auto_examples/datasets/plot_digits_last_image.html
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
digits = datasets.load_digits()
import matplotlib.pyplot as plt
#Display the first digit
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation='nearest')
plt.grid('off')
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.5, random_state=0)
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit(X_train, y_train)
GridSearchCV(cv=5, error_score='raise', estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), fit_params={}, iid=True, loss_func=None, n_jobs=1, param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]}, {'C': [1, 10, 100, 1000], 'kernel': ['linear']}], pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None, verbose=0)
print(clf.best_params_)
{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
y_true, y_pred = y_test, clf.predict(X_test)
ax = sns.heatmap(confusion_matrix(y_true, y_pred))
ax.set(xlabel='true label', ylabel='predicted label');
Taken from https://jakevdp.github.io/blog/2013/06/15/numba-vs-cython-take-2/
import numpy as np
X = np.random.random((1000, 3))
def pairwise_python(X):
M = X.shape[0]
N = X.shape[1]
D = np.empty((M, M), dtype=np.float)
for i in range(M):
for j in range(M):
d = 0.0
for k in range(N):
tmp = X[i, k] - X[j, k]
d += tmp * tmp
D[i, j] = np.sqrt(d)
return D
%timeit pairwise_python(X)
1 loops, best of 3: 6.34 s per loop
%load_ext cython
%%cython
import numpy as np
cimport cython
from libc.math cimport sqrt
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_cython(double[:, ::1] X):
cdef int M = X.shape[0]
cdef int N = X.shape[1]
cdef double tmp, d
cdef double[:, ::1] D = np.empty((M, M), dtype=np.float64)
for i in range(M):
for j in range(M):
d = 0.0
for k in range(N):
tmp = X[i, k] - X[j, k]
d += tmp * tmp
D[i, j] = sqrt(d)
return np.asarray(D)
%timeit pairwise_cython(X)
100 loops, best of 3: 7.11 ms per loop
from numba.decorators import autojit
pairwise_numba = autojit(pairwise_python)
# Run once to compile before timing
pairwise_numba(X)
%timeit pairwise_numba(X)
100 loops, best of 3: 7.31 ms per loop
Taken from https://jakevdp.github.io/blog/2015/08/14/out-of-core-dataframes-in-python/
!ls -lahL POIWorld.csv
-rw-rw-r-- 1 wiecki wiecki 774M Sep 8 14:43 POIWorld.csv
from dask import dataframe as dd
columns = ["name", "amenity", "Longitude", "Latitude"]
data = dd.read_csv('POIWorld.csv', usecols=columns)
data
dd.DataFrame<read-csv-POIWorld.csv-74f130b10caf7d8ca68886970e302ba2, divisions=(None, None, None, ..., None, None)>
with_name = data[data.name.notnull()]
is_starbucks = with_name.name.str.contains('[Ss]tarbucks')
is_dunkin = with_name.name.str.contains('[Dd]unkin')
starbucks = with_name[is_starbucks]
dunkin = with_name[is_dunkin]
with ProgressBar():
starbucks_count, dunkin_count = dd.compute(starbucks.name.count(), dunkin.name.count())
[########################################] | 100% Completed | 1min 5.8s
starbucks_count, dunkin_count
(5416, 1320)
locs = dd.compute(starbucks.Longitude,
starbucks.Latitude,
dunkin.Longitude,
dunkin.Latitude)
# extract arrays of values fro the series:
lon_s, lat_s, lon_d, lat_d = [loc.values for loc in locs]
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
def draw_USA():
"""initialize a basemap centered on the continental USA"""
plt.figure(figsize=(14, 10))
return Basemap(projection='lcc', resolution='l',
llcrnrlon=-119, urcrnrlon=-64,
llcrnrlat=22, urcrnrlat=49,
lat_1=33, lat_2=45, lon_0=-95,
area_thresh=10000)
m = draw_USA()
# Draw map background
m.fillcontinents(color='white', lake_color='#eeeeee')
m.drawstates(color='lightgray')
m.drawcoastlines(color='lightgray')
m.drawcountries(color='lightgray')
m.drawmapboundary(fill_color='#eeeeee')
# Plot the values in Starbucks Green and Dunkin Donuts Orange
style = dict(s=5, marker='o', alpha=0.5, zorder=2)
m.scatter(lon_s, lat_s, latlon=True,
label="Starbucks", color='#00592D', **style)
m.scatter(lon_d, lat_d, latlon=True,
label="Dunkin' Donuts", color='#FC772A', **style)
plt.legend(loc='lower left', frameon=False);