In [1]:
print "Before Pandas", len(dir())
import pandas
from pandas import *
print "After Pandas", len(dir())
from numpy import *
print "After NumPy", len(dir())
Before Pandas 21
After Pandas 165
After NumPy 706

Reading Data

In [2]:
import urlparse
import httplib
import pandas

csv_data = "http://priede.bf.lu.lv/ftp/grozs/Datorlietas/Geog5028/TIS_PRG/HomePlanet/ASTEROID.CSV"
fwf_data = "http://ssd.jpl.nasa.gov/dat/ELEMENTS.NUMBR"
tax_data = "http://sbn.psi.edu/ferret/reformatTable.action?productId=TAXONOMY10_TAB&dataSetId=EAR-A-5-DDR-TAXONOMY-V6.0"

def save_file(location):
    """Read data at url"""
    url = urlparse.urlparse(location)
    connection = httplib.HTTPConnection(url.netloc)
    connection.connect()
    connection.request('GET', url.path)
    response = connection.getresponse()
    data = response.read()
    filename = '/tmp/' + url.path.split('/')[-1]
    print filename
    with open(filename, 'rw+') as tmp:
        tmp.write(data)
    return filename


# Read NASA asteroid data
#filename = save_file(fwf_data)
filename = "data/ELEMENTS.NUMBR.txt"
with open(filename, 'r') as f:
    widths = map(lambda line: len(line) + 1, f.read().splitlines()[1].split(' '))
asteroids_nasa = pandas.read_fwf(filename, widths=widths, skiprows=[1])

widths = [11, 18, 11, 13, 13, 14, 14, 14, 14, 13, 13, 12, 12, 10, 10, 15, 15, 16, 15, 22]
asteroids_taxonomy = pandas.read_fwf(tax_data, widths=widths, skiprows=[1])
In [71]:
print asteroids_nasa.xs(0)
print '\n'
print asteroids_taxonomy.xs(0)
Num                        1
AST_NAME               Ceres
Epoch                  56600
Semimajor Axis      2.766807
Eccentricity      0.07579726
i                   10.59398
w                   72.29215
Node                80.32764
M                   10.55758
H                       3.34
G                       0.12
Ref\n               JPL 32\n
Name: 0, dtype: object


AST_NUMBER                                       1
AST_NAME                                     Ceres
PROV_ID                                          -
THOLEN_CLASS                                     G
THOLEN_PARAM                                    7G
BARUCCI_CLASS                                   G0
BARUCCI_PARAM                                   7I
TEDESCO_CLASS                                   G?
TEDESCO_PARAM                                   2I
HOWELL_CLASS                                   CvB
HOWELL_PARAM                                    65
SMASS_CLASS                                      -
SMASS_PARAM                                      -
BUS_CLASS                                        C
BUS_PARAM                                        s
S3OS2_CLASS_TH                                   C
S3OS2_CLASS_BB                                   C
BUS_DEMEO_CLASS                                  C
DEMEO_REF_CODE                                   a
COMMENT              \n    -                    \n
Name: 0, dtype: object

Descriptions

In [26]:
asteroids_nasa.ix[:, 0:2].head(10)
Out[26]:
Num Name
0 1 Ceres
1 2 Pallas
2 3 Juno
3 4 Vesta
4 5 Astraea
5 6 Hebe
6 7 Iris
7 8 Flora
8 9 Metis
9 10 Hygiea
In [15]:
asteroids_nasa.describe()
Out[15]:
<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, count to max
Data columns (total 10 columns):
Num      8  non-null values
Epoch    8  non-null values
a        8  non-null values
e        8  non-null values
i        8  non-null values
w        8  non-null values
Node     8  non-null values
M        8  non-null values
H        8  non-null values
G        8  non-null values
dtypes: float64(10)
In [18]:
asteroids_nasa.ix[:, 3].describe()
Out[18]:
count    369956.000000
mean          2.714804
std           2.343523
min           0.617616
25%           2.381873
50%           2.625409
75%           2.949065
max         780.919318
dtype: float64
In [19]:
asteroids_nasa.ix[:, 'a'].describe()
Out[19]:
count    369956.000000
mean          2.714804
std           2.343523
min           0.617616
25%           2.381873
50%           2.625409
75%           2.949065
max         780.919318
dtype: float64
In [30]:
asteroids_nasa.ix[:, 3:7].corr()
Out[30]:
a e i w
a 1.000000 0.028508 0.057180 0.001323
e 0.028508 1.000000 0.102916 0.014825
i 0.057180 0.102916 1.000000 -0.000701
w 0.001323 0.014825 -0.000701 1.000000
In [31]:
asteroids_nasa.rename(columns={'Name': 'AST_NAME', 'a': 'Semimajor Axis', 'e': 'Eccentricity'})
Out[31]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 369956 entries, 0 to 369955
Data columns (total 12 columns):
Num               369956  non-null values
AST_NAME          369956  non-null values
Epoch             369956  non-null values
Semimajor Axis    369956  non-null values
Eccentricity      369956  non-null values
i                 369956  non-null values
w                 369956  non-null values
Node              369956  non-null values
M                 369956  non-null values
H                 369956  non-null values
G                 369956  non-null values
Ref
              369956  non-null values
dtypes: float64(8), int64(2), object(2)
In [32]:
asteroids_nasa
Out[32]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 369956 entries, 0 to 369955
Data columns (total 12 columns):
Num      369956  non-null values
Name     369956  non-null values
Epoch    369956  non-null values
a        369956  non-null values
e        369956  non-null values
i        369956  non-null values
w        369956  non-null values
Node     369956  non-null values
M        369956  non-null values
H        369956  non-null values
G        369956  non-null values
Ref
     369956  non-null values
dtypes: float64(8), int64(2), object(2)
In [4]:
asteroids_nasa.rename(columns={'Name': 'AST_NAME', 'a': 'Semimajor Axis', 'e': 'Eccentricity'}, inplace=True)
In [34]:
asteroids_nasa
Out[34]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 369956 entries, 0 to 369955
Data columns (total 12 columns):
Num               369956  non-null values
AST_NAME          369956  non-null values
Epoch             369956  non-null values
Semimajor Axis    369956  non-null values
Eccentricity      369956  non-null values
i                 369956  non-null values
w                 369956  non-null values
Node              369956  non-null values
M                 369956  non-null values
H                 369956  non-null values
G                 369956  non-null values
Ref
              369956  non-null values
dtypes: float64(8), int64(2), object(2)
In [55]:
asteroids_taxonomy
Out[55]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615 entries, 0 to 2614
Data columns (total 20 columns):
AST_NUMBER                2615  non-null values
AST_NAME                  2615  non-null values
PROV_ID                   2615  non-null values
THOLEN_CLASS              2615  non-null values
THOLEN_PARAM              2615  non-null values
BARUCCI_CLASS             2615  non-null values
BARUCCI_PARAM             2615  non-null values
TEDESCO_CLASS             2615  non-null values
TEDESCO_PARAM             2615  non-null values
HOWELL_CLASS              2615  non-null values
HOWELL_PARAM              2615  non-null values
SMASS_CLASS               2615  non-null values
SMASS_PARAM               2615  non-null values
BUS_CLASS                 2615  non-null values
BUS_PARAM                 2615  non-null values
S3OS2_CLASS_TH            2615  non-null values
S3OS2_CLASS_BB            2615  non-null values
BUS_DEMEO_CLASS           2615  non-null values
DEMEO_REF_CODE            2615  non-null values
COMMENT              
    2615  non-null values
dtypes: int64(1), object(19)
In [63]:
merged = asteroids_nasa.merge(asteroids_taxonomy, on='AST_NAME')
print merged
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2415 entries, 0 to 2414
Data columns (total 31 columns):
Num                       2415  non-null values
AST_NAME                  2415  non-null values
Epoch                     2415  non-null values
Semimajor Axis            2415  non-null values
Eccentricity              2415  non-null values
i                         2415  non-null values
w                         2415  non-null values
Node                      2415  non-null values
M                         2415  non-null values
H                         2415  non-null values
G                         2415  non-null values
Ref
                      2415  non-null values
AST_NUMBER                2415  non-null values
PROV_ID                   2415  non-null values
THOLEN_CLASS              2415  non-null values
THOLEN_PARAM              2415  non-null values
BARUCCI_CLASS             2415  non-null values
BARUCCI_PARAM             2415  non-null values
TEDESCO_CLASS             2415  non-null values
TEDESCO_PARAM             2415  non-null values
HOWELL_CLASS              2415  non-null values
HOWELL_PARAM              2415  non-null values
SMASS_CLASS               2415  non-null values
SMASS_PARAM               2415  non-null values
BUS_CLASS                 2415  non-null values
BUS_PARAM                 2415  non-null values
S3OS2_CLASS_TH            2415  non-null values
S3OS2_CLASS_BB            2415  non-null values
BUS_DEMEO_CLASS           2415  non-null values
DEMEO_REF_CODE            2415  non-null values
COMMENT              
    2415  non-null values
dtypes: float64(8), int64(3), object(20)
In [64]:
merged.pop('Ref\n')
Out[64]:
0      JPL 32\n
1      JPL 26\n
2     JPL 102\n
3      JPL 33\n
4      JPL 83\n
5      JPL 82\n
6     JPL 105\n
7      JPL 90\n
8      JPL 90\n
9      JPL 85\n
10     JPL 70\n
11     JPL 87\n
12     JPL 60\n
13     JPL 57\n
14     JPL 70\n
...
2400     JPL 21\n
2401      JPL 2\n
2402      JPL 3\n
2403     JPL 50\n
2404    JPL 120\n
2405      JPL 1\n
2406     JPL 31\n
2407      JPL 1\n
2408      JPL 3\n
2409      JPL 1\n
2410      JPL 3\n
2411      JPL 1\n
2412      JPL 1\n
2413      JPL 1\n
2414    JPL 127\n
Name: Ref
, Length: 2415, dtype: object
In [65]:
merged.pop('COMMENT              \n')
Out[65]:
0     -                    \n
1     -                    \n
2     -                    \n
3     -                    \n
4     -                    \n
5     -                    \n
6     -                    \n
7     -                    \n
8     -                    \n
9     -                    \n
10    -                    \n
11    -                    \n
12    -                    \n
13    -                    \n
14    -                    \n
...
2400    -                    \n
2401    -                    \n
2402    -                    \n
2403    -                    \n
2404    -                    \n
2405    -                    \n
2406    -                    \n
2407    -                    \n
2408    -                    \n
2409    -                    \n
2410    -                    \n
2411    -                    \n
2412    -                    \n
2413    -                    \n
2414    -                    \n
Name: COMMENT              
, Length: 2415, dtype: object
In [66]:
merged.columns
Out[66]:
Index([Num, AST_NAME, Epoch, Semimajor Axis, Eccentricity, i, w, Node, M, H, G, AST_NUMBER, PROV_ID, THOLEN_CLASS, THOLEN_PARAM, BARUCCI_CLASS, BARUCCI_PARAM, TEDESCO_CLASS, TEDESCO_PARAM, HOWELL_CLASS, HOWELL_PARAM, SMASS_CLASS, SMASS_PARAM, BUS_CLASS, BUS_PARAM, S3OS2_CLASS_TH, S3OS2_CLASS_BB, BUS_DEMEO_CLASS, DEMEO_REF_CODE], dtype=object)
In [67]:
ref_codes = merged.pop('DEMEO_REF_CODE')
merged
Out[67]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2415 entries, 0 to 2414
Data columns (total 28 columns):
Num                2415  non-null values
AST_NAME           2415  non-null values
Epoch              2415  non-null values
Semimajor Axis     2415  non-null values
Eccentricity       2415  non-null values
i                  2415  non-null values
w                  2415  non-null values
Node               2415  non-null values
M                  2415  non-null values
H                  2415  non-null values
G                  2415  non-null values
AST_NUMBER         2415  non-null values
PROV_ID            2415  non-null values
THOLEN_CLASS       2415  non-null values
THOLEN_PARAM       2415  non-null values
BARUCCI_CLASS      2415  non-null values
BARUCCI_PARAM      2415  non-null values
TEDESCO_CLASS      2415  non-null values
TEDESCO_PARAM      2415  non-null values
HOWELL_CLASS       2415  non-null values
HOWELL_PARAM       2415  non-null values
SMASS_CLASS        2415  non-null values
SMASS_PARAM        2415  non-null values
BUS_CLASS          2415  non-null values
BUS_PARAM          2415  non-null values
S3OS2_CLASS_TH     2415  non-null values
S3OS2_CLASS_BB     2415  non-null values
BUS_DEMEO_CLASS    2415  non-null values
dtypes: float64(8), int64(3), object(17)
In [68]:
merged.insert(11, 'DEMEO_REF_CODE', ref_codes)
In [69]:
merged
Out[69]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2415 entries, 0 to 2414
Data columns (total 29 columns):
Num                2415  non-null values
AST_NAME           2415  non-null values
Epoch              2415  non-null values
Semimajor Axis     2415  non-null values
Eccentricity       2415  non-null values
i                  2415  non-null values
w                  2415  non-null values
Node               2415  non-null values
M                  2415  non-null values
H                  2415  non-null values
G                  2415  non-null values
DEMEO_REF_CODE     2415  non-null values
AST_NUMBER         2415  non-null values
PROV_ID            2415  non-null values
THOLEN_CLASS       2415  non-null values
THOLEN_PARAM       2415  non-null values
BARUCCI_CLASS      2415  non-null values
BARUCCI_PARAM      2415  non-null values
TEDESCO_CLASS      2415  non-null values
TEDESCO_PARAM      2415  non-null values
HOWELL_CLASS       2415  non-null values
HOWELL_PARAM       2415  non-null values
SMASS_CLASS        2415  non-null values
SMASS_PARAM        2415  non-null values
BUS_CLASS          2415  non-null values
BUS_PARAM          2415  non-null values
S3OS2_CLASS_TH     2415  non-null values
S3OS2_CLASS_BB     2415  non-null values
BUS_DEMEO_CLASS    2415  non-null values
dtypes: float64(8), int64(3), object(18)
In [5]:
from pandas.stats.api import ols
model = ols(y=asteroids_nasa.xs('Semimajor Axis', axis=1), x=asteroids_nasa.xs('i', axis=1))
print model
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         369956
Number of Degrees of Freedom:   2

R-squared:         0.0033
Adj R-squared:     0.0033

Rmse:              2.3397

F-stat (1, 369954):  1213.5610, p-value:     0.0000

Degrees of Freedom: model 1, resid 369954

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x     0.0233     0.0007      34.84     0.0000     0.0220     0.0246
     intercept     2.5314     0.0065     388.25     0.0000     2.5186     2.5442
---------------------------------End of Summary---------------------------------

In [6]:
import matplotlib.pyplot as plt
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-6-eff513f636fd> in <module>()
----> 1 import matplotlib.pyplot as plt

ImportError: No module named matplotlib.pyplot
In [ ]: