Read Cassandra as DataFrame¶

Short guide on how to read data from Cassandra into pandas dataframe format

toc: true
badges: true
comments: true
categories: [Cassandra]
image:

In [ ]:

import os
from cassandra.cqlengine.models import Model
from cassandra.cqlengine import columns
from datetime import datetime
import pandas as pd

In [ ]:

import os
from datetime import datetime

from cassandra.cqlengine.management import sync_table
from cassandra.policies import TokenAwarePolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import (
    Cluster,
    DCAwareRoundRobinPolicy
)
from cassandra.cqlengine.connection import (
    register_connection,
    set_default_connection
)

In [ ]:

CASSANDRA_USERNAME='cassandra'
CASSANDRA_PASSWORD='cassandra'
CASSANDRA_HOST='127.0.0.1'
CASSANDRA_PORT=9042

In [ ]:

session = None
cluster = None

auth_provider = PlainTextAuthProvider(username=CASSANDRA_USERNAME, password=CASSANDRA_PASSWORD)
cluster = Cluster([CASSANDRA_HOST],
load_balancing_policy=TokenAwarePolicy(DCAwareRoundRobinPolicy()),
port=CASSANDRA_PORT,
auth_provider=auth_provider,
executor_threads=2,
protocol_version=4,
)           

ipykernel_launcher:10: DeprecationWarning: Legacy execution parameters will be removed in 4.0. Consider using execution profiles.

In [ ]:

session = cluster.connect()
register_connection(str(session), session=session)
set_default_connection(str(session))

In [ ]:

rows = session.execute('select * from demo.click_stream;')
df = pd.DataFrame(list(rows))
df.head()

Out[ ]:

	pk	created_at	client_ip	cookie_id	event_name	item_id	url
0	2021-05-01	2021-05-01 15:59:08.188	172.20.0.1	0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN	view	10009	http://127.0.0.1:8051/
1	2021-05-01	2021-05-01 15:59:08.256	172.20.0.1	0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN	view	10004	http://127.0.0.1:8051/
2	2021-05-01	2021-05-01 15:59:08.265	172.20.0.1	0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN	view	10007	http://127.0.0.1:8051/
3	2021-05-01	2021-05-01 15:59:08.270	172.20.0.1	0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN	view	10002	http://127.0.0.1:8051/
4	2021-05-01	2021-05-01 15:59:08.280	172.20.0.1	0:ko0sujvr:WcQ5Z8kIqvWRLp3~Rlpxu_ygeqqQMdkN	view	10001	http://127.0.0.1:8051/

In [ ]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   pk          265 non-null    object        
 1   created_at  265 non-null    datetime64[ns]
 2   client_ip   265 non-null    object        
 3   cookie_id   265 non-null    object        
 4   event_name  265 non-null    object        
 5   item_id     265 non-null    object        
 6   url         265 non-null    object        
dtypes: datetime64[ns](1), object(6)
memory usage: 14.6+ KB

In [ ]:

df.describe()

ipykernel_launcher:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.

Out[ ]:

	pk	created_at	client_ip	cookie_id	event_name	item_id	url
count	265	265	265	265	265	265	265
unique	1	265	1	5	4	12	2
top	2021-05-01	2021-05-01 18:43:48.776000	172.20.0.1	0:ko5xo7an:1ORfub~MP9WFwk9lasJHFV70F~lkpcVk	view	10002	http://127.0.0.1:8051/
freq	265	1	265	160	260	38	264
first	NaN	2021-05-01 15:59:08.188000	NaN	NaN	NaN	NaN	NaN
last	NaN	2021-05-01 18:44:17.576000	NaN	NaN	NaN	NaN	NaN

In [ ]:

df.item_id.value_counts()

Out[ ]:

10002    38
10006    34
10003    30
10000    28
10009    27
10001    24
10004    24
10007    20
10005    20
10008    18
1204      1
1231      1
Name: item_id, dtype: int64

In [ ]:

df.to_pickle('../recommender/data/logs_test_020521_1.p')