In [1]:

%load_ext autoreload
%autoreload 2

In [2]:

import sys
# Add the parent directory to the module search path before `optimus` is imported.
# NOTE(review): presumably this makes the in-repo `optimus` package importable when
# the notebook runs from a subdirectory — confirm. The path is relative to the
# kernel's working directory.
sys.path.append("..")

In [3]:

from optimus import Optimus

C:\Users\argenisleon\Anaconda3\lib\site-packages\dask\config.py:161: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
  data = yaml.load(f.read()) or {}
C:\Users\argenisleon\Anaconda3\lib\site-packages\statsmodels\compat\pandas.py:49: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)

In [4]:

# Create the Optimus entry point used by the cells below.
# NOTE(review): comm=True appears to be what produces the Bumblebee link printed in
# this cell's output — confirm against the Optimus documentation.
op = Optimus(comm=True)

    You are using PySparkling of version 2.4.10, but your PySpark is of
    version 2.3.1. Please make sure Spark and PySparkling versions are compatible.

Open Bumblebee: https://app.hi-bumblebee.com

If you really care about privacy get your keys in bumblebee.ini and put them here

In [6]:

# Explicit imports replace `from pyspark.sql.types import *`, so it is clear where
# each type name comes from and the notebook namespace is not flooded.
from datetime import date, datetime

from pyspark.sql.types import (
    ArrayType,
    BinaryType,
    BooleanType,
    ByteType,
    DateType,
    FloatType,
    NullType,
    ShortType,
    TimestampType,
)

# Column specification for the sample frame. Each entry is either a
# (name, dtype) pair, where dtype is a short string or a PySpark type instance,
# or a bare column name with no dtype given.
cols = [
    ("names", "str"),
    ("height(ft)", ShortType()),
    ("function", "str"),
    ("rank", ByteType()),
    ("age", "int"),
    ("weight(t)", "float"),
    "japanese name",
    "last position seen",
    "date arrival",
    "last date seen",
    ("attributes", ArrayType(FloatType())),
    ("DateType", DateType()),
    ("timestamp", TimestampType()),
    ("Cybertronian", BooleanType()),
    ("function(binary)", BinaryType()),
    ("NullType", NullType()),
]

# One tuple per row, values in the same order as `cols`. Several rows contain
# None values, and some strings carry punctuation, accents or trailing spaces.
rows = [
    ("argenisleon@gmail.com", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
     "2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
     None),
    ("bumbl#ebéé  ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
     "2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
     None),
    ("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
     "2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
     None),
    ("1 Megatron", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
     "2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
     bytearray("First Lieutenant", "utf-8"), None),
    ("1 Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
     date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
    (None, 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
     [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
]

# Build the frame, cache it and collapse it to a single partition.
# NOTE(review): the third positional argument (False) is presumably the
# schema-inference flag of op.create.df — confirm.
df = op.create.df(cols, rows, False).cache().repartition(1)

In [7]:

# Render the hand-built frame. NOTE(review): 20 is presumably the maximum number
# of rows to show; the frame has only 6 rows, so all of them appear.
df.ext.display(20)

Viewing 6 of 6 rows / 16 columns

1 partition(s)

names 1 (string) not nullable	height(ft) 2 (smallint) not nullable	function 3 (string) not nullable	rank 4 (tinyint) not nullable	age 5 (int) not nullable	weight(t) 6 (float) not nullable	japanese name 7 (string) not nullable	last position seen 8 (string) not nullable	date arrival 9 (string) not nullable	last date seen 10 (string) not nullable	attributes 11 (array<float>) not nullable	DateType 12 (date) not nullable	timestamp 13 (timestamp) not nullable	Cybertronian 14 (boolean) not nullable	function(binary) 15 (binary) not nullable	NullType 16 (null) not nullable
argenisleon@gmail.com	28.0	Leader	10	5000000	4.300000190734863	[Inochi,⋅Convoy]	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None
bumbl#ebéé⋅⋅	17.0	Espionage	7	5000000	2.0	[Bumble,⋅Goldback]	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26.0	Security	7	5000000	4.0	[Roadbuster]	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
1⋅Megatron	13.0	First⋅Lieutenant	8	5000000	1.7999999523162842	[Meister]	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None
1⋅Megatron	nan	None	10	5000000	5.699999809265137	[Megatron]	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None
None	300.0	Battle⋅Station	8	5000000	nan	[Metroflex]	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None

Viewing 6 of 6 rows / 16 columns

1 partition(s) <class 'pyspark.sql.dataframe.DataFrame'>

In [8]:

# Rebind `df` to the crime sample CSV hosted in the Optimus GitHub repository.
# From here on `df` no longer refers to the hand-built frame created earlier.
crime_csv_url = "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/crime.csv"
df = op.load.csv(
    crime_csv_url,
    sep=",",
    header='true',
    infer_schema='true',
    charset="UTF-8",
    null_value="None",
)

In [9]:

# Preview the loaded CSV; called without an argument, the recorded output shows
# 10 of the 319073 rows.
df.ext.display()

Viewing 10 of 319073 rows / 17 columns

8 partition(s)

INCIDENT_NUMBER 1 (string) not nullable	OFFENSE_CODE 2 (int) not nullable	OFFENSE_CODE_GROUP 3 (string) not nullable	OFFENSE_DESCRIPTION 4 (string) not nullable	DISTRICT 5 (string) not nullable	REPORTING_AREA 6 (string) not nullable	SHOOTING 7 (string) not nullable	OCCURRED_ON_DATE 8 (timestamp) not nullable	YEAR 9 (int) not nullable	MONTH 10 (int) not nullable	DAY_OF_WEEK 11 (string) not nullable	HOUR 12 (int) not nullable	UCR_PART 13 (string) not nullable	STREET 14 (string) not nullable	Lat 15 (double) not nullable	Long 16 (double) not nullable	Location 17 (string) not nullable
I182070945	619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	None	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	Vandalism	VANDALISM	C11	347	None	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	None	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	None	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	None	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	None	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	None	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
I182070932	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	None	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	Robbery	ROBBERY⋅-⋅STREET	C6	177	None	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	None	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)

Viewing 10 of 319073 rows / 17 columns

8 partition(s) <class 'pyspark.sql.dataframe.DataFrame'>

In [10]:

# For every column ("*"), tally null, missing and typed values; the result is a
# dict keyed by column name (see the recorded output).
# NOTE(review): infer=False presumably makes the count rely on the schema dtype
# instead of inspecting each value — confirm.
df.cols.count_by_dtypes("*", infer=False)

Out[10]:

{'SHOOTING': {'null': 318054, 'missing': 0, 'string': 1019},
 'MONTH': {'null': 0, 'missing': 0, 'int': 319073},
 'HOUR': {'null': 0, 'missing': 0, 'int': 319073},
 'Lat': {'null': 19999, 'missing': 0, 'decimal': 299074},
 'STREET': {'null': 10871, 'missing': 0, 'string': 308202},
 'DISTRICT': {'null': 1765, 'missing': 0, 'string': 317308},
 'OFFENSE_CODE_GROUP': {'null': 0, 'missing': 0, 'string': 319073},
 'REPORTING_AREA': {'null': 0, 'missing': 0, 'string': 319073},
 'OCCURRED_ON_DATE': {'null': 0, 'missing': 0, 'date': 319073},
 'UCR_PART': {'null': 90, 'missing': 0, 'string': 318983},
 'INCIDENT_NUMBER': {'null': 0, 'missing': 0, 'string': 319073},
 'DAY_OF_WEEK': {'null': 0, 'missing': 0, 'string': 319073},
 'OFFENSE_DESCRIPTION': {'null': 0, 'missing': 0, 'string': 319073},
 'YEAR': {'null': 0, 'missing': 0, 'int': 319073},
 'Long': {'null': 19999, 'missing': 0, 'decimal': 299074},
 'OFFENSE_CODE': {'null': 0, 'missing': 0, 'int': 319073},
 'Location': {'null': 0, 'missing': 0, 'string': 319073}}

In [11]:

from optimus.helpers.check import is_column_a
# NOTE(review): the return value of is_column_a is discarded, so this line has no
# visible effect; only the print below produces output.
is_column_a(df,"OCCURRED_ON_DATE","timestamp")
# Print the schema dtype of the column (TimestampType in the recorded output).
print(df.cols.schema_dtype("OCCURRED_ON_DATE"))

TimestampType

In [12]:

# List the (column name, dtype string) pairs of the frame's schema.
df.dtypes

Out[12]:

[('INCIDENT_NUMBER', 'string'),
 ('OFFENSE_CODE', 'int'),
 ('OFFENSE_CODE_GROUP', 'string'),
 ('OFFENSE_DESCRIPTION', 'string'),
 ('DISTRICT', 'string'),
 ('REPORTING_AREA', 'string'),
 ('SHOOTING', 'string'),
 ('OCCURRED_ON_DATE', 'timestamp'),
 ('YEAR', 'int'),
 ('MONTH', 'int'),
 ('DAY_OF_WEEK', 'string'),
 ('HOUR', 'int'),
 ('UCR_PART', 'string'),
 ('STREET', 'string'),
 ('Lat', 'double'),
 ('Long', 'double'),
 ('Location', 'string')]

In [36]:

# NOTE(review): this call fails in the recorded run. According to the traceback
# below, std() filters the selection by numeric column dtypes; for the timestamp
# column OCCURRED_ON_DATE the parsed column list is None, and
# check_column_numbers raises ValueError.
df.cols.std("OCCURRED_ON_DATE")

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-36-c1c8bd292777> in <module>
----> 1 df.cols.std("OCCURRED_ON_DATE")

~\Documents\Optimus\optimus\engines\spark\columns.py in std(columns)
    745             """
    746             columns = parse_columns(self, columns, filter_by_column_dtypes=self.constants.NUMERIC_TYPES)
--> 747             check_column_numbers(columns, "*")
    748 
    749             return format_dict(Cols.agg_exprs(columns, F.stddev))

~\Documents\Optimus\optimus\helpers\columns.py in check_column_numbers(columns, number)
    216     if columns is None:
    217         RaiseIt.value_error(columns, ["str", "list"],
--> 218                             extra_text="Maybe the columns selected do not match a specified datatype filter.")
    219 
    220     count = len(columns)

~\Documents\Optimus\optimus\helpers\raiseit.py in value_error(var, data_values, extra_text)
     74                                  type=divisor.join(map(
     75                                      lambda x: "'" + x + "'",
---> 76                                      data_values)), var_type=one_list_to_val(var), extra_text=extra_text))
     77 
     78     @staticmethod

ValueError: 'columns' must be 'str' or 'list', received 'None'. Maybe the columns selected do not match a specified datatype filter.

In [ ]:

# Not executed in the recorded run (no execution count, no output).
# NOTE(review): presumably sends information about this column to Bumblebee, since
# the Optimus instance was created with comm=True — confirm.
df.ext.send("OCCURRED_ON_DATE")

In [ ]:

# Histogram for every column ("*"). Not executed in the recorded run (no
# execution count, no output).
df.cols.hist("*")

In [45]:

# Histogram of a string column. NOTE(review): the recorded buckets run from 0 to
# 50 and almost all rows fall in 10-12.5, which suggests the string length is what
# gets binned — confirm against the Optimus implementation. The "VVV" / "DATA" /
# "EXEC AGG" lines in the output are printed by the library, not by this cell.
df.cols.hist("INCIDENT_NUMBER")

VVV StringType
DATA (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>)
VVV StringType
DATA (<class 'pyspark.sql.types.StringType'>,)
EXEC AGG 1

Out[45]:

{'INCIDENT_NUMBER': {'hist': [{'count': 0.0, 'lower': 0.0, 'upper': 2.5},
   {'count': 0.0, 'lower': 2.5, 'upper': 5.0},
   {'count': 0.0, 'lower': 5.0, 'upper': 7.5},
   {'count': 1.0, 'lower': 7.5, 'upper': 10.0},
   {'count': 318719.0, 'lower': 10.0, 'upper': 12.5},
   {'count': 353.0, 'lower': 12.5, 'upper': 15.0},
   {'count': 0.0, 'lower': 15.0, 'upper': 17.5},
   {'count': 0.0, 'lower': 17.5, 'upper': 20.0},
   {'count': 0.0, 'lower': 20.0, 'upper': 22.5},
   {'count': 0.0, 'lower': 22.5, 'upper': 25.0},
   {'count': 0.0, 'lower': 25.0, 'upper': 27.5},
   {'count': 0.0, 'lower': 27.5, 'upper': 30.0},
   {'count': 0.0, 'lower': 30.0, 'upper': 32.5},
   {'count': 0.0, 'lower': 32.5, 'upper': 35.0},
   {'count': 0.0, 'lower': 35.0, 'upper': 37.5},
   {'count': 0.0, 'lower': 37.5, 'upper': 40.0},
   {'count': 0.0, 'lower': 40.0, 'upper': 42.5},
   {'count': 0.0, 'lower': 42.5, 'upper': 45.0},
   {'count': 0.0, 'lower': 45.0, 'upper': 47.5},
   {'count': 0.0, 'lower': 47.5, 'upper': 50.0}]}}

In [25]:

# Tukey outlier analysis on "height(ft)", then display what select() returns (a
# single row with height 300 in the recorded output).
# NOTE(review): "height(ft)" belongs to the hand-built frame, so `df` must still be
# that frame (not the crime CSV) when this cell runs.
height_outliers = df.outliers.tukey("height(ft)")
height_outliers.select().ext.display()

ShortType (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>)
0.5 44.5
ShortType (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>)

Viewing 1 of 1 rows / 16 columns

1 partition(s)

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (string) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
None	300	Battle⋅Station	8	5000000	None	[Metroflex]	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None

Viewing 1 of 1 rows / 16 columns

1 partition(s)

In [8]:

# NOTE(review): `outlier` is only assigned further down (tukey on "mass (g)"), and
# neither frame created above has a "price" column, so this output presumably comes
# from another session. Define `outlier` on a frame with "price" before running.
outlier.hist("price")

Out[8]:

'{"price": {"hist": [{"count": 6.0, "lower": 8.0, "upper": 8.1}, {"count": 0.0, "lower": 8.1, "upper": 8.2}, {"count": 0.0, "lower": 8.2, "upper": 8.3}, {"count": 0.0, "lower": 8.3, "upper": 8.4}, {"count": 0.0, "lower": 8.4, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 8.6}, {"count": 0.0, "lower": 8.6, "upper": 8.7}, {"count": 0.0, "lower": 8.7, "upper": 8.8}, {"count": 0.0, "lower": 8.8, "upper": 8.9}, {"count": 0.0, "lower": 8.9, "upper": 9.0}, {"count": 2.0, "lower": 9.0, "upper": 9.1}, {"count": 0.0, "lower": 9.1, "upper": 9.2}, {"count": 0.0, "lower": 9.2, "upper": 9.3}, {"count": 0.0, "lower": 9.3, "upper": 9.4}, {"count": 0.0, "lower": 9.4, "upper": 9.5}, {"count": 0.0, "lower": 9.5, "upper": 9.6}, {"count": 0.0, "lower": 9.6, "upper": 9.7}, {"count": 0.0, "lower": 9.7, "upper": 9.8}, {"count": 0.0, "lower": 9.8, "upper": 9.9}, {"count": 0.0, "lower": 9.9, "upper": 10.0}]}}'

In [12]:

# Tally null, missing and typed values for the "id" column.
# NOTE(review): neither frame created above has an "id" column; the recorded output
# (19 int values) presumably comes from a different dataset bound to `df`.
df.cols.count_by_dtypes("id")

Out[12]:

{'id': {'null': 0, 'missing': 0, 'int': 19}}

In [22]:

# Row count of the current frame; no value was captured in the recorded output.
df.count()

Out[22]:

In [24]:

# Summary of the outlier analysis as a dict; the recorded output has outlier and
# non-outlier counts plus the lower/upper bounds and a count for each bound.
# NOTE(review): `outlier` is not assigned above this cell — see the tukey cell below.
outlier.info()

Out[24]:

{'count_outliers': 9,
 'count_non_outliers': 10,
 'lower_bound': 6,
 'lower_bound_count': 9,
 'upper_bound': 10,
 'upper_bound_count': 0}

In [11]:

# df.table()

In [12]:

# Count how many values in "names" match the expected "email" type.
# The previous dict literal listed the key "names" twice; Python keeps only the
# last value for a repeated key, so the "argenisleon@gmail.com" entry was silently
# dropped and the call already received {"names": "email"}.
df.cols.count_mismatch({"names": "email"})

Out[12]:

{'names': {'email': 1, 'mismatch': 4, 'null': 1, 'missing': 0}}

In [14]:

# Hand-written dict with the same keys as a count_mismatch result for "names",
# minus 'missing'. No later cell in the visible notebook reads it.
a = {
    "names": {
        "email": 1,
        "mismatch": 4,
        "null": 1,
    },
}

In [15]:

# Collect the dict's values (the dtype names) as a tuple, in insertion order.
name_to_dtype = {"firstName": "string", "lastName": "array"}
tuple(name_to_dtype.values())

Out[15]:

('string', 'array')

In [16]:

from infer import Infer

In [17]:

# Same import as the previous cell (redundant once that cell has run).
from infer import Infer
# Check one (column, value) pair against the expected dtype mapping; in the
# recorded output a None value under "names" is reported as (('names', 'null'), 1).
Infer.mismatch(("names",None),{"names":"email"})

Out[17]:

(('names', 'null'), 1)

In [20]:

# No output was recorded for this call.
# NOTE(review): presumably checks the value 12 against the "string" dtype — confirm
# the semantics and return value of Infer.value.
Infer.value(12, "string")

In [36]:

# Collect the dict's keys (the column names) as a list, in insertion order.
string_columns = {"firstName": "string", "lastName": "string"}
list(string_columns)

Out[36]:

['firstName', 'lastName']

In [8]:

# NOTE(review): the recorded run raised NameError because `df` did not exist in
# that kernel session; the cell needs a frame with a "names" column bound to `df`
# before it is run.
df.rows.select_by_dtypes("names","str")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-5a6988a57346> in <module>
----> 1 df.rows.select_by_dtypes("names","str")

NameError: name 'df' is not defined

In [117]:

# Select rows by "height(ft)" using the bounds 17 and 26 and render them.
# NOTE(review): the recorded output includes a row with height 13, which lies
# outside 17-26 — verify the semantics of between() with equal=True.
df.rows.between("height(ft)", 17, 26, invert=False, equal=True).table()

Viewing 3 of 3 rows / 16 columns

1 partition(s)

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
bumbl#ebéé⋅⋅	17	Espionage	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26	Security	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
1⋅Megatron	13	First⋅Lieutenant	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None

Viewing 3 of 3 rows / 16 columns

1 partition(s)

In [ ]:

In [55]:

# Reverse the characters of each value in "function" (e.g. "Leader" becomes
# "redaeL" in the recorded output) and render the resulting frame.
function_reversed = df.cols.reverse("function")
function_reversed.table()

Viewing 6 of 6 rows / 16 columns

1 partition(s)

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
Optimus⋅OptimusPrime	28	redaeL	10	5000000	4.300000190734863	['Inochi',⋅'Convoy']	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None
bumbl#ebéé⋅⋅	17	eganoipsE	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26	ytiruceS	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
1⋅Megatron	13	tnanetueiL⋅tsriF	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None
1⋅Megatron	None	enoN	10	5000000	5.699999809265137	['Megatron']	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None
megatron⋅1	300	noitatS⋅elttaB	8	5000000	None	['Metroflex']	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None

Viewing 6 of 6 rows / 16 columns

1 partition(s)

In [20]:

# Tukey outlier analysis on the "mass (g)" column; the result is kept in `outlier`.
# NOTE(review): neither frame created above has a "mass (g)" column, so `df` must
# be loaded from another dataset before this cell can run.
outlier = df.outliers.tukey("mass (g)")

In [28]:

# print(outlier.info())
# Returns a JSON string (see the recorded output) holding a "columns" list and the
# selected "mass (g)" values.
# NOTE(review): presumably these are the rows related to the lower Tukey bound —
# confirm which side of the bound is selected.
outlier.select_lower_bound()

Out[28]:

'{"columns": [{"title": "mass (g)"}], "value": [[21.0], [160.0], [252.0], [256.8], [320.0], [41.0], [94.2], [265.0], [146.0], [134.0], [345.0], [14.0], [23.2], [17.0], [375.0], [270.0], [13.9], [18.0], [100.0], [488.1], [470.0], [67.8], [56.0], [190.0], [219.0], [324.0], [357.0], [212.0], [478.0], [342.0], [8.0], [94.0], [45.6], [0.5], [72.0], [367.0], [303.0], [48.6], [469.0], [78.4], [167.0], [100.0], [340.0], [28.0], [0.8], [230.0], [400.0], [438.0], [230.0], [30.0], [300.0], [188.0], [127.0], [277.0], [113.0], [107.2], [380.0], [82.0], [220.0], [240.0], [132.7], [36.1], [28.0], [380.0], [102.0], [480.0], [45.5], [215.0], [288.0], [28.0], [0.2], [315.0], [414.0], [167.7], [305.5], [180.0], [266.1], [112.0], [22.0], [450.0], [222.0], [100.0], [30.0], [483.0], [89.0], [230.0], [350.0], [448.0], [299.0], [400.0], [180.0], [450.0], [100.0], [331.0], [195.0], [140.0], [67.4], [97.7], [202.6], [136.0]]}'

In [256]:

# Add a fingerprint column for "product" ("product***FINGERPRINT" in the recorded
# output) and render the result.
# NOTE(review): `keyCol` is imported in a later cell
# (`from optimus.ml import keycollision as keyCol`); on a fresh top-to-bottom run
# that import has to be executed first.
product_fingerprint = keyCol.fingerprint(df, "product")
product_fingerprint.table()

Viewing 10 of 19 rows / 9 columns

1 partition(s)

id 1 (int) nullable	firstName 2 (string) nullable	lastName 3 (string) nullable	billingId 4 (int) nullable	product 5 (string) nullable	price 6 (int) nullable	birth 7 (string) nullable	dummyCol 8 (string) nullable	product***FINGERPRINT 9 (string) nullable
1	Luis	Alvarez$$%!	123	Cake	10	1980/07/07	never	cake
2	André	Ampère	423	piza	8	1950/07/08	gonna	piza
3	NiELS	Böhr//((%%	551	pizza	8	1990/07/09	give	pizza
4	PAUL	dirac$	521	pizza	8	1954/07/10	you	pizza
5	Albert	Einstein	634	pizza	8	1990/07/11	up	pizza
6	Galileo	⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI	672	arepa	5	1930/08/12	never	arepa
7	CaRL	Ga%%%uss	323	taco	3	1970/07/13	gonna	taco
8	David	H$$$ilbert	624	taaaccoo	3	1950/07/14	let	taaaccoo
9	Johannes	KEPLER	735	taco	3	1920/04/22	you	taco
10	JaMES	M$$ax%%well	875	taco	3	1923/03/12	down	taco

Viewing 10 of 19 rows / 9 columns

1 partition(s)

In [245]:

# Add a fingerprint column for "names" ("names***FINGERPRINT" in the recorded
# output; e.g. "bumbl#ebéé  " becomes "bumblebee") and render the result.
# NOTE(review): `keyCol` is imported in a later cell; run that import first.
names_fingerprint = keyCol.fingerprint(df, "names")
names_fingerprint.table()

Viewing 6 of 6 rows / 17 columns

1 partition(s)

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable	names***FINGERPRINT 17 (string) nullable
Optimus⋅OptimusPrime	28	Leader	10	5000000	4.300000190734863	['Inochi',⋅'Convoy']	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None	optimusoptimusprime
bumbl#ebéé⋅⋅	17	Espionage	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None	bumblebee
ironhide&	26	Security	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None	ironhide
1⋅Megatron	13	First⋅Lieutenant	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None	1megatron
1⋅Megatron	None	None	10	5000000	5.699999809265137	['Megatron']	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None	1megatron
megatron⋅1	300	Battle⋅Station	8	5000000	None	['Metroflex']	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None	1megatron

Viewing 6 of 6 rows / 17 columns

1 partition(s)

In [259]:

# Cluster the "product" values by fingerprint; with output="json" the recorded
# result is a JSON string mapping each cluster to its similar values, count and sum.
# NOTE(review): `keyCol` is imported in a later cell; run that import first.
keyCol.fingerprint_cluster(
    df,
    "product",
    output="json",
)

Out[259]:

'{"taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "pizzza": {"similar": {"pizzza": 1}, "count": 1, "sum": 1}, "arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "pizza": {"similar": {"pizza": 4}, "count": 1, "sum": 4}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}}'

In [261]:

# Cluster the "product" values by n-gram fingerprint with n_size=2; in the recorded
# output "pizzza" and "pizza" end up in the same cluster.
# NOTE(review): `keyCol` is imported in a later cell; run that import first.
keyCol.n_gram_fingerprint_cluster(
    df,
    "product",
    output="json",
    n_size=2,
)

Out[261]:

'{"arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}, "pizza": {"similar": {"pizzza": 1, "pizza": 4}, "count": 2, "sum": 5}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}}'

In [7]:

from optimus.ml import keycollision as keyCol
from optimus.ml import distancecluster as dc

In [258]:

# Run `levenshtein_cluster` on `product` and return the clusters as a JSON string.
# NOTE(review): in the recorded output some keys list many unrelated values as
# similar (e.g. "110790" lists 11 values including "taco" and "pizza") — verify
# that the distance cut-off used by the library is what is intended here.
dc.levenshtein_cluster(df,"product", output="json")

Out[258]:

'{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}'

In [31]:

# Run `n_gram_fingerprint_cluster` on the `names` column with n_size=1 and JSON output.
# The recorded run also prints an intermediate table with `names***NGRAM` and
# `names***NGRAM_FINGERPRINT` columns before returning the JSON string; each of
# the six names ends up in its own cluster.
keyCol.n_gram_fingerprint_cluster(df,"names", n_size=1,output="json")

Viewing 6 of 6 rows / 4 columns

1 partition(s)

count 1 (string) not nullable	names 2 (string) nullable	names***NGRAM 3 (array<string>) not nullable	names***NGRAM_FINGERPRINT 4 (string) nullable
1	bumbl#ebéé⋅⋅	['bumblebee']	bumblebee
1	ironhide&	['ironhide']	ironhide
1	Megatron2	['megatron2']	megatron2
1	Optimus⋅OptimusPrime	['optimusoptimusprime']	optimusoptimusprime
1	Megatron1	['megatron1']	megatron1
1	Megatron	['megatron']	megatron

Viewing 6 of 6 rows / 4 columns

1 partition(s)

Out[31]:

'{"ironhide&": {"similar": {"ironhide&": 1}, "count": 1, "sum": 1.0}, "Megatron1": {"similar": {"Megatron1": 1}, "count": 1, "sum": 1.0}, "Optimus OptimusPrime": {"similar": {"Optimus OptimusPrime": 1}, "count": 1, "sum": 1.0}, "Megatron": {"similar": {"Megatron": 1}, "count": 1, "sum": 1.0}, "bumbl#eb\\u00e9\\u00e9  ": {"similar": {"bumbl#eb\\u00e9\\u00e9  ": 1}, "count": 1, "sum": 1.0}, "Megatron2": {"similar": {"Megatron2": 1}, "count": 1, "sum": 1.0}}'

In [25]:

# Display `df` as a table; the recorded output is a 6-row, 16-column frame.
df.table()

Viewing 6 of 6 rows / 16 columns

1 partition(s)

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
Optimus⋅OptimusPrime	28	Leader	10	5000000	4.300000190734863	['Inochi',⋅'Convoy']	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None
bumbl#ebéé⋅⋅	17	Espionage	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26	Security	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
Megatron1	13	First⋅Lieutenant	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None
Megatron	None	None	10	5000000	5.699999809265137	['Megatron']	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None
megatron	300	Battle⋅Station	8	5000000	None	['Metroflex']	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None

Viewing 6 of 6 rows / 16 columns

1 partition(s)

In [81]:

# df = op.load.csv("data/foo.csv", sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")

In [82]:

# Display `df` as a table; the recorded output is a 6-row, 16-column frame whose
# `names` column contains "JaJa JaJaJ" and "Metroplex_)^$".
df.table()

Viewing 6 of 6 rows / 16 columns

1 partition(s)

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
Optimus⋅OptimusPrime	28	Leader	10	5000000	4.300000190734863	['Inochi',⋅'Convoy']	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None
bumbl#ebéé⋅⋅	17	Espionage	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26	Security	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
JaJa⋅JaJaJ	13	First⋅Lieutenant	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None
Megatron	None	None	10	5000000	5.699999809265137	['Megatron']	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None
Metroplex_)^$	300	Battle⋅Station	8	5000000	None	['Metroflex']	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None

Viewing 6 of 6 rows / 16 columns

1 partition(s)

In [95]:

# Replace the words "JaJa" and "bbb" in `names` with "aaa" and display the result.
# With search_by="words" the recorded output turns "JaJa JaJaJ" into "aaa JaJaJ":
# only the standalone word "JaJa" is replaced, not the "JaJa" prefix of "JaJaJ".
df.cols.replace("names",["JaJa","bbb"],"aaa",search_by="words").table()

Viewing 6 of 6 rows / 16 columns

1 partition(s)

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
Optimus⋅OptimusPrime	28	Leader	10	5000000	4.300000190734863	['Inochi',⋅'Convoy']	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None
bumbl#ebéé⋅⋅	17	Espionage	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26	Security	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
aaa⋅JaJaJ	13	First⋅Lieutenant	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None
Megatron	None	None	10	5000000	5.699999809265137	['Megatron']	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None
Metroplex_)^$	300	Battle⋅Station	8	5000000	None	['Metroflex']	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None

Viewing 6 of 6 rows / 16 columns

1 partition(s)

In [20]:

# Call `send()` on the frame; the recorded run prints "Send!".
# NOTE(review): presumably this pushes the frame to the Bumblebee front end
# enabled by `Optimus(comm=True)` at the top of the notebook — confirm.
df.send()

Send!

In [7]:

# Display `df` with an argument of 20 (presumably the row limit — confirm).
# The recorded output shows all 19 rows of an 8-column frame that includes the
# `product` and `price` columns.
df.table(20)

Out[7]:

Viewing 19 of 19 rows / 8 columns

1 partition(s)

id 1 (int) nullable	firstName 2 (string) nullable	lastName 3 (string) nullable	billingId 4 (int) nullable	product 5 (string) nullable	price 6 (int) nullable	birth 7 (string) nullable	dummyCol 8 (string) nullable
1	Luis	Alvarez$$%!	123	Cake	10	1980/07/07	never
2	André	Ampère	423	piza	8	1950/07/08	gonna
3	NiELS	Böhr//((%%	551	pizza	8	1990/07/09	give
4	PAUL	dirac$	521	pizza	8	1954/07/10	you
5	Albert	Einstein	634	pizza	8	1990/07/11	up
6	Galileo	⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI	672	arepa	5	1930/08/12	never
7	CaRL	Ga%%%uss	323	taco	3	1970/07/13	gonna
8	David	H$$$ilbert	624	taaaccoo	3	1950/07/14	let
9	Johannes	KEPLER	735	taco	3	1920/04/22	you
10	JaMES	M$$ax%%well	875	taco	3	1923/03/12	down
11	Isaac	Newton	992	pasta	9	1999/02/15	never⋅
12	Emmy%%	Nöether$	234	pasta	9	1993/12/08	gonna
13	Max!!!	Planck!!!	111	hamburguer	4	1994/01/04	run⋅
14	Fred	Hoy&&&le	553	pizzza	8	1997/06/27	around
15	(((⋅⋅⋅Heinrich⋅)))))	Hertz	116	pizza	8	1956/11/30	and
16	William	Gilbert###	886	BEER	2	1958/03/26	desert
17	Marie	CURIE	912	Rice	1	2000/03/22	you
18	Arthur	COM%%%pton	812	110790	5	1899/01/01	#
19	JAMES	Chadwick	467	null	10	1921/05/03	#

Viewing 19 of 19 rows / 8 columns

1 partition(s)

In [10]:

# Summarise z-score outlier detection on `price` with threshold=1.
# `.info()` returns a dict with `count_outliers`, `count_non_outliers` and `max_z_score`.
df.outliers.z_score("price",threshold =1).info()

Out[10]:

{'count_outliers': 8, 'count_non_outliers': 11, 'max_z_score': 1.7111}

In [8]:

# Summarise Tukey outlier detection on `price`.
# `.info()` returns a dict with outlier counts, lower/upper bounds (with their
# own counts) and the `iqr1` / `iqr3` values.
df.outliers.tukey("price").info()

Out[8]:

{'count_outliers': 0,
 'count_non_outliers': 19,
 'lower_bound': -4.5,
 'lower_bound_count': 0,
 'upper_bound': 15.5,
 'upper_bound_count': 0,
 'iqr1': 3,
 'iqr3': 8}

In [9]:

# Summarise MAD-based outlier detection on `price` with threshold=1.
# NOTE(review): the recorded output reports count_outliers=9 together with
# count_non_outliers=19, which cannot both hold for a 19-row frame — verify how
# the library computes `count_non_outliers`.
df.outliers.mad("price", threshold =1).info()

Out[9]:

{'count_outliers': 9,
 'count_non_outliers': 19,
 'lower_bound': 6,
 'lower_bound_count': 9,
 'upper_bound': 10,
 'upper_bound_count': 0}

In [11]:

# Summarise modified z-score outlier detection on `price` with threshold=1.
# NOTE(review): the recorded output reports 19 outliers and 19 non-outliers for
# a 19-row frame — the two counts are inconsistent; verify the library result.
df.outliers.modified_z_score("price",threshold =1).info()

Out[11]:

{'count_outliers': 19, 'count_non_outliers': 19, 'max_m_z_score': 2.36075}

In [47]:

%%time
# Time the Levenshtein clustering of `product` and print its JSON result
# (recorded wall time: 9.6 s).
# `dc` is also imported in an earlier cell of this notebook, so the import
# below is a repeat when cells run top to bottom.
from optimus.ml import distancecluster as dc
print(dc.levenshtein_cluster(df,'product',output="json"))

{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}
Wall time: 9.6 s

In [51]:

# Import the clustering modules and store the fingerprint clusters of `product`
# in `result`.
# `kc` is a second alias for `optimus.ml.keycollision`, which an earlier cell
# imports as `keyCol`.
from optimus.ml import distancecluster as dc
from optimus.ml import keycollision as kc

# Disabled alternative that used the Levenshtein-based helper instead.
# result = dc.levenshtein_json(df,'product')
# NOTE(review): the meaning of the positional third argument (3) is not visible
# in this notebook — confirm against the `fingerprint_cluster` signature.
result = kc.fingerprint_cluster(df, "product",3)

In [62]:

# Store the n-gram fingerprint clusters of `product` in `result`.
# The positional 3 is presumably `n_size` (other cells pass it by keyword) — confirm.
# The recorded run also prints an intermediate table with `product***NGRAM` and
# `product***NGRAM_FINGERPRINT` columns.
result = kc.n_gram_fingerprint_cluster(df, "product",3)

Viewing 10 of 13 rows / 4 columns

1 partition(s)

count 1 (string) not nullable	product 2 (string) nullable	product***NGRAM 3 (array<string>) not nullable	product***NGRAM_FINGERPRINT 4 (string) nullable
1	taaaccoo	['taaaccoo']	taaaccoo
1	piza	['piza']	piza
1	hamburguer	['hamburguer']	hamburguer
3	taco	['taco']	taco
1	BEER	['beer']	beer
1	pizzza	['pizzza']	pizzza
1	arepa	['arepa']	arepa
4	pizza	['pizza']	pizza
1	Rice	['rice']	rice
1	110790	['110790']	110790

Viewing 10 of 13 rows / 4 columns

1 partition(s)

In [63]:

# Print `result`; in the recorded run it is a dict keyed by cluster name, each
# entry holding a `similar` list, a `count` and a `sum`.
print(result)

{'taaaccoo': {'similar': ['taaaccoo'], 'count': 1, 'sum': 1.0}, 'piza': {'similar': ['piza'], 'count': 1, 'sum': 1.0}, 'hamburguer': {'similar': ['hamburguer'], 'count': 1, 'sum': 1.0}, 'taco': {'similar': ['taco'], 'count': 1, 'sum': 3.0}, 'pizzza': {'similar': ['pizzza'], 'count': 1, 'sum': 1.0}, 'arepa': {'similar': ['arepa'], 'count': 1, 'sum': 1.0}, 'pizza': {'similar': ['pizza'], 'count': 1, 'sum': 4.0}, 'Rice': {'similar': ['Rice'], 'count': 1, 'sum': 1.0}, '110790': {'similar': ['110790'], 'count': 1, 'sum': 1.0}, 'BEER': {'similar': ['BEER'], 'count': 1, 'sum': 1.0}, 'Cake': {'similar': ['Cake'], 'count': 1, 'sum': 1.0}, 'null': {'similar': ['null'], 'count': 1, 'sum': 1.0}, 'pasta': {'similar': ['pasta'], 'count': 1, 'sum': 2.0}}

In [159]:

# Inspect the type of `result`; the recorded output is `str`.
# NOTE(review): other cells treat `result` as a dict or call `.collect()` on it,
# so its type depends on which cell ran last — the execution counts are out of order.
type(result)

Out[159]:

str

In [68]:

# Collect every row of `result`, print its values, and map the first value of
# each row to the second one.
kv_dict = dict()
for row in result.collect():
    # Positional list of the row's values, in the order asDict() yields them.
    _row = [*row.asDict().values()]
    print(_row)
    kv_dict.update({_row[0]: _row[1]})

['taaaccoo', 1]
['piza', 1]
['hamburguer', 1]
['taco', 3]
['BEER', 1]
['pizzza', 1]
['arepa', 1]
['pizza', 4]
['Rice', 1]
['110790', 1]
['Cake', 1]
['null', 1]
['pasta', 2]

In [69]:

# Print the value -> count mapping built from the collected rows.
print(kv_dict)

{'taaaccoo': 1, 'piza': 1, 'hamburguer': 1, 'taco': 3, 'BEER': 1, 'pizzza': 1, 'arepa': 1, 'pizza': 4, 'Rice': 1, '110790': 1, 'Cake': 1, 'null': 1, 'pasta': 2}

In [46]:

# Replace 0 with None in `product***LEVENSHTEIN_DISTANCE` and display the result.
# NOTE(review): the recorded run fails with AttributeError because `a` was a str,
# not a DataFrame. The cell that should bind `a` to the frame carrying this
# column is not visible here — restore it before relying on this cell.
a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-46-d4821b29c8c9> in <module>
----> 1 a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()

AttributeError: 'str' object has no attribute 'cols'

In [47]:

# Drop the rows where the two compared values differ but the recorded distance
# is 0, then display the result.
# NOTE(review): the recorded run fails with AttributeError because `a` was a str,
# not a DataFrame; `a` must be bound to the frame holding the
# `product_LEVENSHTEIN_*` columns before this cell can run.
a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-47-fb2466275319> in <module>
----> 1 a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()

AttributeError: 'str' object has no attribute 'rows'

In [12]:

In [ ]: