%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
C:\Users\argenisleon\Anaconda3\lib\site-packages\dask\config.py:161: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details. data = yaml.load(f.read()) or {} C:\Users\argenisleon\Anaconda3\lib\site-packages\statsmodels\compat\pandas.py:49: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)
# Create the Optimus entry point used by the cells below.
# comm=True presumably enables the channel used later by send() — TODO confirm.
op= Optimus(comm=True)
You are using PySparkling of version 2.4.10, but your PySpark is of version 2.3.1. Please make sure Spark and PySparkling versions are compatible.
from pyspark.sql.types import *
from datetime import date, datetime
# Column specification for the sample DataFrame.
# Each entry is either a (name, dtype) pair — dtype given as a short string
# or a pyspark.sql.types instance — or a bare column name with no dtype.
cols = [
    ("names", "str"),
    ("height(ft)", ShortType()),
    ("function", "str"),
    ("rank", ByteType()),
    ("age", "int"),
    ("weight(t)", "float"),
    # Bare names: no dtype is declared for these four columns.
    "japanese name",
    "last position seen",
    "date arrival",
    "last date seen",
    ("attributes", ArrayType(FloatType())),
    ("DateType", DateType()),
    ("timestamp", TimestampType()),
    ("Cybertronian", BooleanType()),
    ("function(binary)", BinaryType()),
    ("NullType", NullType()),
]
# Sample rows, one 16-field tuple per record, in the same order as `cols`:
# names, height(ft), function, rank, age, weight(t), japanese name,
# last position seen, date arrival, last date seen, attributes, DateType,
# timestamp, Cybertronian, function(binary), NullType.
# Several fields are deliberately None to exercise null handling.
rows = [
    (
        "argenisleon@gmail.com", 28, "Leader", 10, 5000000, 4.30,
        ["Inochi", "Convoy"], "19.442735,-99.201111",
        "1980/04/10", "2016/09/10",
        [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24),
        True, bytearray("Leader", "utf-8"), None,
    ),
    (
        "bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0,
        ["Bumble", "Goldback"], "10.642707,-71.612534",
        "1980/04/10", "2015/08/10",
        [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24),
        True, bytearray("Espionage", "utf-8"), None,
    ),
    (
        "ironhide&", 26, "Security", 7, 5000000, 4.0,
        ["Roadbuster"], "37.789563,-122.400356",
        "1980/04/10", "2014/07/10",
        [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24),
        True, bytearray("Security", "utf-8"), None,
    ),
    (
        "1 Megatron", 13, "First Lieutenant", 8, 5000000, 1.80,
        ["Meister"], "33.670666,-117.841553",
        "1980/04/10", "2013/06/10",
        [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24),
        True, bytearray("First Lieutenant", "utf-8"), None,
    ),
    (
        "1 Megatron", None, "None", 10, 5000000, 5.70,
        ["Megatron"], None,
        "1980/04/10", "2012/05/10",
        [None, 5700.0], date(2012, 5, 10), datetime(2014, 6, 24),
        True, bytearray("None", "utf-8"), None,
    ),
    (
        None, 300, "Battle Station", 8, 5000000, None,
        ["Metroflex"], None,
        "1980/04/10", "2011/04/10",
        [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24),
        True, bytearray("Battle Station", "utf-8"), None,
    ),
]
# Build the sample DataFrame from the column spec and the rows.
# The meaning of the third positional argument (False) is not visible
# here — TODO confirm against op.create.df.
df = op.create.df(cols, rows, False)
# Cache it and collapse it to a single partition.
df = df.cache().repartition(1)
# Render the DataFrame; 20 is presumably the row limit — TODO confirm.
df.ext.display(20)
names
1 (string)
not nullable
|
height(ft)
2 (smallint)
not nullable
|
function
3 (string)
not nullable
|
rank
4 (tinyint)
not nullable
|
age
5 (int)
not nullable
|
weight(t)
6 (float)
not nullable
|
japanese name
7 (string)
not nullable
|
last position seen
8 (string)
not nullable
|
date arrival
9 (string)
not nullable
|
last date seen
10 (string)
not nullable
|
attributes
11 (array<float>)
not nullable
|
DateType
12 (date)
not nullable
|
timestamp
13 (timestamp)
not nullable
|
Cybertronian
14 (boolean)
not nullable
|
function(binary)
15 (binary)
not nullable
|
NullType
16 (null)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
argenisleon@gmail.com
|
28.0
|
Leader
|
10
|
5000000
|
4.300000190734863
|
[Inochi,⋅Convoy]
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17.0
|
Espionage
|
7
|
5000000
|
2.0
|
[Bumble,⋅Goldback]
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26.0
|
Security
|
7
|
5000000
|
4.0
|
[Roadbuster]
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
1⋅Megatron
|
13.0
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
[Meister]
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
1⋅Megatron
|
nan
|
None
|
10
|
5000000
|
5.699999809265137
|
[Megatron]
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
None
|
300.0
|
Battle⋅Station
|
8
|
5000000
|
nan
|
[Metroflex]
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
# Load the crime.csv example dataset from the Optimus GitHub repository,
# rebinding `df`, then render it with the default display settings.
df = op.load.csv(
    "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/crime.csv",
    sep=",",
    header='true',
    infer_schema='true',
    charset="UTF-8",
    null_value="None",
)
df.ext.display()
INCIDENT_NUMBER
1 (string)
not nullable
|
OFFENSE_CODE
2 (int)
not nullable
|
OFFENSE_CODE_GROUP
3 (string)
not nullable
|
OFFENSE_DESCRIPTION
4 (string)
not nullable
|
DISTRICT
5 (string)
not nullable
|
REPORTING_AREA
6 (string)
not nullable
|
SHOOTING
7 (string)
not nullable
|
OCCURRED_ON_DATE
8 (timestamp)
not nullable
|
YEAR
9 (int)
not nullable
|
MONTH
10 (int)
not nullable
|
DAY_OF_WEEK
11 (string)
not nullable
|
HOUR
12 (int)
not nullable
|
UCR_PART
13 (string)
not nullable
|
STREET
14 (string)
not nullable
|
Lat
15 (double)
not nullable
|
Long
16 (double)
not nullable
|
Location
17 (string)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
None
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
None
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
None
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
None
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
None
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
None
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
None
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
None
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
None
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
None
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
# Per-column counts of null, missing and typed values for every column ("*"),
# with inference switched off (infer=False).
df.cols.count_by_dtypes("*", infer=False)
{'SHOOTING': {'null': 318054, 'missing': 0, 'string': 1019}, 'MONTH': {'null': 0, 'missing': 0, 'int': 319073}, 'HOUR': {'null': 0, 'missing': 0, 'int': 319073}, 'Lat': {'null': 19999, 'missing': 0, 'decimal': 299074}, 'STREET': {'null': 10871, 'missing': 0, 'string': 308202}, 'DISTRICT': {'null': 1765, 'missing': 0, 'string': 317308}, 'OFFENSE_CODE_GROUP': {'null': 0, 'missing': 0, 'string': 319073}, 'REPORTING_AREA': {'null': 0, 'missing': 0, 'string': 319073}, 'OCCURRED_ON_DATE': {'null': 0, 'missing': 0, 'date': 319073}, 'UCR_PART': {'null': 90, 'missing': 0, 'string': 318983}, 'INCIDENT_NUMBER': {'null': 0, 'missing': 0, 'string': 319073}, 'DAY_OF_WEEK': {'null': 0, 'missing': 0, 'string': 319073}, 'OFFENSE_DESCRIPTION': {'null': 0, 'missing': 0, 'string': 319073}, 'YEAR': {'null': 0, 'missing': 0, 'int': 319073}, 'Long': {'null': 19999, 'missing': 0, 'decimal': 299074}, 'OFFENSE_CODE': {'null': 0, 'missing': 0, 'int': 319073}, 'Location': {'null': 0, 'missing': 0, 'string': 319073}}
from optimus.helpers.check import is_column_a
# Check whether OCCURRED_ON_DATE is a timestamp column.
# NOTE(review): the return value is neither stored nor printed, so this line
# shows nothing unless it is the last expression of its cell.
is_column_a(df,"OCCURRED_ON_DATE","timestamp")
# Print the schema dtype of the same column (recorded output: TimestampType).
print(df.cols.schema_dtype("OCCURRED_ON_DATE"))
TimestampType
# List the (column name, dtype string) pairs of the current DataFrame.
df.dtypes
[('INCIDENT_NUMBER', 'string'), ('OFFENSE_CODE', 'int'), ('OFFENSE_CODE_GROUP', 'string'), ('OFFENSE_DESCRIPTION', 'string'), ('DISTRICT', 'string'), ('REPORTING_AREA', 'string'), ('SHOOTING', 'string'), ('OCCURRED_ON_DATE', 'timestamp'), ('YEAR', 'int'), ('MONTH', 'int'), ('DAY_OF_WEEK', 'string'), ('HOUR', 'int'), ('UCR_PART', 'string'), ('STREET', 'string'), ('Lat', 'double'), ('Long', 'double'), ('Location', 'string')]
# NOTE(review): this call raises ValueError in the recorded run. Per the
# traceback, std() keeps only numeric columns, and OCCURRED_ON_DATE is a
# timestamp, so no column is left after filtering.
df.cols.std("OCCURRED_ON_DATE")
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-36-c1c8bd292777> in <module> ----> 1 df.cols.std("OCCURRED_ON_DATE") ~\Documents\Optimus\optimus\engines\spark\columns.py in std(columns) 745 """ 746 columns = parse_columns(self, columns, filter_by_column_dtypes=self.constants.NUMERIC_TYPES) --> 747 check_column_numbers(columns, "*") 748 749 return format_dict(Cols.agg_exprs(columns, F.stddev)) ~\Documents\Optimus\optimus\helpers\columns.py in check_column_numbers(columns, number) 216 if columns is None: 217 RaiseIt.value_error(columns, ["str", "list"], --> 218 extra_text="Maybe the columns selected do not match a specified datatype filter.") 219 220 count = len(columns) ~\Documents\Optimus\optimus\helpers\raiseit.py in value_error(var, data_values, extra_text) 74 type=divisor.join(map( 75 lambda x: "'" + x + "'", ---> 76 data_values)), var_type=one_list_to_val(var), extra_text=extra_text)) 77 78 @staticmethod ValueError: 'columns' must be 'str' or 'list', received 'None'. Maybe the columns selected do not match a specified datatype filter.
# Send the OCCURRED_ON_DATE column through the Optimus comm channel
# (destination not visible here — TODO confirm).
df.ext.send("OCCURRED_ON_DATE")
# Histogram of every column; no output was recorded for this call.
df.cols.hist("*")
# Histogram of a string column. The buckets look like string lengths
# (almost all counts fall in the 10-12.5 bin) — TODO confirm.
df.cols.hist("INCIDENT_NUMBER")
VVV StringType DATA (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>) VVV StringType DATA (<class 'pyspark.sql.types.StringType'>,) EXEC AGG 1
{'INCIDENT_NUMBER': {'hist': [{'count': 0.0, 'lower': 0.0, 'upper': 2.5}, {'count': 0.0, 'lower': 2.5, 'upper': 5.0}, {'count': 0.0, 'lower': 5.0, 'upper': 7.5}, {'count': 1.0, 'lower': 7.5, 'upper': 10.0}, {'count': 318719.0, 'lower': 10.0, 'upper': 12.5}, {'count': 353.0, 'lower': 12.5, 'upper': 15.0}, {'count': 0.0, 'lower': 15.0, 'upper': 17.5}, {'count': 0.0, 'lower': 17.5, 'upper': 20.0}, {'count': 0.0, 'lower': 20.0, 'upper': 22.5}, {'count': 0.0, 'lower': 22.5, 'upper': 25.0}, {'count': 0.0, 'lower': 25.0, 'upper': 27.5}, {'count': 0.0, 'lower': 27.5, 'upper': 30.0}, {'count': 0.0, 'lower': 30.0, 'upper': 32.5}, {'count': 0.0, 'lower': 32.5, 'upper': 35.0}, {'count': 0.0, 'lower': 35.0, 'upper': 37.5}, {'count': 0.0, 'lower': 37.5, 'upper': 40.0}, {'count': 0.0, 'lower': 40.0, 'upper': 42.5}, {'count': 0.0, 'lower': 42.5, 'upper': 45.0}, {'count': 0.0, 'lower': 45.0, 'upper': 47.5}, {'count': 0.0, 'lower': 47.5, 'upper': 50.0}]}}
# Tukey outlier analysis on height(ft); select() presumably keeps only the
# rows flagged as outliers (the recorded output is the single row with
# height 300). This needs the sample DataFrame with a height(ft) column,
# not the crime CSV.
df.outliers.tukey("height(ft)").select().ext.display()
ShortType (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>) 0.5 44.5 ShortType (<class 'pyspark.sql.types.ByteType'>, <class 'pyspark.sql.types.ShortType'>, <class 'pyspark.sql.types.LongType'>, <class 'pyspark.sql.types.IntegerType'>, <class 'pyspark.sql.types.DoubleType'>, <class 'pyspark.sql.types.FloatType'>)
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (string)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
None
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
[Metroflex]
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
# Histogram of the price column from the outlier object; the recorded result
# is a JSON string. `outlier` must already exist from a previously run cell.
outlier.hist("price")
'{"price": {"hist": [{"count": 6.0, "lower": 8.0, "upper": 8.1}, {"count": 0.0, "lower": 8.1, "upper": 8.2}, {"count": 0.0, "lower": 8.2, "upper": 8.3}, {"count": 0.0, "lower": 8.3, "upper": 8.4}, {"count": 0.0, "lower": 8.4, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 8.6}, {"count": 0.0, "lower": 8.6, "upper": 8.7}, {"count": 0.0, "lower": 8.7, "upper": 8.8}, {"count": 0.0, "lower": 8.8, "upper": 8.9}, {"count": 0.0, "lower": 8.9, "upper": 9.0}, {"count": 2.0, "lower": 9.0, "upper": 9.1}, {"count": 0.0, "lower": 9.1, "upper": 9.2}, {"count": 0.0, "lower": 9.2, "upper": 9.3}, {"count": 0.0, "lower": 9.3, "upper": 9.4}, {"count": 0.0, "lower": 9.4, "upper": 9.5}, {"count": 0.0, "lower": 9.5, "upper": 9.6}, {"count": 0.0, "lower": 9.6, "upper": 9.7}, {"count": 0.0, "lower": 9.7, "upper": 9.8}, {"count": 0.0, "lower": 9.8, "upper": 9.9}, {"count": 0.0, "lower": 9.9, "upper": 10.0}]}}'
# Count null, missing and typed values in the id column.
df.cols.count_by_dtypes("id")
{'id': {'null': 0, 'missing': 0, 'int': 19}}
# Total number of rows in the DataFrame (19 in the recorded run).
df.count()
19
# Summary of the outlier analysis: outlier and non-outlier counts plus the
# lower and upper bounds with their counts.
outlier.info()
6
{'count_outliers': 9, 'count_non_outliers': 10, 'lower_bound': 6, 'lower_bound_count': 9, 'upper_bound': 10, 'upper_bound_count': 0}
# df.table()
# Count how many values in "names" match, or fail to match, the "email" type.
# The original literal listed the "names" key twice; Python keeps only the
# last value for a repeated key, so the first entry was silently discarded.
# Only the effective mapping is kept, which leaves the result unchanged.
df.cols.count_mismatch({"names": "email"})
{'names': {'email': 1, 'mismatch': 4, 'null': 1, 'missing': 0}}
# Scratch copy of a count_mismatch-style result for the "names" column.
a = {
    'names': {
        'email': 1,
        'mismatch': 4,
        'null': 1,
    },
}
# The declared type names of a column spec, as a tuple.
tuple(
    {"firstName":"string","lastName":"array"}.values()
)
('string', 'array')
from infer import Infer
from infer import Infer
# Classify the ("names", None) pair against the expected type map; the
# recorded result is (('names', 'null'), 1).
Infer.mismatch(("names",None),{"names":"email"})
(('names', 'null'), 1)
# NOTE(review): no output is recorded for this call; if it shares a cell with
# the next line its result is simply discarded.
Infer.value(12, "string")
# Column names of a spec dict, as a list.
list({"firstName":"string","lastName":"string"}.keys())
['firstName', 'lastName']
# NOTE(review): the recorded run failed with NameError because `df` was not
# defined in that kernel session; create or load `df` before this cell.
df.rows.select_by_dtypes("names","str")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-8-5a6988a57346> in <module> ----> 1 df.rows.select_by_dtypes("names","str") NameError: name 'df' is not defined
# Histogram (original heading; the call below is a row filter, not a histogram).
# Requests the rows whose height(ft) lies between 17 and 26 with equal=True
# and invert=False, then shows them.
# NOTE(review): the recorded output also contains a row with height 13 —
# confirm the semantics of `between`.
df.rows.between("height(ft)",17,26, invert = False , equal =True, ).table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
1⋅Megatron
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
# Reverse the characters of every value in the function column
# (e.g. "Leader" becomes "redaeL") and show the result.
df.cols.reverse("function").table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
redaeL
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
eganoipsE
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
ytiruceS
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
1⋅Megatron
|
13
|
tnanetueiL⋅tsriF
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
1⋅Megatron
|
None
|
enoN
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
megatron⋅1
|
300
|
noitatS⋅elttaB
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
# Tukey outlier analysis on the "mass (g)" column.
outlier = df.outliers.tukey("mass (g)")
# print(outlier.info())
# Select the lower-bound values; the recorded result is a JSON string with
# "columns" and "value" keys.
outlier.select_lower_bound()
'{"columns": [{"title": "mass (g)"}], "value": [[21.0], [160.0], [252.0], [256.8], [320.0], [41.0], [94.2], [265.0], [146.0], [134.0], [345.0], [14.0], [23.2], [17.0], [375.0], [270.0], [13.9], [18.0], [100.0], [488.1], [470.0], [67.8], [56.0], [190.0], [219.0], [324.0], [357.0], [212.0], [478.0], [342.0], [8.0], [94.0], [45.6], [0.5], [72.0], [367.0], [303.0], [48.6], [469.0], [78.4], [167.0], [100.0], [340.0], [28.0], [0.8], [230.0], [400.0], [438.0], [230.0], [30.0], [300.0], [188.0], [127.0], [277.0], [113.0], [107.2], [380.0], [82.0], [220.0], [240.0], [132.7], [36.1], [28.0], [380.0], [102.0], [480.0], [45.5], [215.0], [288.0], [28.0], [0.2], [315.0], [414.0], [167.7], [305.5], [180.0], [266.1], [112.0], [22.0], [450.0], [222.0], [100.0], [30.0], [483.0], [89.0], [230.0], [350.0], [448.0], [299.0], [400.0], [180.0], [450.0], [100.0], [331.0], [195.0], [140.0], [67.4], [97.7], [202.6], [136.0]]}'
# Add a key-collision fingerprint of product as a new column named
# product***FINGERPRINT and show the table.
# `keyCol` is the optimus.ml.keycollision alias, imported in a later cell of
# this export, so that import must run first.
keyCol.fingerprint(df,"product").table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
product***FINGERPRINT
9 (string)
nullable
|
---|---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
cake
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
piza
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
pizza
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
pizza
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
pizza
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
arepa
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
taco
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
taaaccoo
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
taco
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
taco
|
# Add a names***FINGERPRINT column and show the table. In the recorded output
# the fingerprints are lower-cased with punctuation and spaces removed
# (e.g. "ironhide&" becomes "ironhide").
keyCol.fingerprint(df,"names").table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
names***FINGERPRINT
17 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
optimusoptimusprime
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
bumblebee
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
ironhide
|
1⋅Megatron
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
1megatron
|
1⋅Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
1megatron
|
megatron⋅1
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
1megatron
|
# Cluster product values by fingerprint key. The JSON result maps each cluster
# to its "similar" values with frequencies, plus a "count" and a "sum".
keyCol.fingerprint_cluster(df,"product", output="json")
'{"taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "pizzza": {"similar": {"pizzza": 1}, "count": 1, "sum": 1}, "arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "pizza": {"similar": {"pizza": 4}, "count": 1, "sum": 4}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}}'
# Same clustering using n-gram fingerprints with n_size=2; in the recorded
# result "pizzza" and "pizza" fall into one cluster.
keyCol.n_gram_fingerprint_cluster(df,"product", output="json",n_size=2)
'{"arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}, "pizza": {"similar": {"pizzza": 1, "pizza": 4}, "count": 2, "sum": 5}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}}'
from optimus.ml import keycollision as keyCol
from optimus.ml import distancecluster as dc
# Cluster product values by Levenshtein distance and return the clusters as
# a JSON string.
dc.levenshtein_cluster(df,"product", output="json")
'{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}'
# n-gram fingerprint clustering of names with n_size=1. The recorded run
# prints an intermediate table and then returns the clusters as JSON.
keyCol.n_gram_fingerprint_cluster(df,"names", n_size=1,output="json")
count
1 (string)
not nullable
|
names
2 (string)
nullable
|
names***NGRAM
3 (array<string>)
not nullable
|
names***NGRAM_FINGERPRINT
4 (string)
nullable
|
---|---|---|---|
1
|
bumbl#ebéé⋅⋅
|
['bumblebee']
|
bumblebee
|
1
|
ironhide&
|
['ironhide']
|
ironhide
|
1
|
Megatron2
|
['megatron2']
|
megatron2
|
1
|
Optimus⋅OptimusPrime
|
['optimusoptimusprime']
|
optimusoptimusprime
|
1
|
Megatron1
|
['megatron1']
|
megatron1
|
1
|
Megatron
|
['megatron']
|
megatron
|
'{"ironhide&": {"similar": {"ironhide&": 1}, "count": 1, "sum": 1.0}, "Megatron1": {"similar": {"Megatron1": 1}, "count": 1, "sum": 1.0}, "Optimus OptimusPrime": {"similar": {"Optimus OptimusPrime": 1}, "count": 1, "sum": 1.0}, "Megatron": {"similar": {"Megatron": 1}, "count": 1, "sum": 1.0}, "bumbl#eb\\u00e9\\u00e9 ": {"similar": {"bumbl#eb\\u00e9\\u00e9 ": 1}, "count": 1, "sum": 1.0}, "Megatron2": {"similar": {"Megatron2": 1}, "count": 1, "sum": 1.0}}'
# Show the current DataFrame as a table.
df.table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
Megatron1
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
megatron
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
# df = op.load.csv("data/foo.csv", sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")
# Show the current DataFrame; the CSV reload above is disabled.
df.table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
JaJa⋅JaJaJ
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
Metroplex_)^$
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
# Replace the words "JaJa" or "bbb" in names with "aaa" (search_by="words").
# In the recorded output "JaJa JaJaJ" becomes "aaa JaJaJ", so the longer
# token "JaJaJ" is left untouched.
df.cols.replace("names",["JaJa","bbb"],"aaa",search_by="words").table()
names
1 (string)
nullable
|
height(ft)
2 (smallint)
nullable
|
function
3 (string)
nullable
|
rank
4 (tinyint)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
timestamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optimus⋅OptimusPrime
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Leader')
|
None
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Espionage')
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Security')
|
None
|
aaa⋅JaJaJ
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'First⋅Lieutenant')
|
None
|
Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'None')
|
None
|
Metroplex_)^$
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
bytearray(b'Battle⋅Station')
|
None
|
# Send the DataFrame through the Optimus comm channel (recorded output: "Send!").
df.send()
Send!
# Show the DataFrame; 20 is presumably the row limit (19 rows were rendered).
df.table(20)
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
11
|
Isaac
|
Newton
|
992
|
pasta
|
9
|
1999/02/15
|
never⋅
|
12
|
Emmy%%
|
Nöether$
|
234
|
pasta
|
9
|
1993/12/08
|
gonna
|
13
|
Max!!!
|
Planck!!!
|
111
|
hamburguer
|
4
|
1994/01/04
|
run⋅
|
14
|
Fred
|
Hoy&&&le
|
553
|
pizzza
|
8
|
1997/06/27
|
around
|
15
|
(((⋅⋅⋅Heinrich⋅)))))
|
Hertz
|
116
|
pizza
|
8
|
1956/11/30
|
and
|
16
|
William
|
Gilbert###
|
886
|
BEER
|
2
|
1958/03/26
|
desert
|
17
|
Marie
|
CURIE
|
912
|
Rice
|
1
|
2000/03/22
|
you
|
18
|
Arthur
|
COM%%%pton
|
812
|
110790
|
5
|
1899/01/01
|
#
|
19
|
JAMES
|
Chadwick
|
467
|
null
|
10
|
1921/05/03
|
#
|
# Z-score outlier summary for price with threshold 1.
df.outliers.z_score("price",threshold =1).info()
{'count_outliers': 8, 'count_non_outliers': 11, 'max_z_score': 1.7111}
# Tukey outlier summary for price; the recorded run found no outliers.
df.outliers.tukey("price").info()
{'count_outliers': 0, 'count_non_outliers': 19, 'lower_bound': -4.5, 'lower_bound_count': 0, 'upper_bound': 15.5, 'upper_bound_count': 0, 'iqr1': 3, 'iqr3': 8}
# MAD outlier summary for price with threshold 1.
# NOTE(review): the recorded result reports 9 outliers and 19 non-outliers,
# yet df.count() elsewhere in this export shows 19 rows — the non-outlier
# count looks wrong; verify the library.
df.outliers.mad("price", threshold =1).info()
{'count_outliers': 9, 'count_non_outliers': 19, 'lower_bound': 6, 'lower_bound_count': 9, 'upper_bound': 10, 'upper_bound_count': 0}
# Modified z-score outlier summary for price with threshold 1.
# NOTE(review): the recorded result reports 19 outliers and 19 non-outliers
# at the same time; verify how the library counts them.
df.outliers.modified_z_score("price",threshold =1).info()
{'count_outliers': 19, 'count_non_outliers': 19, 'max_m_z_score': 2.36075}
%%time
# Print the JSON clustering of the `product` column produced by levenshtein_cluster.
# The %%time cell magic must stay on the first line of the cell; it reports the wall time.
from optimus.ml import distancecluster as dc
print(dc.levenshtein_cluster(df,'product',output="json"))
{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}} Wall time: 9.6 s
from optimus.ml import distancecluster as dc
from optimus.ml import keycollision as kc
# Earlier experiment, left disabled:
# result = dc.levenshtein_json(df,'product')
# NOTE(review): `result` is assigned twice in a row, so only the n-gram fingerprint
# clustering survives. The first call is kept because it may render output of its own.
# The meaning of the third positional argument (3) is not visible here - TODO confirm.
result = kc.fingerprint_cluster(df, "product",3)
result = kc.n_gram_fingerprint_cluster(df, "product",3)
count
1 (string)
not nullable
|
product
2 (string)
nullable
|
product***NGRAM
3 (array<string>)
not nullable
|
product***NGRAM_FINGERPRINT
4 (string)
nullable
|
---|---|---|---|
1
|
taaaccoo
|
['taaaccoo']
|
taaaccoo
|
1
|
piza
|
['piza']
|
piza
|
1
|
hamburguer
|
['hamburguer']
|
hamburguer
|
3
|
taco
|
['taco']
|
taco
|
1
|
BEER
|
['beer']
|
beer
|
1
|
pizzza
|
['pizzza']
|
pizzza
|
1
|
arepa
|
['arepa']
|
arepa
|
4
|
pizza
|
['pizza']
|
pizza
|
1
|
Rice
|
['rice']
|
rice
|
1
|
110790
|
['110790']
|
110790
|
# Show the clustering result held in `result`.
print(result)
{'taaaccoo': {'similar': ['taaaccoo'], 'count': 1, 'sum': 1.0}, 'piza': {'similar': ['piza'], 'count': 1, 'sum': 1.0}, 'hamburguer': {'similar': ['hamburguer'], 'count': 1, 'sum': 1.0}, 'taco': {'similar': ['taco'], 'count': 1, 'sum': 3.0}, 'pizzza': {'similar': ['pizzza'], 'count': 1, 'sum': 1.0}, 'arepa': {'similar': ['arepa'], 'count': 1, 'sum': 1.0}, 'pizza': {'similar': ['pizza'], 'count': 1, 'sum': 4.0}, 'Rice': {'similar': ['Rice'], 'count': 1, 'sum': 1.0}, '110790': {'similar': ['110790'], 'count': 1, 'sum': 1.0}, 'BEER': {'similar': ['BEER'], 'count': 1, 'sum': 1.0}, 'Cake': {'similar': ['Cake'], 'count': 1, 'sum': 1.0}, 'null': {'similar': ['null'], 'count': 1, 'sum': 1.0}, 'pasta': {'similar': ['pasta'], 'count': 1, 'sum': 2.0}}
# Check what kind of object the clustering call returned (the recorded run shows `str`).
type(result)
str
# Map the first column of every collected row to its second column.
# NOTE(review): this needs `result` to expose `.collect()` (a Spark DataFrame);
# rebind it first if it currently holds the string form of the clustering.
kv_dict = {}
for row in result.collect():
    # A Row is a tuple in schema order, so it can be listed directly
    # instead of relying on the ordering of `asDict().values()`.
    _row = list(row)
    print(_row)
    kv_dict[_row[0]] = _row[1]
['taaaccoo', 1] ['piza', 1] ['hamburguer', 1] ['taco', 3] ['BEER', 1] ['pizzza', 1] ['arepa', 1] ['pizza', 4] ['Rice', 1] ['110790', 1] ['Cake', 1] ['null', 1] ['pasta', 2]
# Show the lookup built from the collected rows.
print(kv_dict)
{'taaaccoo': 1, 'piza': 1, 'hamburguer': 1, 'taco': 3, 'BEER': 1, 'pizzza': 1, 'arepa': 1, 'pizza': 4, 'Rice': 1, '110790': 1, 'Cake': 1, 'null': 1, 'pasta': 2}
# NOTE(review): the recorded run raises AttributeError because `a` is a str here;
# `a` must be rebound to the DataFrame carrying the Levenshtein columns before this cell runs.
# Presumably swaps 0 for None in the distance column - confirm against the Optimus API.
a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-46-d4821b29c8c9> in <module> ----> 1 a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table() AttributeError: 'str' object has no attribute 'cols'
# NOTE(review): the recorded run raises AttributeError because `a` is a str here;
# `a` must be rebound to the DataFrame carrying the Levenshtein columns before this cell runs.
# The filter selects rows whose two product values differ while the distance equals 0.
a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-47-fb2466275319> in <module> ----> 1 a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table() AttributeError: 'str' object has no attribute 'rows'