#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

import sys
sys.path.append("..")

# In[3]:

from optimus import Optimus

# In[4]:

op = Optimus(comm=True)

# In[6]:

from pyspark.sql.types import *
from datetime import date, datetime

cols = [
    ("names", "str"),
    ("height(ft)", ShortType()),
    ("function", "str"),
    ("rank", ByteType()),
    ("age", "int"),
    ("weight(t)", "float"),
    "japanese name",
    "last position seen",
    "date arrival",
    "last date seen",
    ("attributes", ArrayType(FloatType())),
    ("DateType", DateType()),
    ("timestamp", TimestampType()),
    ("Cybertronian", BooleanType()),
    ("function(binary)", BinaryType()),
    ("NullType", NullType())
]

rows = [
    ("argenisleon@gmail.com", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10", "2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"), None),
    ("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10", "2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"), None),
    ("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10", "2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"), None),
    ("1 Megatron", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10", "2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True, bytearray("First Lieutenant", "utf-8"), None),
    ("1 Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0], date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
    (None, 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10", [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
]

df = op.create.df(cols, rows, False).cache().repartition(1)

# In[7]:

df.ext.display(20)

# In[8]:

df = op.load.csv("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/crime.csv",
                 sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")

# In[9]:

df.ext.display()

# In[10]:

df.cols.count_by_dtypes("*", infer=False)

# In[11]:

from optimus.helpers.check import is_column_a

is_column_a(df, "OCCURRED_ON_DATE", "timestamp")
print(df.cols.schema_dtype("OCCURRED_ON_DATE"))

# In[12]:

df.dtypes

# In[36]:

df.cols.std("OCCURRED_ON_DATE")

# In[ ]:

df.ext.send("OCCURRED_ON_DATE")

# In[ ]:

df.cols.hist("*")

# In[45]:

df.cols.hist("INCIDENT_NUMBER")

# In[25]:

# NOTE: "height(ft)" belongs to the Transformers dataframe built in In[6], not the crime data now in df.
df.outliers.tukey("height(ft)").select().ext.display()

# In[8]:

# NOTE: "outlier" is only defined in a later cell, and "price" is not a column of the crime data.
outlier.hist("price")

# In[12]:

df.cols.count_by_dtypes("id")

# In[22]:

df.count()

# In[24]:

# NOTE: requires the "outlier" object defined in a later cell.
outlier.info()

# In[11]:

# df.table()

# In[12]:

# The original dict literal repeated the "names" key; Python keeps only the last value, {"names": "email"}.
df.cols.count_mismatch({"names": "email"})

# In[14]:

a = {'names': {'email': 1, 'mismatch': 4, 'null': 1}}

# In[15]:

tuple({"firstName": "string", "lastName": "array"}.values())

# In[16]:

from infer import Infer

# In[17]:

from infer import Infer

Infer.mismatch(("names", None), {"names": "email"})

# In[20]:

Infer.value(12, "string")

# In[36]:

list({"firstName": "string", "lastName": "string"}.keys())

# In[8]:

df.rows.select_by_dtypes("names", "str")

# In[117]:

# Keep rows whose height(ft) is between 17 and 26 (inclusive).
df.rows.between("height(ft)", 17, 26, invert=False, equal=True).table()
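# In[ ]:

# The Optimus outlier helpers used in this notebook (df.outliers.tukey, .z_score, .mad,
# .modified_z_score) wrap standard statistics. As a rough, hedged sketch of what Tukey's
# fences mean, the plain PySpark snippet below computes Q1/Q3 with approxQuantile and
# flags values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The function name "tukey_bounds" is
# illustrative only and is not part of the Optimus API.

from pyspark.sql import functions as F

def tukey_bounds(sdf, col, k=1.5, rel_error=0.01):
    """Return (lower, upper) Tukey fences for a numeric Spark column."""
    q1, q3 = sdf.approxQuantile(col, [0.25, 0.75], rel_error)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

# Example usage against the Transformers dataframe built in In[6] (before df is
# reassigned to the crime dataset):
# lower, upper = tukey_bounds(df, "height(ft)")
# df.filter((F.col("height(ft)") < lower) | (F.col("height(ft)") > upper)).show()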
# In[55]:

df.cols.reverse("function").table()

# In[20]:

# NOTE: "mass (g)" (and the "price"/"product" columns below) belong to a dataset that is not loaded in this notebook.
outlier = df.outliers.tukey("mass (g)")

# In[28]:

# print(outlier.info())
outlier.select_lower_bound()

# In[256]:

# NOTE: keyCol and dc are imported in cell In[7] below; run that cell first.
keyCol.fingerprint(df, "product").table()

# In[245]:

keyCol.fingerprint(df, "names").table()

# In[259]:

keyCol.fingerprint_cluster(df, "product", output="json")

# In[261]:

keyCol.n_gram_fingerprint_cluster(df, "product", output="json", n_size=2)

# In[7]:

from optimus.ml import keycollision as keyCol
from optimus.ml import distancecluster as dc

# In[258]:

dc.levenshtein_cluster(df, "product", output="json")

# In[31]:

keyCol.n_gram_fingerprint_cluster(df, "names", n_size=1, output="json")

# In[25]:

df.table()

# In[81]:

# df = op.load.csv("data/foo.csv", sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")

# In[82]:

df.table()

# In[95]:

df.cols.replace("names", ["JaJa", "bbb"], "aaa", search_by="words").table()

# In[20]:

df.send()

# In[7]:

df.table(20)

# In[10]:

df.outliers.z_score("price", threshold=1).info()

# In[8]:

df.outliers.tukey("price").info()

# In[9]:

df.outliers.mad("price", threshold=1).info()

# In[11]:

df.outliers.modified_z_score("price", threshold=1).info()

# In[47]:

get_ipython().run_cell_magic('time', '', 'from optimus.ml import distancecluster as dc\nprint(dc.levenshtein_cluster(df,\'product\',output="json"))\n')

# In[51]:

from optimus.ml import distancecluster as dc
from optimus.ml import keycollision as kc

# result = dc.levenshtein_json(df,'product')
result = kc.fingerprint_cluster(df, "product", 3)

# In[62]:

result = kc.n_gram_fingerprint_cluster(df, "product", 3)

# In[63]:

print(result)

# In[159]:

type(result)

# In[68]:

# Collect the cluster result into a plain Python dict keyed by the first column of each row.
kv_dict = {}
for row in result.collect():
    _row = list(row.asDict().values())
    print(_row)
    kv_dict[_row[0]] = _row[1]

# In[69]:

print(kv_dict)

# In[46]:

# NOTE: "a" here is a dataframe with Levenshtein columns produced in an earlier session; it is not defined in this notebook.
a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()

# In[47]:

a.rows.drop(where=((a["product_LEVENSHTEIN_1"] != a["product_LEVENSHTEIN_2"]) &
                   (a["product***LEVENSHTEIN_DISTANCE"] == 0))).table()
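# In[ ]:

# A hedged sketch of the key-collision idea behind keyCol.fingerprint / fingerprint_cluster
# above: values that reduce to the same "fingerprint" key (lowercased, accents and
# punctuation stripped, tokens deduplicated and sorted) are grouped as likely duplicates.
# "fingerprint_key" and "cluster_by_fingerprint" are illustrative names, not the Optimus API.

import re
import string
import unicodedata
from collections import defaultdict

def fingerprint_key(value):
    """Return an OpenRefine-style fingerprint key for a string value."""
    value = value.strip().lower()
    # Drop accents, then punctuation.
    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = re.sub(f"[{re.escape(string.punctuation)}]", "", value)
    tokens = sorted(set(value.split()))
    return " ".join(tokens)

def cluster_by_fingerprint(values):
    """Group raw strings whose fingerprint keys collide."""
    clusters = defaultdict(list)
    for v in values:
        clusters[fingerprint_key(v)].append(v)
    return {k: vs for k, vs in clusters.items() if len(vs) > 1}

# Example:
# cluster_by_fingerprint(["Optimus Prime", "optimus  prime!", "Megatron"])
# -> {"optimus prime": ["Optimus Prime", "optimus  prime!"]}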