#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

import sys
sys.path.append("..")

# In[3]:

from optimus import Optimus

# In[4]:

op = Optimus(comm=True)

# In[6]:

from pyspark.sql.types import *
from datetime import date, datetime

cols = [
    ("names", "str"),
    ("height(ft)", ShortType()),
    ("function", "str"),
    ("rank", ByteType()),
    ("age", "int"),
    ("weight(t)", "float"),
    "japanese name",
    "last position seen",
    "date arrival",
    "last date seen",
    ("attributes", ArrayType(FloatType())),
    ("DateType", DateType()),
    ("timestamp", TimestampType()),
    ("Cybertronian", BooleanType()),
    ("function(binary)", BinaryType()),
    ("NullType", NullType())
]

rows = [
    ("argenisleon@gmail.com", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10", "2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"), None),
    ("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10", "2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"), None),
    ("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10", "2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"), None),
    ("1 Megatron", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10", "2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True, bytearray("First Lieutenant", "utf-8"), None),
    ("1 Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0], date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
    (None, 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10", [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
]

df = op.create.df(cols, rows, False).cache().repartition(1)

# In[7]:

df.ext.display(20)

# In[8]:

df = op.load.csv("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/crime.csv",
                 sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")

# In[9]:

df.ext.display()

# In[10]:

df.cols.count_by_dtypes("*", infer=False)

# In[11]:

from optimus.helpers.check import is_column_a

is_column_a(df, "OCCURRED_ON_DATE", "timestamp")
print(df.cols.schema_dtype("OCCURRED_ON_DATE"))

# In[12]:

df.dtypes

# In[36]:

df.cols.std("OCCURRED_ON_DATE")

# In[ ]:

df.ext.send("OCCURRED_ON_DATE")

# In[ ]:

df.cols.hist("*")

# In[45]:

df.cols.hist("INCIDENT_NUMBER")

# In[25]:

# NOTE: "height(ft)" belongs to the Transformers dataframe built in In[6], not the crime data now in df.
df.outliers.tukey("height(ft)").select().ext.display()

# In[8]:

# NOTE: "outlier" is only defined in a later cell, and "price" is not a column of the crime data.
outlier.hist("price")

# In[12]:

df.cols.count_by_dtypes("id")

# In[22]:

df.count()

# In[24]:

# NOTE: requires the "outlier" object defined in a later cell.
outlier.info()

# In[11]:

# df.table()

# In[12]:

# The original dict literal repeated the "names" key; Python keeps only the last value, {"names": "email"}.
df.cols.count_mismatch({"names": "email"})

# In[14]:

a = {'names': {'email': 1, 'mismatch': 4, 'null': 1}}

# In[15]:

tuple({"firstName": "string", "lastName": "array"}.values())

# In[16]:

from infer import Infer

# In[17]:

from infer import Infer

Infer.mismatch(("names", None), {"names": "email"})

# In[20]:

Infer.value(12, "string")

# In[36]:

list({"firstName": "string", "lastName": "string"}.keys())

# In[8]:

df.rows.select_by_dtypes("names", "str")

# In[117]:

# Keep rows whose height(ft) is between 17 and 26 (inclusive).
df.rows.between("height(ft)", 17, 26, invert=False, equal=True).table()
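# In[ ]:

# The Optimus outlier helpers used in this notebook (df.outliers.tukey, .z_score, .mad,
# .modified_z_score) wrap standard statistics. As a rough, hedged sketch of what Tukey's
# fences mean, the plain PySpark snippet below computes Q1/Q3 with approxQuantile and
# flags values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. The function name "tukey_bounds" is
# illustrative only and is not part of the Optimus API.

from pyspark.sql import functions as F

def tukey_bounds(sdf, col, k=1.5, rel_error=0.01):
    """Return (lower, upper) Tukey fences for a numeric Spark column."""
    q1, q3 = sdf.approxQuantile(col, [0.25, 0.75], rel_error)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

# Example usage against the Transformers dataframe built in In[6] (before df is
# reassigned to the crime dataset):
# lower, upper = tukey_bounds(df, "height(ft)")
# df.filter((F.col("height(ft)") < lower) | (F.col("height(ft)") > upper)).show()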
# In[55]:

df.cols.reverse("function").table()

# In[20]:

# NOTE: "mass (g)" (and the "price"/"product" columns below) belong to a dataset that is not loaded in this notebook.
outlier = df.outliers.tukey("mass (g)")

# In[28]:

# print(outlier.info())
outlier.select_lower_bound()

# In[256]:

# NOTE: keyCol and dc are imported in cell In[7] below; run that cell first.
keyCol.fingerprint(df, "product").table()

# In[245]:

keyCol.fingerprint(df, "names").table()

# In[259]:

keyCol.fingerprint_cluster(df, "product", output="json")

# In[261]:

keyCol.n_gram_fingerprint_cluster(df, "product", output="json", n_size=2)

# In[7]:

from optimus.ml import keycollision as keyCol
from optimus.ml import distancecluster as dc

# In[258]:

dc.levenshtein_cluster(df, "product", output="json")

# In[31]:

keyCol.n_gram_fingerprint_cluster(df, "names", n_size=1, output="json")

# In[25]:

df.table()

# In[81]:

# df = op.load.csv("data/foo.csv", sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")

# In[82]:

df.table()

# In[95]:

df.cols.replace("names", ["JaJa", "bbb"], "aaa", search_by="words").table()

# In[20]:

df.send()

# In[7]:

df.table(20)

# In[10]:

df.outliers.z_score("price", threshold=1).info()

# In[8]:

df.outliers.tukey("price").info()

# In[9]:

df.outliers.mad("price", threshold=1).info()

# In[11]:

df.outliers.modified_z_score("price", threshold=1).info()

# In[47]:

get_ipython().run_cell_magic('time', '', 'from optimus.ml import distancecluster as dc\nprint(dc.levenshtein_cluster(df,\'product\',output="json"))\n')

# In[51]:

from optimus.ml import distancecluster as dc
from optimus.ml import keycollision as kc

# result = dc.levenshtein_json(df,'product')
result = kc.fingerprint_cluster(df, "product", 3)

# In[62]:

result = kc.n_gram_fingerprint_cluster(df, "product", 3)

# In[63]:

print(result)

# In[159]:

type(result)

# In[68]:

# Collect the cluster result into a plain Python dict keyed by the first column of each row.
kv_dict = {}
for row in result.collect():
    _row = list(row.asDict().values())
    print(_row)
    kv_dict[_row[0]] = _row[1]

# In[69]:

print(kv_dict)

# In[46]:

# NOTE: "a" here is a dataframe with Levenshtein columns produced in an earlier session; it is not defined in this notebook.
a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()

# In[47]:

a.rows.drop(where=((a["product_LEVENSHTEIN_1"] != a["product_LEVENSHTEIN_2"]) &
                   (a["product***LEVENSHTEIN_DISTANCE"] == 0))).table()
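# In[ ]:

# A hedged sketch of the key-collision idea behind keyCol.fingerprint / fingerprint_cluster
# above: values that reduce to the same "fingerprint" key (lowercased, accents and
# punctuation stripped, tokens deduplicated and sorted) are grouped as likely duplicates.
# "fingerprint_key" and "cluster_by_fingerprint" are illustrative names, not the Optimus API.

import re
import string
import unicodedata
from collections import defaultdict

def fingerprint_key(value):
    """Return an OpenRefine-style fingerprint key for a string value."""
    value = value.strip().lower()
    # Drop accents, then punctuation.
    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = re.sub(f"[{re.escape(string.punctuation)}]", "", value)
    tokens = sorted(set(value.split()))
    return " ".join(tokens)

def cluster_by_fingerprint(values):
    """Group raw strings whose fingerprint keys collide."""
    clusters = defaultdict(list)
    for v in values:
        clusters[fingerprint_key(v)].append(v)
    return {k: vs for k, vs in clusters.items() if len(vs) > 1}

# Example:
# cluster_by_fingerprint(["Optimus Prime", "optimus  prime!", "Megatron"])
# -> {"optimus prime": ["Optimus Prime", "optimus  prime!"]}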