import pandas as pd
import numpy as np

# Mount Google Drive
from google.colab import drive
import os
drive.mount('/content/gdrive')

# Set the working directory on Drive
print(os.getcwd())
os.chdir("/content/gdrive/My Drive")

from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Load the data
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Fit a simple decision tree with default hyperparameters
clf = DecisionTreeClassifier(random_state=1234)
model = clf.fit(X, y)

# Plot the fitted tree
fig = plt.figure(figsize=(18, 10))
_ = tree.plot_tree(clf,
                   feature_names=iris.feature_names,
                   class_names=iris.target_names,
                   filled=True)

!pip install dtreeviz
# Note: this import matches the dtreeviz 1.x API; 2.x replaced it with dtreeviz.model(...)
from dtreeviz.trees import dtreeviz

# A different way to visualize the tree
viz = dtreeviz(clf, X, y,
               target_name="target",
               feature_names=iris.feature_names,
               class_names=list(iris.target_names))
viz.save("decision_tree.svg")  # Save the image
viz  # Display the visualization

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets, neighbors
from mlxtend.plotting import plot_decision_regions

def knn_comparison(data, k):
    """Fit a k-NN classifier and plot its decision regions."""
    x = data[['X', 'Y']].values           # Extract the feature columns
    y = data['class'].astype(int).values  # Class labels as int
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(x, y)
    # Plot the decision regions
    plot_decision_regions(x, y, clf=clf, legend=2)
    # Add annotations
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('Knn with K=' + str(k))
    plt.show()

# Load the data and apply the function
data1 = pd.read_csv('ushape.csv')
for i in [1, 5, 20, 30, 40, 80]:  # For different values of k
    knn_comparison(data1, i)

# Concentric-circles data
data2 = pd.read_csv('concertriccir2.csv')
for i in [1, 5, 20, 30, 40, 60]:
    knn_comparison(data2, i)

# XOR data
data3 = pd.read_csv('xor.csv')
for i in [1, 5, 20, 30, 40, 60]:
    knn_comparison(data3, i)

# Linearly separable data
data4 = pd.read_csv('linearsep.csv')
for i in [1, 5, 20, 30, 40, 60]:
    knn_comparison(data4, i)

# Data with outliers
data5 = pd.read_csv('outlier.csv')
for i in [1, 5, 20, 30, 40, 60]:
    knn_comparison(data5, i)

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X
y

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = LogisticRegression(max_iter=10000, n_jobs=-1)

# Fit the model
model.fit(X_train, y_train)

# Predictions
predicciones = model.predict(X_test)
predicciones
print(accuracy_score(y_test, predicciones))

from sklearn.metrics import confusion_matrix
# Confusion matrix
cf_matrix = confusion_matrix(y_test, predicciones)

import seaborn as sns
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
ax.set_title('Confusion matrix with labels\n\n')
ax.set_xlabel('\nPredicted values')
ax.set_ylabel('Actual values')
# Tick labels, in alphabetical order
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])
plt.show()
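# --- Optional sketch (an addition, not part of the original notebook) ---
# The single accuracy number above can hide class-specific behavior.
# A classification report summarizes per-class precision, recall, and F1,
# reusing the y_test / predicciones arrays from the cell above.
from sklearn.metrics import classification_report

print(classification_report(y_test, predicciones))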
print("Defining the stock symbols")
symbol_data_to_load = ['D', 'EXC', 'NEE', 'SO', 'DUK']
list_of_df = []

# Loop over the symbols to fill the list of DataFrames
print(" --- Loop start --- ")
for i in symbol_data_to_load:
    print("Processing symbol: " + i)
    temp_df = pd.read_csv(i + '.csv', sep=',')
    temp_df['Volume_Millions'] = temp_df['Volume'] / 1000000.0
    temp_df['Symbol'] = i  # Add a new column with the symbol
    list_of_df.append(temp_df)
print(" --- Symbol loop completed --- ")

# Combine into a single DataFrame using concat,
# which stacks the DataFrames in the list
print("Aggregating the data")
agg_df = pd.concat(list_of_df, axis=0)

# Add return and volatility statistics;
# it is faster to compute them on the whole DataFrame than record by record
print('Computing final statistics')
agg_df['VolStat'] = (agg_df['High'] - agg_df['Low']) / agg_df['Open']
agg_df['Return'] = (agg_df['Close'] / agg_df['Open']) - 1.0

print("agg_df DataFrame dimensions (rows, columns): ")
print(agg_df.shape)
print("Head of the agg_df DataFrame: ")
agg_df.head()

# print("agg_df['Symbol'].unique()")
agg_df.Symbol.unique()

### Load relevant packages
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
# https://community.plot.ly/t/solved-update-to-plotly-4-0-0-broke-application/26526/2
import os
# %matplotlib inline
# plt.style.use('ggplot')
from bokeh.resources import INLINE
import bokeh.io
from bokeh import *

# Work on a copy so the scaling below does not trigger
# pandas' SettingWithCopyWarning on a slice of agg_df
data = agg_df[['Open', 'High', 'Low', 'Close', 'Volume_Millions',
               'Symbol', 'VolStat', 'Return']].copy()
data.head()

def min_max_scaling(series):
    """Rescale a series to the [0, 1] range."""
    return (series - series.min()) / (series.max() - series.min())

# Scale every column except the categorical Symbol column
for col in data.columns:
    if col != 'Symbol':
        data[col] = min_max_scaling(data[col])
data.head()

agg_df.columns

model1 = 'VolStat ~ Open + High + Low + Close + Volume_Millions + Symbol'
lm1 = sm.ols(formula=model1, data=data).fit()
print(lm1.summary())

model2 = 'Return ~ Open + High + Low + Close + Volume_Millions + Symbol'
lm2 = sm.ols(formula=model2, data=data).fit()
print(lm2.summary())
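# --- Optional sketch (an addition, not part of the original notebook) ---
# A quick residual check for the VolStat model fitted above: if the linear
# specification is reasonable, residuals should scatter around zero with
# no obvious pattern against the fitted values. Uses the lm1 result object.
plt.scatter(lm1.fittedvalues, lm1.resid, s=5, alpha=0.5)
plt.axhline(0, color='red', linewidth=1)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs. fitted values (VolStat model)')
plt.show()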