import pandas as pd
import numpy as np
# Mount Google Drive inside the Colab runtime
from google.colab import drive
import os
drive.mount('/content/gdrive')
# Print the current working directory, then switch to the Drive root so that
# the relative file paths used later resolve there
import os
print(os.getcwd())
os.chdir("/content/gdrive/My Drive")
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True). /content/gdrive/My Drive
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# Load the iris dataset: feature matrix X and class labels y
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Fit a simple decision tree with default hyperparameters (only the seed is fixed)
clf = DecisionTreeClassifier(random_state=1234)
model = clf.fit(X, y)
# Plot the fitted tree, labelling nodes with the iris feature and class names
fig = plt.figure(figsize=(18,10))
_ = tree.plot_tree(clf,feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True)
pip install dtreeviz
Collecting dtreeviz Downloading dtreeviz-1.3.2.tar.gz (62 kB) |████████████████████████████████| 62 kB 696 kB/s Requirement already satisfied: graphviz>=0.9 in /usr/local/lib/python3.7/dist-packages (from dtreeviz) (0.10.1) Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from dtreeviz) (1.1.5) Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from dtreeviz) (1.19.5) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from dtreeviz) (1.0.2) Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from dtreeviz) (3.2.2) Collecting colour Downloading colour-0.1.5-py2.py3-none-any.whl (23 kB) Requirement already satisfied: pytest in /usr/local/lib/python3.7/dist-packages (from dtreeviz) (3.6.4) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->dtreeviz) (3.0.6) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->dtreeviz) (2.8.2) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->dtreeviz) (1.3.2) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->dtreeviz) (0.11.0) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->dtreeviz) (1.15.0) Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->dtreeviz) (2018.9) Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.7/dist-packages (from pytest->dtreeviz) (0.7.1) Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from pytest->dtreeviz) (8.12.0) Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from pytest->dtreeviz) (57.4.0) Requirement 
already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from pytest->dtreeviz) (21.4.0) Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.7/dist-packages (from pytest->dtreeviz) (1.4.0) Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pytest->dtreeviz) (1.11.0) Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->dtreeviz) (1.4.1) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->dtreeviz) (3.0.0) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->dtreeviz) (1.1.0) Building wheels for collected packages: dtreeviz Building wheel for dtreeviz (setup.py) ... done Created wheel for dtreeviz: filename=dtreeviz-1.3.2-py3-none-any.whl size=67935 sha256=480b16be5581347fb6c82165aa7176d807483284aff3b16830ae14cd71a21943 Stored in directory: /root/.cache/pip/wheels/9d/29/a1/f2ad20de79875e749330d5c6234fc5f517991fcaa23d7a3d0f Successfully built dtreeviz Installing collected packages: colour, dtreeviz Successfully installed colour-0.1.5 dtreeviz-1.3.2
from dtreeviz.trees import dtreeviz
# A different way of visualising the same fitted tree, using dtreeviz
viz = dtreeviz(clf, X, y,
target_name="target",
feature_names=iris.feature_names,
class_names=list(iris.target_names))
viz.save("decision_tree.svg") # Save the image
viz
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray return array(a, dtype, copy=False, order=order)
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets, neighbors
from mlxtend.plotting import plot_decision_regions
def knn_comparison(data, k):
    """Fit a k-nearest-neighbours classifier and plot its decision regions.

    Args:
        data: table indexable by column label that provides the feature
            columns 'X' and 'Y' and a label column 'class' (callers in this
            file pass DataFrames loaded with pd.read_csv).
        k: number of neighbours passed to KNeighborsClassifier.

    Returns:
        None. The figure is displayed with plt.show().
    """
    features = data[['X', 'Y']].values
    labels = data['class'].astype(int).values  # class labels cast to int

    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    knn.fit(features, labels)

    # Draw the decision regions, then annotate the figure
    plot_decision_regions(features, labels, clf=knn, legend=2)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title('Knn with K=' + str(k))
    plt.show()
# Load 'ushape.csv' and plot the KNN decision regions for several values of k
data1 = pd.read_csv('ushape.csv')
for n_neighbors in (1, 5, 20, 30, 40, 80):
    knn_comparison(data1, n_neighbors)
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
# Concentric data: plot the KNN decision regions for several values of k
data2 = pd.read_csv('concertriccir2.csv')
for n_neighbors in (1, 5, 20, 30, 40, 60):
    knn_comparison(data2, n_neighbors)
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
# XOR data: plot the KNN decision regions for several values of k
data3 = pd.read_csv('xor.csv')
for n_neighbors in (1, 5, 20, 30, 40, 60):
    knn_comparison(data3, n_neighbors)
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
# Linearly separable data: plot the KNN decision regions for several values of k
data4 = pd.read_csv('linearsep.csv')
for n_neighbors in (1, 5, 20, 30, 40, 60):
    knn_comparison(data4, n_neighbors)
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
# Data with outliers: plot the KNN decision regions for several values of k
data5 = pd.read_csv('outlier.csv')
for n_neighbors in (1, 5, 20, 30, 40, 60):
    knn_comparison(data5, n_neighbors)
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
/usr/local/lib/python3.7/dist-packages/mlxtend/plotting/decision_regions.py:244: MatplotlibDeprecationWarning: Passing unsupported keyword arguments to axis() will raise a TypeError in 3.3. ax.axis(xmin=xx.min(), xmax=xx.max(), y_min=yy.min(), y_max=yy.max())
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Load the breast-cancer dataset as a (features, labels) pair.
# This rebinds X and y, which previously held the iris data.
X, y = load_breast_cancer(return_X_y=True)
X
array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01, 1.189e-01], [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01, 8.902e-02], [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01, 8.758e-02], ..., [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01, 7.820e-02], [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01, 1.240e-01], [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01, 7.039e-02]])
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])
# Train/test split. random_state is fixed (same seed as the decision tree
# earlier in this file) so the split, and therefore every metric computed
# from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)
model = LogisticRegression(max_iter=10000, n_jobs=-1)
# Fit the model on the training portion
model.fit(X_train, y_train)
# Predictions for the held-out test portion
predicciones = model.predict(X_test)
predicciones
array([0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1])
# Accuracy of the predictions against the true test labels
print(accuracy_score(y_test, predicciones))
0.916083916083916
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted test labels
cf_matrix = confusion_matrix(y_test, predicciones)
import seaborn as sns
# Draw the matrix as an annotated heatmap
# NOTE(review): seaborn's default annotation format is '.2g'; consider fmt='d'
# if any cell count can reach three digits — confirm against the seaborn docs.
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
ax.set_title('Matriz de confusion con labels\n\n');
ax.set_xlabel('\nValores predichos')
ax.set_ylabel('Valores reales ');
## Tick labels - in alphabetical order
# NOTE(review): classes 0/1 are displayed as 'False'/'True' — confirm this
# matches the intended meaning of each class.
ax.xaxis.set_ticklabels(['False','True'])
ax.yaxis.set_ticklabels(['False','True'])
plt.show()
Combinemos los cinco archivos CSV para analizar juntas las cinco empresas con las que hemos trabajado antes. Esto también reducirá la cantidad de trabajo de programación requerido, ya que el código se compartirá entre las cinco empresas.
Una forma de realizar esta tarea de agregación es utilizar el método pd.concat() de pandas. Una entrada de este método puede ser una lista de los DataFrames que se desea concatenar. Usaremos un bucle for para recorrer cada símbolo bursátil, cargar el archivo CSV correspondiente y añadir el resultado a una lista, que luego se combina usando pd.concat(). Echemos un vistazo a cómo se hace esto.
print("Definiendo los simbolos de stock")
symbol_data_to_load = ['D','EXC','NEE','SO','DUK']
list_of_df = []
# Loop over the symbols, filling the list with one DataFrame per symbol
print(" --- Inicio de Loop --- ")
for i in symbol_data_to_load:
    print("Procesando Simbolo: " + i)
    temp_df = pd.read_csv(i+'.csv',sep=',')
    temp_df['Volume_Millions'] = temp_df['Volume'] / 1000000.0
    temp_df['Symbol'] = i  # tag every row with the symbol it was loaded for
    list_of_df.append(temp_df)
print(" --- Completado loop simbolos --- ")
# Combine the per-symbol frames into a single DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# symbol's rows keep their own 0-based labels and the combined index contains
# duplicates.
print("Agregando la data")
agg_df = pd.concat(list_of_df, axis=0, ignore_index=True)
# Add the volatility and return statistics.
# They are computed once on the combined frame instead of record by record.
print('Calculando estadisticas finales')
agg_df['VolStat'] = (agg_df['High'] - agg_df['Low']) / agg_df['Open']
agg_df['Return'] = (agg_df['Close'] / agg_df['Open']) - 1.0
print("agg_df DataFrame dimension (filas, columnas): ")
print(agg_df.shape)
print("Head del DataFrame agg_df: ")
agg_df.head()
Definiendo los simbolos de stock --- Inicio de Loop --- Procesando Simbolo: D Procesando Simbolo: EXC Procesando Simbolo: NEE Procesando Simbolo: SO Procesando Simbolo: DUK --- Completado loop simbolos --- Agregando la data Calculando estadisticas finales agg_df DataFrame dimension (filas, columnas): (6295, 11) Head del DataFrame agg_df:
Date | Open | High | Low | Close | Adj Close | Volume | Volume_Millions | Symbol | VolStat | Return | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014-07-28 | 69.750000 | 71.059998 | 69.750000 | 70.879997 | 57.963978 | 1806400 | 1.8064 | D | 0.018781 | 0.016201 |
1 | 2014-07-29 | 70.669998 | 70.980003 | 69.930000 | 69.930000 | 57.187099 | 2231100 | 2.2311 | D | 0.014858 | -0.010471 |
2 | 2014-07-30 | 70.000000 | 70.660004 | 68.400002 | 68.970001 | 56.402020 | 2588900 | 2.5889 | D | 0.032286 | -0.014714 |
3 | 2014-07-31 | 68.629997 | 68.849998 | 67.580002 | 67.639999 | 55.314388 | 3266900 | 3.2669 | D | 0.018505 | -0.014425 |
4 | 2014-08-01 | 67.330002 | 68.410004 | 67.220001 | 67.589996 | 55.273487 | 2601800 | 2.6018 | D | 0.017674 | 0.003861 |
agg_df.Symbol.unique()
array(['D', 'EXC', 'NEE', 'SO', 'DUK'], dtype=object)
### Load relevant packages
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
# https://community.plot.ly/t/solved-update-to-plotly-4-0-0-broke-application/26526/2
import os
#%matplotlib inline
#plt.style.use('ggplot')
from bokeh.resources import INLINE
import bokeh.io
from bokeh import *
Primero debemos estandarizar con el proceso min-max scaler
# Keep only the columns used for modelling. .copy() makes `data` an explicit,
# independent DataFrame, which avoids the SettingWithCopyWarning otherwise
# emitted when its columns are overwritten during scaling.
data = agg_df[['Open','High','Low','Close','Volume_Millions','Symbol','VolStat','Return']].copy()
data.head()
Open | High | Low | Close | Volume_Millions | Symbol | VolStat | Return | |
---|---|---|---|---|---|---|---|---|
0 | 69.750000 | 71.059998 | 69.750000 | 70.879997 | 1.8064 | D | 0.018781 | 0.016201 |
1 | 70.669998 | 70.980003 | 69.930000 | 69.930000 | 2.2311 | D | 0.014858 | -0.010471 |
2 | 70.000000 | 70.660004 | 68.400002 | 68.970001 | 2.5889 | D | 0.032286 | -0.014714 |
3 | 68.629997 | 68.849998 | 67.580002 | 67.639999 | 3.2669 | D | 0.018505 | -0.014425 |
4 | 67.330002 | 68.410004 | 67.220001 | 67.589996 | 2.6018 | D | 0.017674 | 0.003861 |
def min_max_scaling(series):
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        series: numeric array-like supporting ``.min()``, ``.max()`` and
            element-wise arithmetic (e.g. a pandas Series).

    Returns:
        An object of the same shape holding ``(x - min) / (max - min)``.
        A constant input (max == min) is mapped to all zeros instead of the
        NaN values a 0/0 division would produce.
    """
    low = series.min()
    span = series.max() - low
    if span == 0:
        # Constant input: divide by 1 so every value becomes 0.0 rather than NaN
        span = 1.0
    return (series - low) / span
# Apply min-max scaling to every column except 'Symbol', which holds the
# symbol label rather than a number
numeric_columns = [name for name in data.columns if name != 'Symbol']
for name in numeric_columns:
    data[name] = min_max_scaling(data[name])
data.head()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy import sys
Open | High | Low | Close | Volume_Millions | Symbol | VolStat | Return | |
---|---|---|---|---|---|---|---|---|
0 | 0.237520 | 0.241568 | 0.242309 | 0.243018 | 0.048378 | D | 0.133693 | 0.774064 |
1 | 0.242458 | 0.241139 | 0.243286 | 0.237935 | 0.064767 | D | 0.099260 | 0.567006 |
2 | 0.238862 | 0.239423 | 0.234985 | 0.232798 | 0.078575 | D | 0.252211 | 0.534066 |
3 | 0.231508 | 0.229717 | 0.230536 | 0.225682 | 0.104740 | D | 0.131268 | 0.536311 |
4 | 0.224530 | 0.227358 | 0.228582 | 0.225415 | 0.079073 | D | 0.123977 | 0.678273 |
agg_df.columns
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Volume_Millions', 'Symbol', 'VolStat', 'Return'], dtype='object')
# OLS regression of VolStat on the price and volume columns plus Symbol,
# fitted on the min-max scaled `data` frame
model1 = 'VolStat~Open+ High+ Low+ Close + Volume_Millions +Symbol'
lm1 = sm.ols(formula = model1, data = data).fit()
print(lm1.summary())
OLS Regression Results ============================================================================== Dep. Variable: VolStat R-squared: 0.834 Model: OLS Adj. R-squared: 0.834 Method: Least Squares F-statistic: 3516. Date: Sun, 02 Jan 2022 Prob (F-statistic): 0.00 Time: 19:51:12 Log-Likelihood: 14263. No. Observations: 6295 AIC: -2.851e+04 Df Residuals: 6285 BIC: -2.844e+04 Df Model: 9 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- Intercept 0.1281 0.001 89.511 0.000 0.125 0.131 Symbol[T.DUK] -0.0020 0.001 -1.927 0.054 -0.004 3.36e-05 Symbol[T.EXC] -0.0043 0.001 -3.174 0.002 -0.007 -0.002 Symbol[T.NEE] 0.0072 0.002 4.292 0.000 0.004 0.011 Symbol[T.SO] -0.0188 0.001 -15.499 0.000 -0.021 -0.016 Open -0.5613 0.161 -3.478 0.001 -0.878 -0.245 High 17.0916 0.184 92.739 0.000 16.730 17.453 Low -16.1575 0.184 -87.950 0.000 -16.518 -15.797 Close -0.4239 0.165 -2.565 0.010 -0.748 -0.100 Volume_Millions 0.1905 0.005 37.250 0.000 0.180 0.201 ============================================================================== Omnibus: 1459.013 Durbin-Watson: 1.482 Prob(Omnibus): 0.000 Jarque-Bera (JB): 32545.425 Skew: 0.562 Prob(JB): 0.00 Kurtosis: 14.082 Cond. No. 1.11e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.11e+03. This might indicate that there are strong multicollinearity or other numerical problems.
En general, el modelo tiene un valor alto de $R^2$ (0.834), lo cual implica que explica aproximadamente el 83% de la varianza de la variable dependiente (VolStat). De igual forma, la regresión es significativa (al menos un coeficiente es diferente de 0), según el valor p obtenido de la prueba F.
Respecto a los coeficientes se puede decir que:
# OLS regression of Return on the same predictors (price and volume columns
# plus Symbol), fitted on the min-max scaled `data` frame
model2 = 'Return~Open+ High+ Low+ Close + Volume_Millions +Symbol'
lm2 = sm.ols(formula = model2, data = data).fit()
print(lm2.summary())
OLS Regression Results ============================================================================== Dep. Variable: Return R-squared: 0.788 Model: OLS Adj. R-squared: 0.787 Method: Least Squares F-statistic: 2589. Date: Sun, 02 Jan 2022 Prob (F-statistic): 0.00 Time: 19:51:55 Log-Likelihood: 12186. No. Observations: 6295 AIC: -2.435e+04 Df Residuals: 6285 BIC: -2.428e+04 Df Model: 9 Covariance Type: nonrobust =================================================================================== coef std err t P>|t| [0.025 0.975] ----------------------------------------------------------------------------------- Intercept 0.6495 0.002 326.192 0.000 0.646 0.653 Symbol[T.DUK] 0.0002 0.001 0.158 0.875 -0.003 0.003 Symbol[T.EXC] 0.0010 0.002 0.548 0.584 -0.003 0.005 Symbol[T.NEE] -0.0006 0.002 -0.255 0.799 -0.005 0.004 Symbol[T.SO] 0.0015 0.002 0.885 0.376 -0.002 0.005 Open -16.2288 0.224 -72.296 0.000 -16.669 -15.789 High 0.1291 0.256 0.504 0.615 -0.373 0.632 Low -0.0505 0.256 -0.198 0.843 -0.551 0.450 Close 16.1929 0.230 70.439 0.000 15.742 16.644 Volume_Millions -0.0142 0.007 -1.992 0.046 -0.028 -0.000 ============================================================================== Omnibus: 1183.972 Durbin-Watson: 1.963 Prob(Omnibus): 0.000 Jarque-Bera (JB): 28613.068 Skew: -0.227 Prob(JB): 0.00 Kurtosis: 13.435 Cond. No. 1.11e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.11e+03. This might indicate that there are strong multicollinearity or other numerical problems.
Curiosamente en este caso se observa que: