import pandas as pd
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'first': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'second': range(3)})
df1
key | first | |
---|---|---|
0 | b | 0 |
1 | b | 1 |
2 | a | 2 |
3 | c | 3 |
4 | a | 4 |
5 | a | 5 |
6 | b | 6 |
df2
key | second | |
---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | d | 2 |
pd.merge(left=df1,right=df2, on ='key',how='left')
key | first | second | |
---|---|---|---|
0 | b | 0 | 1.0 |
1 | b | 1 | 1.0 |
2 | a | 2 | 0.0 |
3 | c | 3 | NaN |
4 | a | 4 | 0.0 |
5 | a | 5 | 0.0 |
6 | b | 6 | 1.0 |
pd.merge(df1,df2, on ='key',how= 'inner')
key | first | second | |
---|---|---|---|
0 | b | 0 | 1 |
1 | b | 1 | 1 |
2 | b | 6 | 1 |
3 | a | 2 | 0 |
4 | a | 4 | 0 |
5 | a | 5 | 0 |
pd.merge(df1,df2, on ='key',how= 'outer')
key | first | second | |
---|---|---|---|
0 | b | 0.0 | 1.0 |
1 | b | 1.0 | 1.0 |
2 | b | 6.0 | 1.0 |
3 | a | 2.0 | 0.0 |
4 | a | 4.0 | 0.0 |
5 | a | 5.0 | 0.0 |
6 | c | 3.0 | NaN |
7 | d | NaN | 2.0 |
pd.merge(df1,df2, on ='key',how= 'left')
key | first | second | |
---|---|---|---|
0 | b | 0 | 1.0 |
1 | b | 1 | 1.0 |
2 | a | 2 | 0.0 |
3 | c | 3 | NaN |
4 | a | 4 | 0.0 |
5 | a | 5 | 0.0 |
6 | b | 6 | 1.0 |
pd.merge(df1,df2, on ='key',how= 'right')
key | first | second | |
---|---|---|---|
0 | a | 2.0 | 0 |
1 | a | 4.0 | 0 |
2 | a | 5.0 | 0 |
3 | b | 0.0 | 1 |
4 | b | 1.0 | 1 |
5 | b | 6.0 | 1 |
6 | d | NaN | 2 |
from google.colab import drive
import os
drive.mount('/content/gdrive')
# Establecer ruta de acceso en drive
import os
print(os.getcwd())
#os.chdir("/content/gdrive/My Drive")
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True). /content/gdrive/My Drive/Casos de Estudio Coderhouse
%cd '/content/gdrive/MyDrive/Casos de Estudio Coderhouse'
/content/gdrive/MyDrive/Casos de Estudio Coderhouse
clase0=pd.read_excel('clase0.xlsx')
clase0
key1 | key2 | data | |
---|---|---|---|
0 | Buenos Aires | 2020 | 0 |
1 | Buenos Aires | 2021 | 1 |
2 | Buenos Aires | 2022 | 2 |
3 | Cordoba | 2021 | 3 |
4 | Cordoba | 2022 | 4 |
5 | Santa Fe | 2020 | 5 |
6 | Santa Fe | 2020 | 6 |
clase1=pd.read_excel('clase1.xlsx')
clase1
key1 | key2 | event1 | event2 | |
---|---|---|---|---|
0 | Cordoba | 2021 | 0 | 1 |
1 | Cordoba | 2020 | 2 | 3 |
2 | Buenos Aires | 2020 | 4 | 5 |
3 | Buenos Aires | 2020 | 6 | 7 |
4 | Buenos Aires | 2021 | 8 | 9 |
5 | Buenos Aires | 2022 | 10 | 11 |
6 | Santa Fe | 2020 | 12 | 13 |
7 | Santa Fe | 2019 | 14 | 15 |
clase1.groupby(by=['key1','key2']).mean()
event1 | event2 | ||
---|---|---|---|
key1 | key2 | ||
Buenos Aires | 2020 | 5.0 | 6.0 |
2021 | 8.0 | 9.0 | |
2022 | 10.0 | 11.0 | |
Cordoba | 2020 | 2.0 | 3.0 |
2021 | 0.0 | 1.0 | |
Santa Fe | 2019 | 14.0 | 15.0 |
2020 | 12.0 | 13.0 |
pd.merge(clase0,clase1, on =['key1','key2'])
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Buenos Aires | 2020 | 0 | 4 | 5 |
1 | Buenos Aires | 2020 | 0 | 6 | 7 |
2 | Buenos Aires | 2021 | 1 | 8 | 9 |
3 | Buenos Aires | 2022 | 2 | 10 | 11 |
4 | Cordoba | 2021 | 3 | 0 | 1 |
5 | Santa Fe | 2020 | 5 | 12 | 13 |
6 | Santa Fe | 2020 | 6 | 12 | 13 |
pd.merge(clase0,clase1, on =['key1','key2'],how= 'inner')
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Buenos Aires | 2020 | 0 | 4 | 5 |
1 | Buenos Aires | 2020 | 0 | 6 | 7 |
2 | Buenos Aires | 2021 | 1 | 8 | 9 |
3 | Buenos Aires | 2022 | 2 | 10 | 11 |
4 | Cordoba | 2021 | 3 | 0 | 1 |
5 | Santa Fe | 2020 | 5 | 12 | 13 |
6 | Santa Fe | 2020 | 6 | 12 | 13 |
pd.merge(clase0,clase1, on =['key1','key2'],how= 'outer')
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Buenos Aires | 2020 | 0.0 | 4.0 | 5.0 |
1 | Buenos Aires | 2020 | 0.0 | 6.0 | 7.0 |
2 | Buenos Aires | 2021 | 1.0 | 8.0 | 9.0 |
3 | Buenos Aires | 2022 | 2.0 | 10.0 | 11.0 |
4 | Cordoba | 2021 | 3.0 | 0.0 | 1.0 |
5 | Cordoba | 2022 | 4.0 | NaN | NaN |
6 | Santa Fe | 2020 | 5.0 | 12.0 | 13.0 |
7 | Santa Fe | 2020 | 6.0 | 12.0 | 13.0 |
8 | Cordoba | 2020 | NaN | 2.0 | 3.0 |
9 | Santa Fe | 2019 | NaN | 14.0 | 15.0 |
pd.merge(clase0,clase1, on =['key1','key2'],how= 'left')
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Buenos Aires | 2020 | 0 | 4.0 | 5.0 |
1 | Buenos Aires | 2020 | 0 | 6.0 | 7.0 |
2 | Buenos Aires | 2021 | 1 | 8.0 | 9.0 |
3 | Buenos Aires | 2022 | 2 | 10.0 | 11.0 |
4 | Cordoba | 2021 | 3 | 0.0 | 1.0 |
5 | Cordoba | 2022 | 4 | NaN | NaN |
6 | Santa Fe | 2020 | 5 | 12.0 | 13.0 |
7 | Santa Fe | 2020 | 6 | 12.0 | 13.0 |
pd.merge(clase0,clase1, on =['key1','key2'],how= 'right')
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Cordoba | 2021 | 3.0 | 0 | 1 |
1 | Cordoba | 2020 | NaN | 2 | 3 |
2 | Buenos Aires | 2020 | 0.0 | 4 | 5 |
3 | Buenos Aires | 2020 | 0.0 | 6 | 7 |
4 | Buenos Aires | 2021 | 1.0 | 8 | 9 |
5 | Buenos Aires | 2022 | 2.0 | 10 | 11 |
6 | Santa Fe | 2020 | 5.0 | 12 | 13 |
7 | Santa Fe | 2020 | 6.0 | 12 | 13 |
8 | Santa Fe | 2019 | NaN | 14 | 15 |
c1=pd.merge(clase0,clase1, on =['key1','key2'],how= 'right')
c2=pd.merge(clase0,clase1, on =['key1','key2'],how= 'left')
c3=pd.merge(clase0,clase1, on =['key1','key2'],how= 'outer')
c4=pd.merge(clase0,clase1, on =['key1','key2'],how= 'inner')
pd.concat([c1,c2,c3,c4],axis=0)
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Cordoba | 2021 | 3.0 | 0.0 | 1.0 |
1 | Cordoba | 2020 | NaN | 2.0 | 3.0 |
2 | Buenos Aires | 2020 | 0.0 | 4.0 | 5.0 |
3 | Buenos Aires | 2020 | 0.0 | 6.0 | 7.0 |
4 | Buenos Aires | 2021 | 1.0 | 8.0 | 9.0 |
5 | Buenos Aires | 2022 | 2.0 | 10.0 | 11.0 |
6 | Santa Fe | 2020 | 5.0 | 12.0 | 13.0 |
7 | Santa Fe | 2020 | 6.0 | 12.0 | 13.0 |
8 | Santa Fe | 2019 | NaN | 14.0 | 15.0 |
0 | Buenos Aires | 2020 | 0.0 | 4.0 | 5.0 |
1 | Buenos Aires | 2020 | 0.0 | 6.0 | 7.0 |
2 | Buenos Aires | 2021 | 1.0 | 8.0 | 9.0 |
3 | Buenos Aires | 2022 | 2.0 | 10.0 | 11.0 |
4 | Cordoba | 2021 | 3.0 | 0.0 | 1.0 |
5 | Cordoba | 2022 | 4.0 | NaN | NaN |
6 | Santa Fe | 2020 | 5.0 | 12.0 | 13.0 |
7 | Santa Fe | 2020 | 6.0 | 12.0 | 13.0 |
0 | Buenos Aires | 2020 | 0.0 | 4.0 | 5.0 |
1 | Buenos Aires | 2020 | 0.0 | 6.0 | 7.0 |
2 | Buenos Aires | 2021 | 1.0 | 8.0 | 9.0 |
3 | Buenos Aires | 2022 | 2.0 | 10.0 | 11.0 |
4 | Cordoba | 2021 | 3.0 | 0.0 | 1.0 |
5 | Cordoba | 2022 | 4.0 | NaN | NaN |
6 | Santa Fe | 2020 | 5.0 | 12.0 | 13.0 |
7 | Santa Fe | 2020 | 6.0 | 12.0 | 13.0 |
8 | Cordoba | 2020 | NaN | 2.0 | 3.0 |
9 | Santa Fe | 2019 | NaN | 14.0 | 15.0 |
0 | Buenos Aires | 2020 | 0.0 | 4.0 | 5.0 |
1 | Buenos Aires | 2020 | 0.0 | 6.0 | 7.0 |
2 | Buenos Aires | 2021 | 1.0 | 8.0 | 9.0 |
3 | Buenos Aires | 2022 | 2.0 | 10.0 | 11.0 |
4 | Cordoba | 2021 | 3.0 | 0.0 | 1.0 |
5 | Santa Fe | 2020 | 5.0 | 12.0 | 13.0 |
6 | Santa Fe | 2020 | 6.0 | 12.0 | 13.0 |
a=pd.concat([c1,c2],axis=1)
a
key1 | key2 | data | event1 | event2 | key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Cordoba | 2021 | 3.0 | 0 | 1 | Buenos Aires | 2020.0 | 0.0 | 4.0 | 5.0 |
1 | Cordoba | 2020 | NaN | 2 | 3 | Buenos Aires | 2020.0 | 0.0 | 6.0 | 7.0 |
2 | Buenos Aires | 2020 | 0.0 | 4 | 5 | Buenos Aires | 2021.0 | 1.0 | 8.0 | 9.0 |
3 | Buenos Aires | 2020 | 0.0 | 6 | 7 | Buenos Aires | 2022.0 | 2.0 | 10.0 | 11.0 |
4 | Buenos Aires | 2021 | 1.0 | 8 | 9 | Cordoba | 2021.0 | 3.0 | 0.0 | 1.0 |
5 | Buenos Aires | 2022 | 2.0 | 10 | 11 | Cordoba | 2022.0 | 4.0 | NaN | NaN |
6 | Santa Fe | 2020 | 5.0 | 12 | 13 | Santa Fe | 2020.0 | 5.0 | 12.0 | 13.0 |
7 | Santa Fe | 2020 | 6.0 | 12 | 13 | Santa Fe | 2020.0 | 6.0 | 12.0 | 13.0 |
8 | Santa Fe | 2019 | NaN | 14 | 15 | NaN | NaN | NaN | NaN | NaN |
%cd '/content/gdrive/MyDrive/'
/content/gdrive/MyDrive
customers= pd.read_csv('customers.csv',sep=',')
customers.head()
CustomerID | CustomerName | ContactName | Address | City | PostalCode | Country | |
---|---|---|---|---|---|---|---|
0 | 1 | Alfreds Futterkiste | Maria Anders | Obere Str. 57 | Berlin | 12209 | Germany |
1 | 2 | Ana Trujillo Emparedados y helados | Ana Trujillo | Avda. de la Constitución 2222 | México D.F. | 5021 | Mexico |
2 | 3 | Antonio Moreno Taquería | Antonio Moreno | Mataderos 2312 | México D.F. | 5023 | Mexico |
3 | 4 | Around the Horn | Thomas Hardy | 120 Hanover Sq. | London | WA1 1DP | UK |
4 | 5 | Berglunds snabbköp | Christina Berglund | Berguvsvägen 8 | Luleå | S-958 22 | Sweden |
employees= pd.read_csv('employees.csv',sep=',')
employees.head()
EmployeeID | LastName | FirstName | BirthDate | Photo | Notes | |
---|---|---|---|---|---|---|
0 | 1 | Davolio | Nancy | 12/8/1968 | EmpID1.pic | Education includes a BA in psychology from Col... |
1 | 2 | Fuller | Andrew | 2/19/1952 | EmpID2.pic | Andrew received his BTS commercial and a Ph.D.... |
2 | 3 | Leverling | Janet | 8/30/1963 | EmpID3.pic | Janet has a BS degree in chemistry from Boston... |
3 | 4 | Peacock | Margaret | 9/19/1958 | EmpID4.pic | Margaret holds a BA in English literature from... |
4 | 5 | Buchanan | Steven | 3/4/1955 | EmpID5.pic | Steven Buchanan graduated from St. Andrews Uni... |
orders= pd.read_csv('orders.csv',sep=',')
orders.head()
OrderID | CustomerID | EmployeeID | OrderDate | ShipperID | |
---|---|---|---|---|---|
0 | 10248 | 90 | 5 | 7/4/1996 | 3 |
1 | 10249 | 81 | 6 | 7/5/1996 | 1 |
2 | 10250 | 34 | 4 | 7/8/1996 | 2 |
3 | 10251 | 84 | 3 | 7/8/1996 | 1 |
4 | 10252 | 76 | 4 | 7/9/1996 | 2 |
# Hacer el inner join
pd.merge(orders, employees, how='inner', on='EmployeeID').sort_values(by='OrderID')[['OrderID','LastName','FirstName']]
OrderID | LastName | FirstName | |
---|---|---|---|
0 | 10248 | Buchanan | Steven |
11 | 10249 | Suyama | Michael |
29 | 10250 | Peacock | Margaret |
69 | 10251 | Leverling | Janet |
30 | 10252 | Peacock | Margaret |
... | ... | ... | ... |
28 | 10439 | Suyama | Michael |
68 | 10440 | Peacock | Margaret |
98 | 10441 | Leverling | Janet |
99 | 10442 | Leverling | Janet |
161 | 10443 | Callahan | Laura |
196 rows × 3 columns