import pandas as pd
import numpy as np
import os
os.chdir(r"C:\Users\Gram\Desktop\myPyCode\01 Python 분석 기초 - 실습\data")
df = pd.read_csv('data_studentlist_en.csv', header='infer',encoding = 'latin1')
df.shape
(17, 8)
df.size
136
df.ndim
2
type(df)
pandas.core.frame.DataFrame
df.info() #데이터프레임의 구조 보여줌
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17 entries, 0 to 16 Data columns (total 8 columns): name 17 non-null object gender 17 non-null object age 17 non-null int64 grade 17 non-null int64 absence 17 non-null object bloodtype 17 non-null object height 17 non-null float64 weight 17 non-null float64 dtypes: float64(2), int64(2), object(4) memory usage: 1.2+ KB
df.describe() #기술통계 요약 보여줌
age | grade | height | weight | |
---|---|---|---|---|
count | 17.000000 | 17.000000 | 17.000000 | 17.000000 |
mean | 22.352941 | 2.294118 | 170.035294 | 61.064706 |
std | 1.169464 | 1.104802 | 7.849756 | 11.851685 |
min | 20.000000 | 1.000000 | 155.200000 | 45.200000 |
25% | 22.000000 | 1.000000 | 165.300000 | 53.000000 |
50% | 22.000000 | 2.000000 | 169.200000 | 61.300000 |
75% | 23.000000 | 3.000000 | 176.100000 | 68.200000 |
max | 24.000000 | 4.000000 | 182.100000 | 85.700000 |
df.head(3) #열이름 제외하고 상단에서 3행
name | gender | age | grade | absence | bloodtype | height | weight | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | 3 | Y | O | 165.3 | 68.2 |
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
df.tail(3) #열이름 제외하고 하단에서 3행
name | gender | age | grade | absence | bloodtype | height | weight | |
---|---|---|---|---|---|---|---|---|
14 | Linda Carter | F | 22 | 2 | N | B | 158.2 | 45.2 |
15 | Richard Swayze | M | 24 | 4 | Y | B | 168.6 | 70.2 |
16 | Andrew Daley | M | 21 | 1 | N | A | 169.2 | 62.2 |
df.columns #열 이름(헤더 이름)보기
Index(['name', 'gender', 'age', 'grade', 'absence', 'bloodtype', 'height', 'weight'], dtype='object')
df.columns = ['NAME', 'GENDER' , 'AGE', 'GRADE', 'ABSENCE', 'BLOODTYPE', 'HEIGHT', 'WEIGHT']
df.head(3) #열 이름(헤더 이름) 변경
#새로운 열 이름 리스트를 대입함
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | 3 | Y | O | 165.3 | 68.2 |
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
header = df.columns #열로 헤더이름 가져오기
df.index
RangeIndex(start=0, stop=17, step=1)
df.NAME # NAME 열 뽑기
0 Jared Diamond 1 Sarah O'Donnel 2 Brian Martin 3 David Hassel 4 Clara Rodriquez 5 Jennifer Lorentz 6 Susan Clark 7 Margareth Jones 8 John Bertsch 9 Jake Timmerman 10 Joshua Connor 11 John Matsuda 12 Eddy Johnson 13 Rebecah Anderson 14 Linda Carter 15 Richard Swayze 16 Andrew Daley Name: NAME, dtype: object
type(df.NAME) # 출력된 것은 시리즈이다.
pandas.core.series.Series
df['NAME'] #데이터프레임에서 시리즈 하나 가져옴
0 Jared Diamond 1 Sarah O'Donnel 2 Brian Martin 3 David Hassel 4 Clara Rodriquez 5 Jennifer Lorentz 6 Susan Clark 7 Margareth Jones 8 John Bertsch 9 Jake Timmerman 10 Joshua Connor 11 John Matsuda 12 Eddy Johnson 13 Rebecah Anderson 14 Linda Carter 15 Richard Swayze 16 Andrew Daley Name: NAME, dtype: object
df.loc[:,'NAME'] #콤마 왼쪽은 행, 오른쪽은 열
0 Jared Diamond 1 Sarah O'Donnel 2 Brian Martin 3 David Hassel 4 Clara Rodriquez 5 Jennifer Lorentz 6 Susan Clark 7 Margareth Jones 8 John Bertsch 9 Jake Timmerman 10 Joshua Connor 11 John Matsuda 12 Eddy Johnson 13 Rebecah Anderson 14 Linda Carter 15 Richard Swayze 16 Andrew Daley Name: NAME, dtype: object
df.loc[:,['NAME','GENDER']] #칼럼 'NAME'과 'GENDER' 뽑음
#콤마 왼쪽은 행, 오른쪽은 열
NAME | GENDER | |
---|---|---|
0 | Jared Diamond | M |
1 | Sarah O'Donnel | F |
2 | Brian Martin | M |
3 | David Hassel | M |
4 | Clara Rodriquez | F |
5 | Jennifer Lorentz | F |
6 | Susan Clark | F |
7 | Margareth Jones | F |
8 | John Bertsch | M |
9 | Jake Timmerman | M |
10 | Joshua Connor | M |
11 | John Matsuda | M |
12 | Eddy Johnson | M |
13 | Rebecah Anderson | F |
14 | Linda Carter | F |
15 | Richard Swayze | M |
16 | Andrew Daley | M |
df.loc[2] #2번째 행을 보여줌
NAME Brian Martin GENDER M AGE 24 GRADE 4 ABSENCE N BLOODTYPE B HEIGHT 175 WEIGHT 80.1 Name: 2, dtype: object
df.loc[2:4] #2~4번째 행을 보여줌
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 |
4 | Clara Rodriquez | F | 20 | 1 | Y | A | 168.0 | 49.5 |
df.drop(columns=['NAME','GENDER']) #칼럼 'NAME'과 'GENDER'를 제외한 나머지
#영구적으로 제외하고 싶으면 inplace=True해서 다시 대입
AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|
0 | 23 | 3 | Y | O | 165.3 | 68.2 |
1 | 22 | 2 | N | AB | 170.1 | 53.0 |
2 | 24 | 4 | N | B | 175.0 | 80.1 |
3 | 23 | 3 | N | AB | 182.1 | 85.7 |
4 | 20 | 1 | Y | A | 168.0 | 49.5 |
5 | 21 | 2 | N | O | 162.0 | 52.0 |
6 | 22 | 1 | N | O | 155.2 | 45.3 |
7 | 23 | 1 | N | A | 176.9 | 55.0 |
8 | 23 | 3 | N | B | 178.5 | 64.2 |
9 | 22 | 2 | N | B | 176.1 | 61.3 |
10 | 24 | 4 | Y | O | 167.1 | 62.0 |
11 | 22 | 2 | N | AB | 180.0 | 75.8 |
12 | 21 | 1 | N | A | 162.2 | 55.3 |
13 | 23 | 3 | N | O | 176.1 | 53.1 |
14 | 22 | 2 | N | B | 158.2 | 45.2 |
15 | 24 | 4 | Y | B | 168.6 | 70.2 |
16 | 21 | 1 | N | A | 169.2 | 62.2 |
df.loc[:, (header!='NAME') & (header!='GENDER')] #칼럼 'NAME'과 'GENDER'를 제외한 나머지
#영구적으로 제외되지 않음
AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|
0 | 23 | 3 | Y | O | 165.3 | 68.2 |
1 | 22 | 2 | N | AB | 170.1 | 53.0 |
2 | 24 | 4 | N | B | 175.0 | 80.1 |
3 | 23 | 3 | N | AB | 182.1 | 85.7 |
4 | 20 | 1 | Y | A | 168.0 | 49.5 |
5 | 21 | 2 | N | O | 162.0 | 52.0 |
6 | 22 | 1 | N | O | 155.2 | 45.3 |
7 | 23 | 1 | N | A | 176.9 | 55.0 |
8 | 23 | 3 | N | B | 178.5 | 64.2 |
9 | 22 | 2 | N | B | 176.1 | 61.3 |
10 | 24 | 4 | Y | O | 167.1 | 62.0 |
11 | 22 | 2 | N | AB | 180.0 | 75.8 |
12 | 21 | 1 | N | A | 162.2 | 55.3 |
13 | 23 | 3 | N | O | 176.1 | 53.1 |
14 | 22 | 2 | N | B | 158.2 | 45.2 |
15 | 24 | 4 | Y | B | 168.6 | 70.2 |
16 | 21 | 1 | N | A | 169.2 | 62.2 |
df.loc[:,(header =='NAME') | (header == 'GENDER')] #칼럼 'NAME'과 'GENDER'를 뽑아서 보여줌
NAME | GENDER | |
---|---|---|
0 | Jared Diamond | M |
1 | Sarah O'Donnel | F |
2 | Brian Martin | M |
3 | David Hassel | M |
4 | Clara Rodriquez | F |
5 | Jennifer Lorentz | F |
6 | Susan Clark | F |
7 | Margareth Jones | F |
8 | John Bertsch | M |
9 | Jake Timmerman | M |
10 | Joshua Connor | M |
11 | John Matsuda | M |
12 | Eddy Johnson | M |
13 | Rebecah Anderson | F |
14 | Linda Carter | F |
15 | Richard Swayze | M |
16 | Andrew Daley | M |
df[df.GENDER == 'M'] #성별이 M인 경우
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | 3 | Y | O | 165.3 | 68.2 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 |
8 | John Bertsch | M | 23 | 3 | N | B | 178.5 | 64.2 |
9 | Jake Timmerman | M | 22 | 2 | N | B | 176.1 | 61.3 |
10 | Joshua Connor | M | 24 | 4 | Y | O | 167.1 | 62.0 |
11 | John Matsuda | M | 22 | 2 | N | AB | 180.0 | 75.8 |
12 | Eddy Johnson | M | 21 | 1 | N | A | 162.2 | 55.3 |
15 | Richard Swayze | M | 24 | 4 | Y | B | 168.6 | 70.2 |
16 | Andrew Daley | M | 21 | 1 | N | A | 169.2 | 62.2 |
df[-(df.GENDER == 'M')] #성별이 M이 아닌 경우
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 |
4 | Clara Rodriquez | F | 20 | 1 | Y | A | 168.0 | 49.5 |
5 | Jennifer Lorentz | F | 21 | 2 | N | O | 162.0 | 52.0 |
6 | Susan Clark | F | 22 | 1 | N | O | 155.2 | 45.3 |
7 | Margareth Jones | F | 23 | 1 | N | A | 176.9 | 55.0 |
13 | Rebecah Anderson | F | 23 | 3 | N | O | 176.1 | 53.1 |
14 | Linda Carter | F | 22 | 2 | N | B | 158.2 | 45.2 |
df[df.HEIGHT > 170] #신장이 170이상인 경우
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 |
7 | Margareth Jones | F | 23 | 1 | N | A | 176.9 | 55.0 |
8 | John Bertsch | M | 23 | 3 | N | B | 178.5 | 64.2 |
9 | Jake Timmerman | M | 22 | 2 | N | B | 176.1 | 61.3 |
11 | John Matsuda | M | 22 | 2 | N | AB | 180.0 | 75.8 |
13 | Rebecah Anderson | F | 23 | 3 | N | O | 176.1 | 53.1 |
df[ (df.WEIGHT > 70) & (df.WEIGHT < 80)] #AND 조건
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
11 | John Matsuda | M | 22 | 2 | N | AB | 180.0 | 75.8 |
15 | Richard Swayze | M | 24 | 4 | Y | B | 168.6 | 70.2 |
df[ (df.GRADE == 1) | (df.GRADE == 4 )] #OR 조건
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
4 | Clara Rodriquez | F | 20 | 1 | Y | A | 168.0 | 49.5 |
6 | Susan Clark | F | 22 | 1 | N | O | 155.2 | 45.3 |
7 | Margareth Jones | F | 23 | 1 | N | A | 176.9 | 55.0 |
10 | Joshua Connor | M | 24 | 4 | Y | O | 167.1 | 62.0 |
12 | Eddy Johnson | M | 21 | 1 | N | A | 162.2 | 55.3 |
15 | Richard Swayze | M | 24 | 4 | Y | B | 168.6 | 70.2 |
16 | Andrew Daley | M | 21 | 1 | N | A | 169.2 | 62.2 |
df[ (df.GENDER == 'M') & ((df.HEIGHT < 160) | (df.HEIGHT > 180))] #AND와 OR 조건
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 |
df['BMI'] = 10000*df['WEIGHT']/df['HEIGHT']**2
df.head(5)
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | BMI | |
---|---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | 3 | Y | O | 165.3 | 68.2 | 24.959660 |
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 | 18.317544 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 | 26.155102 |
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 | 25.844071 |
4 | Clara Rodriquez | F | 20 | 1 | Y | A | 168.0 | 49.5 | 17.538265 |
df.drop('BMI',axis=1)
df.head(5)
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | BMI | |
---|---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | 3 | Y | O | 165.3 | 68.2 | 24.959660 |
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 | 18.317544 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 | 26.155102 |
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 | 25.844071 |
4 | Clara Rodriquez | F | 20 | 1 | Y | A | 168.0 | 49.5 | 17.538265 |
df.drop('BMI',axis=1,inplace=True)
df.head(5)
NAME | GENDER | AGE | GRADE | ABSENCE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | 3 | Y | O | 165.3 | 68.2 |
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 |
4 | Clara Rodriquez | F | 20 | 1 | Y | A | 168.0 | 49.5 |
df2 = df.drop(columns=['GRADE','ABSENCE'])
df2.to_csv('data_mine.csv',index=False)
df3 = pd.read_csv('data_mine.csv',encoding='latin1',header='infer')
df3.head(3)
NAME | GENDER | AGE | BLOODTYPE | HEIGHT | WEIGHT | |
---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | O | 165.3 | 68.2 |
1 | Sarah O'Donnel | F | 22 | AB | 170.1 | 53.0 |
2 | Brian Martin | M | 24 | B | 175.0 | 80.1 |
dfx = pd.read_excel('data_studentlist_en.xlsx', sheet_name='Sheet1')
dfx.head(5)
name | gender | age | grade | absence | bloodtype | height | weight | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | M | 23 | 3 | Y | O | 165.3 | 68.2 |
1 | Sarah O'Donnel | F | 22 | 2 | N | AB | 170.1 | 53.0 |
2 | Brian Martin | M | 24 | 4 | N | B | 175.0 | 80.1 |
3 | David Hassel | M | 23 | 3 | N | AB | 182.1 | 85.7 |
4 | Clara Rodriquez | F | 20 | 1 | Y | A | 168.0 | 49.5 |
dfx.to_excel('data_studentlist_en2.xlsx',sheet_name='NewSheet',index=False)
data = { 'NAME' : ['Jake', 'Jennifer', 'Paul', 'Andrew'], 'AGE': [24,21,25,19], 'GENDER':['M','F','M','M']}
df4 = pd.DataFrame(data)
df4 #리스트가 칼럼의 값으로 들어감
#칼럼의 길이(리스트의 길이)가 같아야 오류안남
NAME | AGE | GENDER | |
---|---|---|---|
0 | Jake | 24 | M |
1 | Jennifer | 21 | F |
2 | Paul | 25 | M |
3 | Andrew | 19 | M |
df5 = pd.DataFrame(np.random.rand(10,5), columns=['A','B','C','D','E']) #랜덤 데이터프레임
df5.head(3) #numpy배열로 데이터프레임 객체만듬
#10개의 행과 5개의 열이므로, 칼럼도 5개 잘 맞음
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 0.390709 | 0.019426 | 0.114121 | 0.797595 | 0.051237 |
1 | 0.489058 | 0.025332 | 0.429168 | 0.987400 | 0.408086 |
2 | 0.855244 | 0.558176 | 0.436755 | 0.738553 | 0.914112 |
df_left = df.loc[:,['NAME','AGE','GENDER','GRADE','ABSENCE']]
df_left
NAME | AGE | GENDER | GRADE | ABSENCE | |
---|---|---|---|---|---|
0 | Jared Diamond | 23 | M | 3 | Y |
1 | Sarah O'Donnel | 22 | F | 2 | N |
2 | Brian Martin | 24 | M | 4 | N |
3 | David Hassel | 23 | M | 3 | N |
4 | Clara Rodriquez | 20 | F | 1 | Y |
5 | Jennifer Lorentz | 21 | F | 2 | N |
6 | Susan Clark | 22 | F | 1 | N |
7 | Margareth Jones | 23 | F | 1 | N |
8 | John Bertsch | 23 | M | 3 | N |
9 | Jake Timmerman | 22 | M | 2 | N |
10 | Joshua Connor | 24 | M | 4 | Y |
11 | John Matsuda | 22 | M | 2 | N |
12 | Eddy Johnson | 21 | M | 1 | N |
13 | Rebecah Anderson | 23 | F | 3 | N |
14 | Linda Carter | 22 | F | 2 | N |
15 | Richard Swayze | 24 | M | 4 | Y |
16 | Andrew Daley | 21 | M | 1 | N |
df_right = df.loc[:,['NAME','BLOODTYPE','WEIGHT','HEIGHT']]
df_right
NAME | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|
0 | Jared Diamond | O | 68.2 | 165.3 |
1 | Sarah O'Donnel | AB | 53.0 | 170.1 |
2 | Brian Martin | B | 80.1 | 175.0 |
3 | David Hassel | AB | 85.7 | 182.1 |
4 | Clara Rodriquez | A | 49.5 | 168.0 |
5 | Jennifer Lorentz | O | 52.0 | 162.0 |
6 | Susan Clark | O | 45.3 | 155.2 |
7 | Margareth Jones | A | 55.0 | 176.9 |
8 | John Bertsch | B | 64.2 | 178.5 |
9 | Jake Timmerman | B | 61.3 | 176.1 |
10 | Joshua Connor | O | 62.0 | 167.1 |
11 | John Matsuda | AB | 75.8 | 180.0 |
12 | Eddy Johnson | A | 55.3 | 162.2 |
13 | Rebecah Anderson | O | 53.1 | 176.1 |
14 | Linda Carter | B | 45.2 | 158.2 |
15 | Richard Swayze | B | 70.2 | 168.6 |
16 | Andrew Daley | A | 62.2 | 169.2 |
pd.merge(df_left,df_right,on='NAME')
NAME | AGE | GENDER | GRADE | ABSENCE | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | 23 | M | 3 | Y | O | 68.2 | 165.3 |
1 | Sarah O'Donnel | 22 | F | 2 | N | AB | 53.0 | 170.1 |
2 | Brian Martin | 24 | M | 4 | N | B | 80.1 | 175.0 |
3 | David Hassel | 23 | M | 3 | N | AB | 85.7 | 182.1 |
4 | Clara Rodriquez | 20 | F | 1 | Y | A | 49.5 | 168.0 |
5 | Jennifer Lorentz | 21 | F | 2 | N | O | 52.0 | 162.0 |
6 | Susan Clark | 22 | F | 1 | N | O | 45.3 | 155.2 |
7 | Margareth Jones | 23 | F | 1 | N | A | 55.0 | 176.9 |
8 | John Bertsch | 23 | M | 3 | N | B | 64.2 | 178.5 |
9 | Jake Timmerman | 22 | M | 2 | N | B | 61.3 | 176.1 |
10 | Joshua Connor | 24 | M | 4 | Y | O | 62.0 | 167.1 |
11 | John Matsuda | 22 | M | 2 | N | AB | 75.8 | 180.0 |
12 | Eddy Johnson | 21 | M | 1 | N | A | 55.3 | 162.2 |
13 | Rebecah Anderson | 23 | F | 3 | N | O | 53.1 | 176.1 |
14 | Linda Carter | 22 | F | 2 | N | B | 45.2 | 158.2 |
15 | Richard Swayze | 24 | M | 4 | Y | B | 70.2 | 168.6 |
16 | Andrew Daley | 21 | M | 1 | N | A | 62.2 | 169.2 |
df_left_small = df_left.loc[:10,]
df_left_small
NAME | AGE | GENDER | GRADE | ABSENCE | |
---|---|---|---|---|---|
0 | Jared Diamond | 23 | M | 3 | Y |
1 | Sarah O'Donnel | 22 | F | 2 | N |
2 | Brian Martin | 24 | M | 4 | N |
3 | David Hassel | 23 | M | 3 | N |
4 | Clara Rodriquez | 20 | F | 1 | Y |
5 | Jennifer Lorentz | 21 | F | 2 | N |
6 | Susan Clark | 22 | F | 1 | N |
7 | Margareth Jones | 23 | F | 1 | N |
8 | John Bertsch | 23 | M | 3 | N |
9 | Jake Timmerman | 22 | M | 2 | N |
10 | Joshua Connor | 24 | M | 4 | Y |
df_right_small = df_right.loc[7:,]
df_right_small
NAME | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|
7 | Margareth Jones | A | 55.0 | 176.9 |
8 | John Bertsch | B | 64.2 | 178.5 |
9 | Jake Timmerman | B | 61.3 | 176.1 |
10 | Joshua Connor | O | 62.0 | 167.1 |
11 | John Matsuda | AB | 75.8 | 180.0 |
12 | Eddy Johnson | A | 55.3 | 162.2 |
13 | Rebecah Anderson | O | 53.1 | 176.1 |
14 | Linda Carter | B | 45.2 | 158.2 |
15 | Richard Swayze | B | 70.2 | 168.6 |
16 | Andrew Daley | A | 62.2 | 169.2 |
1. 양쪽 온전하게 일치하는 경우 데이터 프레임 결합하기 (Inner Join): 교집합
pd.merge(df_left_small,df_right_small,left_on='NAME', right_on = 'NAME', how='inner')
NAME | AGE | GENDER | GRADE | ABSENCE | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Margareth Jones | 23 | F | 1 | N | A | 55.0 | 176.9 |
1 | John Bertsch | 23 | M | 3 | N | B | 64.2 | 178.5 |
2 | Jake Timmerman | 22 | M | 2 | N | B | 61.3 | 176.1 |
3 | Joshua Connor | 24 | M | 4 | Y | O | 62.0 | 167.1 |
2. 왼쪽 기준으로 데이터 프레임 결합하기 (Left Join): 왼쪽에서 다 선택, 오른쪽은 공통값만
pd.merge(df_left_small,df_right_small,left_on='NAME', right_on = 'NAME', how='left')
NAME | AGE | GENDER | GRADE | ABSENCE | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | 23 | M | 3 | Y | NaN | NaN | NaN |
1 | Sarah O'Donnel | 22 | F | 2 | N | NaN | NaN | NaN |
2 | Brian Martin | 24 | M | 4 | N | NaN | NaN | NaN |
3 | David Hassel | 23 | M | 3 | N | NaN | NaN | NaN |
4 | Clara Rodriquez | 20 | F | 1 | Y | NaN | NaN | NaN |
5 | Jennifer Lorentz | 21 | F | 2 | N | NaN | NaN | NaN |
6 | Susan Clark | 22 | F | 1 | N | NaN | NaN | NaN |
7 | Margareth Jones | 23 | F | 1 | N | A | 55.0 | 176.9 |
8 | John Bertsch | 23 | M | 3 | N | B | 64.2 | 178.5 |
9 | Jake Timmerman | 22 | M | 2 | N | B | 61.3 | 176.1 |
10 | Joshua Connor | 24 | M | 4 | Y | O | 62.0 | 167.1 |
3. 오른쪽 기준으로 데이터 프레임 결합하기 (Right Join): 오른쪽에서 다 선택, 왼쪽은 공통값만
pd.merge(df_left_small,df_right_small,left_on='NAME', right_on = 'NAME', how='right')
NAME | AGE | GENDER | GRADE | ABSENCE | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Margareth Jones | 23.0 | F | 1.0 | N | A | 55.0 | 176.9 |
1 | John Bertsch | 23.0 | M | 3.0 | N | B | 64.2 | 178.5 |
2 | Jake Timmerman | 22.0 | M | 2.0 | N | B | 61.3 | 176.1 |
3 | Joshua Connor | 24.0 | M | 4.0 | Y | O | 62.0 | 167.1 |
4 | John Matsuda | NaN | NaN | NaN | NaN | AB | 75.8 | 180.0 |
5 | Eddy Johnson | NaN | NaN | NaN | NaN | A | 55.3 | 162.2 |
6 | Rebecah Anderson | NaN | NaN | NaN | NaN | O | 53.1 | 176.1 |
7 | Linda Carter | NaN | NaN | NaN | NaN | B | 45.2 | 158.2 |
8 | Richard Swayze | NaN | NaN | NaN | NaN | B | 70.2 | 168.6 |
9 | Andrew Daley | NaN | NaN | NaN | NaN | A | 62.2 | 169.2 |
4. 좌우 결측치 상관없이 데이터 프레임 결합하기 (Outer Join):
pd.merge(df_left_small,df_right_small,left_on='NAME', right_on = 'NAME', how='outer')
NAME | AGE | GENDER | GRADE | ABSENCE | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | 23.0 | M | 3.0 | Y | NaN | NaN | NaN |
1 | Sarah O'Donnel | 22.0 | F | 2.0 | N | NaN | NaN | NaN |
2 | Brian Martin | 24.0 | M | 4.0 | N | NaN | NaN | NaN |
3 | David Hassel | 23.0 | M | 3.0 | N | NaN | NaN | NaN |
4 | Clara Rodriquez | 20.0 | F | 1.0 | Y | NaN | NaN | NaN |
5 | Jennifer Lorentz | 21.0 | F | 2.0 | N | NaN | NaN | NaN |
6 | Susan Clark | 22.0 | F | 1.0 | N | NaN | NaN | NaN |
7 | Margareth Jones | 23.0 | F | 1.0 | N | A | 55.0 | 176.9 |
8 | John Bertsch | 23.0 | M | 3.0 | N | B | 64.2 | 178.5 |
9 | Jake Timmerman | 22.0 | M | 2.0 | N | B | 61.3 | 176.1 |
10 | Joshua Connor | 24.0 | M | 4.0 | Y | O | 62.0 | 167.1 |
11 | John Matsuda | NaN | NaN | NaN | NaN | AB | 75.8 | 180.0 |
12 | Eddy Johnson | NaN | NaN | NaN | NaN | A | 55.3 | 162.2 |
13 | Rebecah Anderson | NaN | NaN | NaN | NaN | O | 53.1 | 176.1 |
14 | Linda Carter | NaN | NaN | NaN | NaN | B | 45.2 | 158.2 |
15 | Richard Swayze | NaN | NaN | NaN | NaN | B | 70.2 | 168.6 |
16 | Andrew Daley | NaN | NaN | NaN | NaN | A | 62.2 | 169.2 |
# columns로 매칭.
pd.concat([df_left_small,df_right_small],sort=True,ignore_index=True)
ABSENCE | AGE | BLOODTYPE | GENDER | GRADE | HEIGHT | NAME | WEIGHT | |
---|---|---|---|---|---|---|---|---|
0 | Y | 23.0 | NaN | M | 3.0 | NaN | Jared Diamond | NaN |
1 | N | 22.0 | NaN | F | 2.0 | NaN | Sarah O'Donnel | NaN |
2 | N | 24.0 | NaN | M | 4.0 | NaN | Brian Martin | NaN |
3 | N | 23.0 | NaN | M | 3.0 | NaN | David Hassel | NaN |
4 | Y | 20.0 | NaN | F | 1.0 | NaN | Clara Rodriquez | NaN |
5 | N | 21.0 | NaN | F | 2.0 | NaN | Jennifer Lorentz | NaN |
6 | N | 22.0 | NaN | F | 1.0 | NaN | Susan Clark | NaN |
7 | N | 23.0 | NaN | F | 1.0 | NaN | Margareth Jones | NaN |
8 | N | 23.0 | NaN | M | 3.0 | NaN | John Bertsch | NaN |
9 | N | 22.0 | NaN | M | 2.0 | NaN | Jake Timmerman | NaN |
10 | Y | 24.0 | NaN | M | 4.0 | NaN | Joshua Connor | NaN |
11 | NaN | NaN | A | NaN | NaN | 176.9 | Margareth Jones | 55.0 |
12 | NaN | NaN | B | NaN | NaN | 178.5 | John Bertsch | 64.2 |
13 | NaN | NaN | B | NaN | NaN | 176.1 | Jake Timmerman | 61.3 |
14 | NaN | NaN | O | NaN | NaN | 167.1 | Joshua Connor | 62.0 |
15 | NaN | NaN | AB | NaN | NaN | 180.0 | John Matsuda | 75.8 |
16 | NaN | NaN | A | NaN | NaN | 162.2 | Eddy Johnson | 55.3 |
17 | NaN | NaN | O | NaN | NaN | 176.1 | Rebecah Anderson | 53.1 |
18 | NaN | NaN | B | NaN | NaN | 158.2 | Linda Carter | 45.2 |
19 | NaN | NaN | B | NaN | NaN | 168.6 | Richard Swayze | 70.2 |
20 | NaN | NaN | A | NaN | NaN | 169.2 | Andrew Daley | 62.2 |
# index로 매칭.
pd.concat([df_left_small,df_right_small],axis=1,sort=True,ignore_index=False)
NAME | AGE | GENDER | GRADE | ABSENCE | NAME | BLOODTYPE | WEIGHT | HEIGHT | |
---|---|---|---|---|---|---|---|---|---|
0 | Jared Diamond | 23.0 | M | 3.0 | Y | NaN | NaN | NaN | NaN |
1 | Sarah O'Donnel | 22.0 | F | 2.0 | N | NaN | NaN | NaN | NaN |
2 | Brian Martin | 24.0 | M | 4.0 | N | NaN | NaN | NaN | NaN |
3 | David Hassel | 23.0 | M | 3.0 | N | NaN | NaN | NaN | NaN |
4 | Clara Rodriquez | 20.0 | F | 1.0 | Y | NaN | NaN | NaN | NaN |
5 | Jennifer Lorentz | 21.0 | F | 2.0 | N | NaN | NaN | NaN | NaN |
6 | Susan Clark | 22.0 | F | 1.0 | N | NaN | NaN | NaN | NaN |
7 | Margareth Jones | 23.0 | F | 1.0 | N | Margareth Jones | A | 55.0 | 176.9 |
8 | John Bertsch | 23.0 | M | 3.0 | N | John Bertsch | B | 64.2 | 178.5 |
9 | Jake Timmerman | 22.0 | M | 2.0 | N | Jake Timmerman | B | 61.3 | 176.1 |
10 | Joshua Connor | 24.0 | M | 4.0 | Y | Joshua Connor | O | 62.0 | 167.1 |
11 | NaN | NaN | NaN | NaN | NaN | John Matsuda | AB | 75.8 | 180.0 |
12 | NaN | NaN | NaN | NaN | NaN | Eddy Johnson | A | 55.3 | 162.2 |
13 | NaN | NaN | NaN | NaN | NaN | Rebecah Anderson | O | 53.1 | 176.1 |
14 | NaN | NaN | NaN | NaN | NaN | Linda Carter | B | 45.2 | 158.2 |
15 | NaN | NaN | NaN | NaN | NaN | Richard Swayze | B | 70.2 | 168.6 |
16 | NaN | NaN | NaN | NaN | NaN | Andrew Daley | A | 62.2 | 169.2 |