15 minute read

Pandas (2)

import numpy as np
import pandas as pd

5. Sorting and Ranking

obj = pd.Series(np.arange(4), index=['d', 'a', 'b', 'c'])
obj
d    0
a    1
b    2
c    3
dtype: int64
obj.sort_index()
a    1
b    2
c    3
d    0
dtype: int64
obj.sort_values()
d    0
a    1
b    2
c    3
dtype: int64
frame = pd.DataFrame(np.arange(8).reshape(2, 4),
                     index = ['three', 'one'],
                     columns = ['d', 'a', 'b', 'c'])
frame
d a b c
three 0 1 2 3
one 4 5 6 7
frame.sort_index() # axis=0 (default): sorts by the row index; axis=1: sorts by the columns
d a b c
one 4 5 6 7
three 0 1 2 3
frame.sort_index(axis='columns') # same as axis=1: sorts the columns
a b c d
three 1 2 3 0
one 5 6 7 4
frame = pd.DataFrame({'b' : [4, 7, -3, 2], 'c' : [0, 1, 0, 1]})
frame
b c
0 4 0
1 7 1
2 -3 0
3 2 1
frame.sort_values(by='b') # axis=0 (default): sort along the row axis
b c
2 -3 0
3 2 1
0 4 0
1 7 1
frame.sort_values(by='b', axis=0, ascending=False)
b c
1 7 1
0 4 0
3 2 1
2 -3 0
# sort hierarchically: first by 'c', then by 'b'
frame.sort_values(by=['c', 'b'], axis=0)
b c
2 -3 0
0 4 0
3 2 1
1 7 1
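Closely related to sorting is ranking; as a minimal sketch (with illustrative data, not taken from the frames above), Series.rank() assigns each value its rank and averages ties by default:
s = pd.Series([7, -5, 7, 4, 2, 0, 4])
s.rank()                  # the two 7s share rank 6.5, the two 4s share rank 4.5
s.rank(method='first')    # break ties by order of appearance
s.rank(ascending=False)   # rank from largest to smallest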

6. Computing Descriptive Statistics and Summaries

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df.sum() # axis=0: sum along the row axis (one total per column)
one    9.25
two   -5.80
dtype: float64
df.sum(axis=1)
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
df.sum(axis=1, skipna=False)
a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64
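Other reductions follow the same pattern; a quick sketch with mean(), which likewise skips NaN unless skipna=False:
df.mean(axis=1)                 # per-row mean over the non-null values
df.mean(axis=1, skipna=False)   # any row containing NaN yields NaN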
# numpy : argmax(), argmin() return the position of the maximum / minimum value
# pandas : idxmax(), idxmin() return the index label of the maximum / minimum value
df
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df.idxmax() # index label of the maximum, computed along the row axis
one    b
two    d
dtype: object
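For comparison, the NumPy functions mentioned above return integer positions rather than index labels (a small sketch):
arr = np.array([1.40, 7.10, 0.75])   # the non-null values of column 'one'
arr.argmax()                         # 1, a position; df['one'].idxmax() returns the label 'b'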
df.cumsum() # cumulative sums computed along the row axis
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
df.describe()
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object
obj.describe() # different result from describe() on numeric data
               # for categorical data, describe() reports the number of unique values (unique)
               # and the most frequent value together with its frequency (top, freq)
count     16
unique     3
top        a
freq       8
dtype: object
import seaborn as sns
df = sns.load_dataset('titanic')
df = df[['survived', 'pclass', 'sex', 'age']]
df.describe()
survived pclass age
count 891.000000 891.000000 714.000000
mean 0.383838 2.308642 29.699118
std 0.486592 0.836071 14.526497
min 0.000000 1.000000 0.420000
25% 0.000000 2.000000 20.125000
50% 0.000000 3.000000 28.000000
75% 1.000000 3.000000 38.000000
max 1.000000 3.000000 80.000000
df['sex'].describe()
count      891
unique       2
top       male
freq       577
Name: sex, dtype: object
df.describe(include='all')
survived pclass sex age
count 891.000000 891.000000 891 714.000000
unique NaN NaN 2 NaN
top NaN NaN male NaN
freq NaN NaN 577 NaN
mean 0.383838 2.308642 NaN 29.699118
std 0.486592 0.836071 NaN 14.526497
min 0.000000 1.000000 NaN 0.420000
25% 0.000000 2.000000 NaN 20.125000
50% 0.000000 3.000000 NaN 28.000000
75% 1.000000 3.000000 NaN 38.000000
max 1.000000 3.000000 NaN 80.000000

6.1 Correlation and Covariance

df = pd.DataFrame({"math" : [50, 60, 40, 30, 70, 50], "physics" : [40, 60, 50, 20, 80, 50]})
df
math physics
0 50 40
1 60 60
2 40 50
3 30 20
4 70 80
5 50 50
  • Scatter plot
df.plot(kind='scatter', x='math', y='physics')
<matplotlib.axes._subplots.AxesSubplot at 0x7f1b174222b0>

[scatter plot: math vs. physics]

  • Correlation coefficient
df.corr()
math physics
math 1.000000 0.919239
physics 0.919239 1.000000
# bike-sharing dataset (bike_train.csv)
bike = pd.read_csv('./datasets/bike_train.csv')
bike.corr()
season holiday workingday weather temp atemp humidity windspeed casual registered count
season 1.000000 0.029368 -0.008126 0.008879 0.258689 0.264744 0.190610 -0.147121 0.096758 0.164011 0.163439
holiday 0.029368 1.000000 -0.250491 -0.007074 0.000295 -0.005215 0.001929 0.008409 0.043799 -0.020956 -0.005393
workingday -0.008126 -0.250491 1.000000 0.033772 0.029966 0.024660 -0.010880 0.013373 -0.319111 0.119460 0.011594
weather 0.008879 -0.007074 0.033772 1.000000 -0.055035 -0.055376 0.406244 0.007261 -0.135918 -0.109340 -0.128655
temp 0.258689 0.000295 0.029966 -0.055035 1.000000 0.984948 -0.064949 -0.017852 0.467097 0.318571 0.394454
atemp 0.264744 -0.005215 0.024660 -0.055376 0.984948 1.000000 -0.043536 -0.057473 0.462067 0.314635 0.389784
humidity 0.190610 0.001929 -0.010880 0.406244 -0.064949 -0.043536 1.000000 -0.318607 -0.348187 -0.265458 -0.317371
windspeed -0.147121 0.008409 0.013373 0.007261 -0.017852 -0.057473 -0.318607 1.000000 0.092276 0.091052 0.101369
casual 0.096758 0.043799 -0.319111 -0.135918 0.467097 0.462067 -0.348187 0.092276 1.000000 0.497250 0.690414
registered 0.164011 -0.020956 0.119460 -0.109340 0.318571 0.314635 -0.265458 0.091052 0.497250 1.000000 0.970948
count 0.163439 -0.005393 0.011594 -0.128655 0.394454 0.389784 -0.317371 0.101369 0.690414 0.970948 1.000000
bike['count'].corr(bike['temp']) # correlation coefficient
0.39445364496724905
bike['count'].cov(bike['temp']) # covariance
556.7331709807349
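The two quantities are directly related: the Pearson correlation is the covariance divided by the product of the two standard deviations, so computing it by hand reproduces corr() (a sketch):
bike['count'].cov(bike['temp']) / (bike['count'].std() * bike['temp'].std())   # ≈ 0.3944, matching corr() above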
bike.corrwith(bike['count'])
season        0.163439
holiday      -0.005393
workingday    0.011594
weather      -0.128655
temp          0.394454
atemp         0.389784
humidity     -0.317371
windspeed     0.101369
casual        0.690414
registered    0.970948
count         1.000000
dtype: float64
bike.corrwith(bike['count']).sort_values(ascending=False)
count         1.000000
registered    0.970948
casual        0.690414
temp          0.394454
atemp         0.389784
season        0.163439
windspeed     0.101369
workingday    0.011594
holiday      -0.005393
weather      -0.128655
humidity     -0.317371
dtype: float64
bike.plot(kind='scatter', x='temp', y='count', alpha=0.3)
<matplotlib.axes._subplots.AxesSubplot at 0x7f1b14c2deb0>

[scatter plot: temp vs. count]

6.2 Summarizing Categorical Data

import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
titanic.describe(include='object')
sex embarked who embark_town alive
count 891 889 891 889 891
unique 2 3 3 3 2
top male S man Southampton no
freq 577 644 537 644 549
titanic['embark_town'].unique() # the unique values
array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)
titanic['embark_town'].nunique() # the number of unique values
3
titanic['embark_town'].value_counts()
Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64
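value_counts can also report relative frequencies via its normalize option (a sketch):
titanic['embark_town'].value_counts(normalize=True)   # proportions instead of raw counts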
titanic['embark_town'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f1b0bc46f10>

[histogram of embark_town]

7. Data Loading and Storage

  • Reading data in text format
df = pd.read_csv('./examples/ex1.csv')
df
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
df.columns
Index(['a', 'b', 'c', 'd', 'message'], dtype='object')
df = pd.read_table('./examples/ex1.csv', sep=',')
df
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
df = pd.read_csv('./examples/ex2.csv', header=None)
df
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
names = ['a', 'b', 'c', 'd', 'message']
df = pd.read_csv('./examples/ex2.csv', names=names)
df
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
df.set_index('message')
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
# the column naming and index setting from above, combined into one read_csv call
names = ['a', 'b', 'c', 'd', 'message']
df = pd.read_csv('./examples/ex2.csv', names=names, index_col='message')
df
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
# same as above, this time specifying the index column by position
names = ['a', 'b', 'c', 'd', 'message']
df = pd.read_csv('./examples/ex2.csv', names=names, index_col=4)
df
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
# !cat examples/ex3.txt
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
# !type examples/ex3.txt
/bin/bash: line 0: type: examples/ex3.txt: not found
lst = []
with open('examples/ex3.txt') as f:
  for line in f:
    lst.append(line)
lst    
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
df = pd.read_table('./examples/ex3.txt', sep='\s+') # \s+ : regular expression matching one or more whitespace characters
df
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
  • Reading large files in pieces
pd.options.display.max_rows = 10 # limit how many rows are displayed in the output
result = pd.read_csv('./examples/ex6.csv')
result
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
... ... ... ... ... ...
9995 2.311896 -0.417070 -1.409599 -0.515821 L
9996 -0.479893 -0.650419 0.745152 -0.646038 E
9997 0.523331 0.787112 0.486066 1.093156 K
9998 -0.362559 0.598894 -1.843201 0.887292 G
9999 -0.096376 -1.012999 -0.657431 -0.573315 0

10000 rows × 5 columns
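When only a quick look at a large file is needed, the nrows option reads just the first few rows (a sketch):
pd.read_csv('./examples/ex6.csv', nrows=5)   # read only the first 5 rows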

chunker = pd.read_csv('./examples/ex6.csv', chunksize=1000)
for piece in chunker:
  print(piece.head(2))
        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
           one       two     three      four key
1000  0.467976 -0.038649 -0.295344 -1.824726   T
1001 -0.358893  1.404453  0.704965 -0.200638   J
           one       two     three      four key
2000  0.467976 -0.038649 -0.295344 -1.824726   1
2001 -0.358893  1.404453  0.704965 -0.200638   H
           one       two     three      four key
3000  0.467976 -0.038649 -0.295344 -1.824726   H
3001 -0.358893  1.404453  0.704965 -0.200638   Y
           one       two     three      four key
4000  0.467976 -0.038649 -0.295344 -1.824726   H
4001 -0.358893  1.404453  0.704965 -0.200638   Z
           one       two     three      four key
5000  0.467976 -0.038649 -0.295344 -1.824726   1
5001 -0.358893  1.404453  0.704965 -0.200638   Z
           one       two     three      four key
6000  0.467976 -0.038649 -0.295344 -1.824726   I
6001 -0.358893  1.404453  0.704965 -0.200638   X
           one       two     three      four key
7000  0.467976 -0.038649 -0.295344 -1.824726   1
7001 -0.358893  1.404453  0.704965 -0.200638   I
           one       two     three      four key
8000  0.467976 -0.038649 -0.295344 -1.824726   7
8001 -0.358893  1.404453  0.704965 -0.200638   W
           one       two     three      four key
9000  0.467976 -0.038649 -0.295344 -1.824726   B
9001 -0.358893  1.404453  0.704965 -0.200638   M
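The pieces can also be combined into a single aggregate so the whole file never has to be in memory at once; a sketch that tallies the 'key' column across chunks:
chunker = pd.read_csv('./examples/ex6.csv', chunksize=1000)
tot = pd.Series([], dtype='float64')
for piece in chunker:
  tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.sort_values(ascending=False)
tot[:10]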
  • Writing to text format
data = pd.read_csv('./examples/ex5.csv')
data
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
data.to_csv('./examples/out.csv') # save with both the index and the column headers
data.to_csv('./examples/out2.csv', index=False) # save without the index
data.to_csv('./examples/out3.csv', index=False, header=False) # save without the index or the column headers
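to_csv accepts further options, for example a different delimiter and a placeholder for missing values (a sketch; out4.csv is an illustrative path):
data.to_csv('./examples/out4.csv', sep='|', na_rep='NULL')   # '|'-separated, NaN written as 'NULL'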
  • Reading JSON files
data = pd.read_json('./examples/example.json')
data
a b c
0 1 2 3
1 4 5 6
2 7 8 9
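Writing works symmetrically with to_json (a sketch; the output path is illustrative):
data.to_json('./examples/out.json')   # serialize the DataFrame back to JSON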
  • Reading Excel files
frame = pd.read_excel('./examples/ex1.xlsx')
frame
Unnamed: 0 a b c d message
0 0 1 2 3 4 hello
1 1 5 6 7 8 world
2 2 9 10 11 12 foo
frame.to_excel('./examples/ex2.xlsx', index=False)
frame2 = pd.read_excel('./examples/ex2.xlsx')
frame2
Unnamed: 0 a b c d message
0 0 1 2 3 4 hello
1 1 5 6 7 8 world
2 2 9 10 11 12 foo
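The 'Unnamed: 0' column is the row index that was stored in ex1.xlsx; passing index_col=0 when reading treats that first column as the index instead (a sketch):
frame = pd.read_excel('./examples/ex1.xlsx', index_col=0)   # no extra 'Unnamed: 0' column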

8. Data Cleaning and Preparation

8.1 Handling Missing Data

  • Detecting and dropping missing values
string_data = pd.Series(['abc', 'def', np.nan, 'ghi'])
string_data
0    abc
1    def
2    NaN
3    ghi
dtype: object
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
pd.isnull(string_data)
0    False
1    False
2     True
3    False
dtype: bool
type(np.nan)
float
type(None)
NoneType
string_data = pd.Series(['abc', 'def', None, 'ghi'])
string_data
0     abc
1     def
2    None
3     ghi
dtype: object
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
data.isnull()
0    False
1     True
2    False
3     True
4    False
dtype: bool
data[data.isnull()] # boolean indexing
1   NaN
3   NaN
dtype: float64
# to drop the null rows, find their index labels and pass them to drop
data.drop([1, 3])
0    1.0
2    3.5
4    7.0
dtype: float64
null_idx = data[data.isnull()].index
data.drop(null_idx) # axis=0 (default), so the rows at null_idx are dropped
0    1.0
2    3.5
4    7.0
dtype: float64
# dropna drops the null rows directly
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
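dropna on a Series is equivalent to selecting the non-null entries with boolean indexing (a sketch):
data[data.notnull()]   # same result as data.dropna()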
data = pd.DataFrame([[1.0, 6.5, 3.0],
                     [1.0, np.nan, np.nan],
                     [np.nan, np.nan, np.nan],
                     [np.nan, 6.0, 3.0]])
data
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.0 3.0
data.dropna() # axis=0 is the default, so rows are dropped
              # how='any' is the default, so a row is dropped if it contains even one null
0 1 2
0 1.0 6.5 3.0
# same result as above
data.dropna(axis=0, how='any')
0 1 2
0 1.0 6.5 3.0
data[100] = np.nan
data
0 1 2 100
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.0 3.0 NaN
data.dropna(axis=1) # axis=1, so columns are dropped
                    # how='any' is the default, so a column is dropped if it contains even one null
0
1
2
3
# same result as above
data.dropna(axis=1, how='any')
0
1
2
3
data.dropna(axis=1, how='all') # axis=1, so columns are dropped
                               # how='all', so a column is dropped only if all of its values are null
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.0 3.0
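The how='all' rule applies row-wise as well; with the default axis=0 only rows whose values are all null are dropped (a sketch):
data.dropna(how='all')   # drops only row 2, whose values are all NaN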
df = pd.DataFrame(np.random.randn(7, 3))
df
0 1 2
0 1.576235 1.580954 0.137933
1 -0.078512 -0.616893 -0.397091
2 -1.607066 0.315902 1.822990
3 1.101237 0.075200 0.968919
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df
0 1 2
0 1.576235 NaN NaN
1 -0.078512 NaN NaN
2 -1.607066 NaN 1.822990
3 1.101237 NaN 0.968919
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
df.dropna() # axis=0, how='any'
0 1 2
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
df.dropna(axis=1)
0
0 1.576235
1 -0.078512
2 -1.607066
3 1.101237
4 0.113288
5 0.681725
6 -0.404688
df.dropna(axis=1, thresh=5) # thresh=5 : drop columns with fewer than 5 non-null values
0 2
0 1.576235 NaN
1 -0.078512 NaN
2 -1.607066 1.822990
3 1.101237 0.968919
4 0.113288 -0.050132
5 0.681725 -1.902540
6 -0.404688 0.691823
df
0 1 2
0 1.576235 NaN NaN
1 -0.078512 NaN NaN
2 -1.607066 NaN 1.822990
3 1.101237 NaN 0.968919
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
  • Filling in missing values
df
0 1 2
0 1.576235 NaN NaN
1 -0.078512 NaN NaN
2 -1.607066 NaN 1.822990
3 1.101237 NaN 0.968919
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
df.fillna(0)
0 1 2
0 1.576235 0.000000 0.000000
1 -0.078512 0.000000 0.000000
2 -1.607066 0.000000 1.822990
3 1.101237 0.000000 0.968919
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
df.fillna({1:0, 2:0.5})
0 1 2
0 1.576235 0.000000 0.500000
1 -0.078512 0.000000 0.500000
2 -1.607066 0.000000 1.822990
3 1.101237 0.000000 0.968919
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
df.fillna({1:0, 2:0.5}, inplace=True)
df
0 1 2
0 1.576235 0.000000 0.500000
1 -0.078512 0.000000 0.500000
2 -1.607066 0.000000 1.822990
3 1.101237 0.000000 0.968919
4 0.113288 1.684596 -0.050132
5 0.681725 -0.350161 -1.902540
6 -0.404688 0.052319 0.691823
df = pd.DataFrame(np.random.randn(6, 3))
df
0 1 2
0 -0.636706 0.377941 0.021031
1 1.266285 -0.817885 1.025722
2 -0.526124 -0.261350 -0.509566
3 1.239667 1.088629 1.037033
4 0.900286 0.029285 -0.889282
5 -0.131138 -1.700356 -0.269322
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df
0 1 2
0 -0.636706 0.377941 0.021031
1 1.266285 -0.817885 1.025722
2 -0.526124 NaN -0.509566
3 1.239667 NaN 1.037033
4 0.900286 NaN NaN
5 -0.131138 NaN NaN
df.fillna(axis=0, method='ffill') # axis=0: fill missing values along the row axis
                                  # carrying the last valid value forward (downward)
0 1 2
0 -0.636706 0.377941 0.021031
1 1.266285 -0.817885 1.025722
2 -0.526124 -0.817885 -0.509566
3 1.239667 -0.817885 1.037033
4 0.900286 -0.817885 1.037033
5 -0.131138 -0.817885 1.037033
df.fillna(axis=1, method='ffill') # axis=1: fill missing values along the column axis
                                  # carrying the last valid value forward (to the right)
0 1 2
0 -0.636706 0.377941 0.021031
1 1.266285 -0.817885 1.025722
2 -0.526124 -0.526124 -0.509566
3 1.239667 1.239667 1.037033
4 0.900286 0.900286 0.900286
5 -0.131138 -0.131138 -0.131138
df.fillna(axis=0, method='ffill', limit=2)
0 1 2
0 -0.636706 0.377941 0.021031
1 1.266285 -0.817885 1.025722
2 -0.526124 -0.817885 -0.509566
3 1.239667 -0.817885 1.037033
4 0.900286 NaN 1.037033
5 -0.131138 NaN 1.037033
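A common variant is to fill with a statistic computed from the data itself, such as each column's mean (a sketch):
df.fillna(df.mean())   # fill each column's NaN with that column's mean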

Reference

Python for Data Analysis by Wes McKinney
