12 분 소요

Pandas (1)

Pandas 개요

  • pandas는 for문을 사용하지 않고 데이터를 처리한다거나 배열 기반의 함수를 제공하는 등 NumPy의 배열 기반 계산 스타일을 많이 차용
  • pandas가 NumPy 스타일을 많이 차용했지만 가장 큰 차이점은 pandas는 표 형식의 데이터나 다양한 형태의 데이터를 다루는 데 초점을 맞춰 설계
  • 그에 비해 NumPy는 단일 산술 배열 데이터를 다루는 데 특화
  • 고수준의 자료구조를 제공하고 파이썬 생태계 내의 다른 분석 라이브러리 등과 함께 사용

1. Pandas 자료구조

import pandas as pd
import numpy as np

1.1 Series

  • 1차원 데이터
obj = pd.Series([4, 7, -5, 3])
obj
0    4
1    7
2   -5
3    3
dtype: int64
obj.values
array([ 4,  7, -5,  3])
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
d    4
b    7
a   -5
c    3
dtype: int64
obj2['d'] # 라벨 색인
4
obj2[0] # 정수 색인
4
obj2[[0, 1, 3]] # 팬시 색인 (정수)
d    4
b    7
c    3
dtype: int64
obj2[['d', 'b', 'c']] 
d    4
b    7
c    3
dtype: int64
obj2 > 0
d     True
b     True
a    False
c     True
dtype: bool
obj2[obj2 > 0] # 불리안 색인
d    4
b    7
c    3
dtype: int64
obj2 * 2
d     8
b    14
a   -10
c     6
dtype: int64
np.exp(obj2) # 유니버설 함수
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
sdata = {'Ohio':3500, 'Texas':71000, 'Oregon':16000, 'Utah':5000}
obj3 = pd.Series(sdata)
obj3
Ohio       3500
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
obj4 = pd.Series(sdata, index=['California', 'Ohio', 'Oregon', 'Texas'])
obj4
California        NaN
Ohio           3500.0
Oregon        16000.0
Texas         71000.0
dtype: float64
pd.isnull(obj4).sum()
1
pd.notnull(obj4)
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
obj
0    4
1    7
2   -5
3    3
dtype: int64
obj.index = ['a', 'b', 'c', 'd']
obj
a    4
b    7
c   -5
d    3
dtype: int64

1.2 DataFrame

  • 2차원 데이터
data = {'state' : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year' : [2000, 2001, 2002, 2001, 2002, 2003],
        'pop' : [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
        }

frame = pd.DataFrame(data)
frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
frame.index
RangeIndex(start=0, stop=6, step=1)
frame.columns
Index(['state', 'year', 'pop'], dtype='object')
frame.values
array([['Ohio', 2000, 1.5],
       ['Ohio', 2001, 1.7],
       ['Ohio', 2002, 3.6],
       ['Nevada', 2001, 2.4],
       ['Nevada', 2002, 2.9],
       ['Nevada', 2003, 3.2]], dtype=object)
frame.head(3)
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
frame.tail(2) # 전체 데이터중 뒤에서부터 2행만 보여주기
state year pop
4 Nevada 2002 2.9
5 Nevada 2003 3.2
data
{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(data, columns= ['year', 'state', 'pop', 'debt'],
                      index = ['one', 'two', 'three', 'four', 'five', 'six'])
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
frame2.index
Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')

(1) 열 색인

frame2['state'] # 2차원 데이터프레임을 색인하면 1차원 시리즈 데이터
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
type(frame2['state'])
pandas.core.series.Series
frame2.state
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

(2) 행 색인

frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
frame2['one'] # error
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3360             try:
-> 3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:


/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()


/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()


KeyError: 'one'


The above exception was the direct cause of the following exception:


KeyError                                  Traceback (most recent call last)

<ipython-input-70-387cfdc2fd9b> in <module>
----> 1 frame2['one']


/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   3456             if self.columns.nlevels > 1:
   3457                 return self._getitem_multilevel(key)
-> 3458             indexer = self.columns.get_loc(key)
   3459             if is_integer(indexer):
   3460                 indexer = [indexer]


/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:
-> 3363                 raise KeyError(key) from err
   3364 
   3365         if is_scalar(key) and isna(key) and not self.hasnans:


KeyError: 'one'
frame2.loc['one'] # 행색인(라벨) 1차원 시리즈 데이터
year     2000
state    Ohio
pop       1.5
debt      NaN
Name: one, dtype: object
frame2.iloc[0] # 행색인(정수)
year     2000
state    Ohio
pop       1.5
debt      NaN
Name: one, dtype: object
frame2.loc['one':'two'] # 행슬라이싱(라벨)
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
frame2.iloc[0:2] # 행슬라이싱(정수) , 주의: 정수슬라이싱을 할때는 끝범위가 포함되지 않음
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
frame2['one':'two'] # 슬라이싱 할 때는 loc가 없어도 가능
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
frame2['debt'] = 16.5 # 브로드캐스팅
frame2
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
six 2003 Nevada 3.2 16.5
frame2['debt'] = np.arange(6) # numpy array를 넣었음
frame2
year state pop debt
one 2000 Ohio 1.5 0
two 2001 Ohio 1.7 1
three 2002 Ohio 3.6 2
four 2001 Nevada 2.4 3
five 2002 Nevada 2.9 4
six 2003 Nevada 3.2 5
sr = pd.Series([1, 2, 3, 4, 5, 6], index = ['one', 'two', 'three', 'four', 'five', 'six'])

frame2['debt'] = sr
frame2
year state pop debt
one 2000 Ohio 1.5 1
two 2001 Ohio 1.7 2
three 2002 Ohio 3.6 3
four 2001 Nevada 2.4 4
five 2002 Nevada 2.9 5
six 2003 Nevada 3.2 6
sr = pd.Series([1, 3, 5], index = ['one', 'three', 'five'])

frame2['debt'] = sr
frame2
year state pop debt
one 2000 Ohio 1.5 1.0
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 3.0
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 5.0
six 2003 Nevada 3.2 NaN
pop = {'Nevada' : {2001:2.4, 2002:2.9},
       'Ohio' : {2000:1.5, 2001:1.7, 2002:3.6}}

frame3 = pd.DataFrame(pop)       
frame3
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2000 NaN 1.5
frame3.T
2001 2002 2000
Nevada 2.4 2.9 NaN
Ohio 1.7 3.6 1.5
pd.DataFrame(pop, index= [2001, 2002, 2003])
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN

1.3 Index

  • 컬럼명, 인덱스 (데이터의 행과 열을 알려주는 메타데이터)
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj
a    0
b    1
c    2
dtype: int64
obj.index
Index(['a', 'b', 'c'], dtype='object')
obj.index[:2]
Index(['a', 'b'], dtype='object')
obj.index[1]
'b'
obj.index[1] = 'd' # 쓰기 금지
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-104-f78538b7b4e7> in <module>
----> 1 obj.index[1] = 'd'


/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
   4583     @final
   4584     def __setitem__(self, key, value):
-> 4585         raise TypeError("Index does not support mutable operations")
   4586 
   4587     def __getitem__(self, key):


TypeError: Index does not support mutable operations
labels = pd.Index(['a', 'b', 'c'])
pd.Series([10, 20, 30], index=labels)
a    10
b    20
c    30
dtype: int64

2. 중요한 기능들

2.1 재색인

sr = pd.Series([1, 2, 3, 4], index=[0, 3, 4, 5])
sr
0    1
3    2
4    3
5    4
dtype: int64
sr.reindex(np.arange(6))
0    1.0
1    NaN
2    NaN
3    2.0
4    3.0
5    4.0
dtype: float64
sr.reindex(np.arange(6), method='bfill')
0    1
1    2
2    2
3    2
4    3
5    4
dtype: int64
sr.reindex(np.arange(6), method='ffill')
0    1
1    1
2    1
3    2
4    3
5    4
dtype: int64

2.2 로우나 컬럼 삭제하기

sr = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
sr
a    0
b    1
c    2
d    3
e    4
dtype: int64
sr.drop('c')
a    0
b    1
d    3
e    4
dtype: int64
sr.drop(['c', 'd'])
a    0
b    1
e    4
dtype: int64
data = pd.DataFrame(np.arange(16).reshape(4,4),
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns = ['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
# axis=0 (행축), axis=1 (열축)
# axis=index, axis=columns
# (1) drop 연산을 할 경우에는 지정된 "축을" 삭제
# (2) 통계/수학 메서드(sum, mean, ...)를 사용할 때는 "축을 따라서" 계산
# 열삭제
# data.drop('two', axis=1) # axis 기본값은 0
data.drop('two', axis='columns')
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15
data.drop(['two', 'four'], axis='columns')
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14

2.3 색인하기, 선택하기, 거르기

data = pd.DataFrame(np.arange(16).reshape(4,4),
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns = ['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data['two'] # Series
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
data[['two', 'four']] # DataFrame
two four
Ohio 1 3
Colorado 5 7
Utah 9 11
New York 13 15
data < 5
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
New York False False False False
data[data < 5] = 0
data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
  • loc, iloc
data.loc['Colorado']['one']
0
data.loc['Colorado']['one':'three']
one      0
two      5
three    6
Name: Colorado, dtype: int64
data.loc['Colorado'][['one', 'three']]
one      0
three    6
Name: Colorado, dtype: int64
data.loc['Colorado', ['one', 'three']]
one      0
three    6
Name: Colorado, dtype: int64
data.iloc[1, [0, 2]]
one      0
three    6
Name: Colorado, dtype: int64
data.loc[:'Utah']
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
data.iloc[:3]
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11

실습

  • 데이터 프레임 객체에서 색인 연습
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five'])
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
from IPython.display import Image
Image('./images/image7.png', width=400)

png

# 1
frame2['state']
# 2
frame2[['state', 'pop', 'debt']]
frame2.loc[:, 'state':'debt']
# 3
frame2.loc['two']
frame2.iloc[1]
# 4
frame2.loc['two':'four']
frame2.iloc[1:4]
# 5
frame2.iloc[2:4, 2:4]
frame2.loc['three':'four', 'pop':'debt']
# 6
frame2[['pop', 'debt', 'state']]
# 7
frame2.loc[['three', 'four', 'two']]
frame2.iloc[[2, 3, 1]]

3. 산술 연산과 데이터 정렬

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])
s1
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
s2
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
s1 + s2
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
df1 = pd.DataFrame(np.arange(12).reshape(3, 4), columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20).reshape(4, 5), columns = list('abcde'))
df2.loc[1, 'b'] = np.nan
df1
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
df2
a b c d e
0 0 1.0 2 3 4
1 5 NaN 7 8 9
2 10 11.0 12 13 14
3 15 16.0 17 18 19
df1 + df2
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 NaN 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
df1.add(df2, fill_value=0) # 값이 없는 원소들은 0으로 채워넣고 연산을 진행
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 5.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0

4. 함수 적용과 매핑

frame = pd.DataFrame(np.arange(12).reshape(4,3), columns= list('bde'),
                     index = ["Utah", "Ohio", "Texas", "Oregon"])
frame
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
# 컬럼별 합계
frame.sum() # pandas에서는 numpy와 다르게 axis=0이 기본값으로 설정
            # 0번축(행축)을 따라서 합계구하기
b    18
d    22
e    26
dtype: int64
frame.sum(axis=0)
b    18
d    22
e    26
dtype: int64
# 행별 합계
frame.sum(axis='columns') # frame.sum(axis=1) # 열을 따라서 합계 구하기
Utah       3
Ohio      12
Texas     21
Oregon    30
dtype: int64
np.sum(frame, axis=1)
Utah       3
Ohio      12
Texas     21
Oregon    30
dtype: int64
frame.mean()
b    4.5
d    5.5
e    6.5
dtype: float64
frame.min()
b    0
d    1
e    2
dtype: int64
frame.max(axis=1)
Utah       2
Ohio       5
Texas      8
Oregon    11
dtype: int64
frame.max(axis=0) - frame.min(axis=0)
b    9
d    9
e    9
dtype: int64
  • DataFrame에서 제공하지 않는 함수는 직접 만들어서 적용할 수 있음 (apply 함수 이용)
def range_f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must expose ``max()`` and ``min()`` methods, as a pandas Series
    or NumPy array does.
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

frame.apply(range_f, axis=0)  
b    9
d    9
e    9
dtype: int64
frame.apply(lambda x:x.max() - x.min(), axis=0) 
b    9
d    9
e    9
dtype: int64
  • DataFrame의 모든 원소에 어떤 함수를 적용할 때 applymap
frame
b d e
Utah 0 1 2
Ohio 3 4 5
Texas 6 7 8
Oregon 9 10 11
x = 0.45678
'%.2f'%x
'0.46'
def fmt(x):
    """Format ``x`` as a string with two digits after the decimal point."""
    two_decimals = '%.2f'
    return two_decimals % x
frame.applymap(fmt)
b d e
Utah 0.00 1.00 2.00
Ohio 3.00 4.00 5.00
Texas 6.00 7.00 8.00
Oregon 9.00 10.00 11.00
frame['b'].applymap(fmt) # Series 데이터에는 applymap을 적용할 수 없고
                         # 대신 map 함수 사용
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-296-9a0b524b5fa3> in <module>
----> 1 frame['b'].applymap(fmt)


/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py in __getattr__(self, name)
   5485         ):
   5486             return self[name]
-> 5487         return object.__getattribute__(self, name)
   5488 
   5489     def __setattr__(self, name: str, value) -> None:


AttributeError: 'Series' object has no attribute 'applymap'
frame['b'].map(fmt)
Utah      0.00
Ohio      3.00
Texas     6.00
Oregon    9.00
Name: b, dtype: object
  • 누락된 값(NaN)이 있는지 Bool 값으로 확인하기
frame.isnull()
b d e
Utah False False False
Ohio False False False
Texas False False False
Oregon False False False

Reference

파이썬 라이브러리를 활용한 데이터 분석 (웨스 맥키니 저)

댓글남기기