Data analysis 3 (pandas)
Pandas (1)
Pandas 개요
- pandas는 for문을 사용하지 않고 데이터를 처리한다거나 배열 기반의 함수를 제공하는 등 NumPy의 배열 기반 계산 스타일을 많이 차용
- pandas가 NumPy 스타일을 많이 차용했지만 가장 큰 차이점은 pandas는 표 형식의 데이터나 다양한 형태의 데이터를 다루는 데 초점을 맞춰 설계
- 그에 비해 NumPy는 단일 산술 배열 데이터를 다루는 데 특화
- 고수준의 자료구조를 제공하고 파이썬 생태계 내의 다른 분석 라이브러리 등과 함께 사용
1. Pandas 자료구조
import pandas as pd
import numpy as np
1.1 Series
- 1차원 데이터
obj = pd.Series([4, 7, -5, 3])
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj.values
array([ 4, 7, -5, 3])
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
d 4
b 7
a -5
c 3
dtype: int64
obj2['d'] # 라벨 색인
4
obj2[0] # 정수 색인
4
obj2[[0, 1, 3]] # 팬시 색인 (정수)
d 4
b 7
c 3
dtype: int64
obj2[['d', 'b', 'c']]
d 4
b 7
c 3
dtype: int64
obj2 > 0
d True
b True
a False
c True
dtype: bool
obj2[obj2 > 0] # 불리안 색인
d 4
b 7
c 3
dtype: int64
obj2 * 2
d 8
b 14
a -10
c 6
dtype: int64
np.exp(obj2) # 유니버설 함수
d 54.598150
b 1096.633158
a 0.006738
c 20.085537
dtype: float64
sdata = {'Ohio':3500, 'Texas':71000, 'Oregon':16000, 'Utah':5000}
obj3 = pd.Series(sdata)
obj3
Ohio 3500
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
obj4 = pd.Series(sdata, index=['California', 'Ohio', 'Oregon', 'Texas'])
obj4
California NaN
Ohio 3500.0
Oregon 16000.0
Texas 71000.0
dtype: float64
pd.isnull(obj4).sum()
1
pd.notnull(obj4)
California False
Ohio True
Oregon True
Texas True
dtype: bool
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj.index = ['a', 'b', 'c', 'd']
obj
a 4
b 7
c -5
d 3
dtype: int64
1.2 DataFrame
- 2차원 데이터
data = {'state' : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year' : [2000, 2001, 2002, 2001, 2002, 2003],
'pop' : [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}
frame = pd.DataFrame(data)
frame
state | year | pop | |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
frame.index
RangeIndex(start=0, stop=6, step=1)
frame.columns
Index(['state', 'year', 'pop'], dtype='object')
frame.values
array([['Ohio', 2000, 1.5],
['Ohio', 2001, 1.7],
['Ohio', 2002, 3.6],
['Nevada', 2001, 2.4],
['Nevada', 2002, 2.9],
['Nevada', 2003, 3.2]], dtype=object)
frame.head(3)
state | year | pop | |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
frame.tail(2) # 전체 데이터중 뒤에서부터 2행만 보여주기
state | year | pop | |
---|---|---|---|
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
data
{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame2 = pd.DataFrame(data, columns= ['year', 'state', 'pop', 'debt'],
index = ['one', 'two', 'three', 'four', 'five', 'six'])
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
six | 2003 | Nevada | 3.2 | NaN |
frame2.index
Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
(1) 열 색인
frame2['state'] # 2차원 데이터프레임을 색인하면 1차원 시리즈 데이터
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: state, dtype: object
type(frame2['state'])
pandas.core.series.Series
frame2.state
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: state, dtype: object
(2) 행 색인
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
six | 2003 | Nevada | 3.2 | NaN |
frame2['one'] # error
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/usr/local/lib/python3.8/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'one'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-70-387cfdc2fd9b> in <module>
----> 1 frame2['one']
/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py in __getitem__(self, key)
3456 if self.columns.nlevels > 1:
3457 return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
3459 if is_integer(indexer):
3460 indexer = [indexer]
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 'one'
frame2.loc['one'] # 행색인(라벨) 1차원 시리즈 데이터
year 2000
state Ohio
pop 1.5
debt NaN
Name: one, dtype: object
frame2.iloc[0] # 행색인(정수)
year 2000
state Ohio
pop 1.5
debt NaN
Name: one, dtype: object
frame2.loc['one':'two'] # 행슬라이싱(라벨)
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
frame2.iloc[0:2] # 행슬라이싱(정수) , 주의: 정수슬라이싱을 할때는 끝범위가 포함되지 않음
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
frame2['one':'two'] # 슬라이싱 할 때는 loc가 없어도 가능
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
six | 2003 | Nevada | 3.2 | NaN |
frame2['debt'] = 16.5 # 브로드캐스팅
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 16.5 |
two | 2001 | Ohio | 1.7 | 16.5 |
three | 2002 | Ohio | 3.6 | 16.5 |
four | 2001 | Nevada | 2.4 | 16.5 |
five | 2002 | Nevada | 2.9 | 16.5 |
six | 2003 | Nevada | 3.2 | 16.5 |
frame2['debt'] = np.arange(6) # numpy array를 넣었음
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 0 |
two | 2001 | Ohio | 1.7 | 1 |
three | 2002 | Ohio | 3.6 | 2 |
four | 2001 | Nevada | 2.4 | 3 |
five | 2002 | Nevada | 2.9 | 4 |
six | 2003 | Nevada | 3.2 | 5 |
sr = pd.Series([1, 2, 3, 4, 5, 6], index = ['one', 'two', 'three', 'four', 'five', 'six'])
frame2['debt'] = sr
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 1 |
two | 2001 | Ohio | 1.7 | 2 |
three | 2002 | Ohio | 3.6 | 3 |
four | 2001 | Nevada | 2.4 | 4 |
five | 2002 | Nevada | 2.9 | 5 |
six | 2003 | Nevada | 3.2 | 6 |
sr = pd.Series([1, 3, 5], index = ['one', 'three', 'five'])
frame2['debt'] = sr
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 1.0 |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | 3.0 |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | 5.0 |
six | 2003 | Nevada | 3.2 | NaN |
pop = {'Nevada' : {2001:2.4, 2002:2.9},
'Ohio' : {2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = pd.DataFrame(pop)
frame3
Nevada | Ohio | |
---|---|---|
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
2000 | NaN | 1.5 |
frame3.T
2001 | 2002 | 2000 | |
---|---|---|---|
Nevada | 2.4 | 2.9 | NaN |
Ohio | 1.7 | 3.6 | 1.5 |
pd.DataFrame(pop, index= [2001, 2002, 2003])
Nevada | Ohio | |
---|---|---|
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
2003 | NaN | NaN |
1.3 Index
- 컬럼명, 인덱스 (데이터의 행과 열을 알려주는 메타 데이터)
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj
a 0
b 1
c 2
dtype: int64
obj.index
Index(['a', 'b', 'c'], dtype='object')
obj.index[:2]
Index(['a', 'b'], dtype='object')
obj.index[1]
'b'
obj.index[1] = 'd' # 쓰기 금지
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-104-f78538b7b4e7> in <module>
----> 1 obj.index[1] = 'd'
/usr/local/lib/python3.8/dist-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
4583 @final
4584 def __setitem__(self, key, value):
-> 4585 raise TypeError("Index does not support mutable operations")
4586
4587 def __getitem__(self, key):
TypeError: Index does not support mutable operations
labels = pd.Index(['a', 'b', 'c'])
pd.Series([10, 20, 30], index=labels)
a 10
b 20
c 30
dtype: int64
2. 중요한 기능들
2.1 재색인
sr = pd.Series([1, 2, 3, 4], index=[0, 3, 4, 5])
sr
0 1
3 2
4 3
5 4
dtype: int64
sr.reindex(np.arange(6))
0 1.0
1 NaN
2 NaN
3 2.0
4 3.0
5 4.0
dtype: float64
sr.reindex(np.arange(6), method='bfill')
0 1
1 2
2 2
3 2
4 3
5 4
dtype: int64
sr.reindex(np.arange(6), method='ffill')
0 1
1 1
2 1
3 2
4 3
5 4
dtype: int64
2.2 로우나 컬럼 삭제하기
sr = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
sr
a 0
b 1
c 2
d 3
e 4
dtype: int64
sr.drop('c')
a 0
b 1
d 3
e 4
dtype: int64
sr.drop(['c', 'd'])
a 0
b 1
e 4
dtype: int64
data = pd.DataFrame(np.arange(16).reshape(4,4),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
# axis=0 (행축), axis=1 (열축)
# axis=index, axis=columns
# (1) drop 연산을 할 경우에는 지정된 "축을" 삭제
# (2) 통계/수학 메서드(sum, mean, ...)를 사용할 때는 "축을 따라서" 계산
# 열삭제
# data.drop('two', axis=1) # axis 기본값은 0
data.drop('two', axis='columns')
one | three | four | |
---|---|---|---|
Ohio | 0 | 2 | 3 |
Colorado | 4 | 6 | 7 |
Utah | 8 | 10 | 11 |
New York | 12 | 14 | 15 |
data.drop(['two', 'four'], axis='columns')
one | three | |
---|---|---|
Ohio | 0 | 2 |
Colorado | 4 | 6 |
Utah | 8 | 10 |
New York | 12 | 14 |
2.3 색인하기, 선택하기, 거르기
data = pd.DataFrame(np.arange(16).reshape(4,4),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns = ['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
data['two'] # Series
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int64
data[['two', 'four']] # DataFrame
two | four | |
---|---|---|
Ohio | 1 | 3 |
Colorado | 5 | 7 |
Utah | 9 | 11 |
New York | 13 | 15 |
data < 5
one | two | three | four | |
---|---|---|---|---|
Ohio | True | True | True | True |
Colorado | True | False | False | False |
Utah | False | False | False | False |
New York | False | False | False | False |
data[data < 5] = 0
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
- loc, iloc
data.loc['Colorado']['one']
0
data.loc['Colorado']['one':'three']
one 0
two 5
three 6
Name: Colorado, dtype: int64
data.loc['Colorado'][['one', 'three']]
one 0
three 6
Name: Colorado, dtype: int64
data.loc['Colorado', ['one', 'three']]
one 0
three 6
Name: Colorado, dtype: int64
data.iloc[1, [0, 2]]
one 0
three 6
Name: Colorado, dtype: int64
data.loc[:'Utah']
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
data.iloc[:3]
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
실습
- 데이터 프레임 객체에서 색인 연습
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four', 'five'])
frame2
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
from IPython.display import Image
Image('./images/image7.png', width=400)
# 1
frame2['state']
# 2
frame2[['state', 'pop', 'debt']]
frame2.loc[:, 'state':'debt']
# 3
frame2.loc['two']
frame2.iloc[1]
# 4
frame2.loc['two':'four']
frame2.iloc[1:4]
# 5
frame2.iloc[2:4, 2:4]
frame2.loc['three':'four', 'pop':'debt']
# 6
frame2[['pop', 'debt', 'state']]
# 7
frame2.loc[['three', 'four', 'two']]
frame2.iloc[[2, 3, 1]]
3. 산술 연산과 데이터 정렬
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])
s1
a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64
s2
a -2.1
c 3.6
e -1.5
f 4.0
g 3.1
dtype: float64
s1 + s2
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
df1 = pd.DataFrame(np.arange(12).reshape(3, 4), columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20).reshape(4, 5), columns = list('abcde'))
df2.loc[1, 'b'] = np.nan
df1
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
df2
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0 | 1.0 | 2 | 3 | 4 |
1 | 5 | NaN | 7 | 8 | 9 |
2 | 10 | 11.0 | 12 | 13 | 14 |
3 | 15 | 16.0 | 17 | 18 | 19 |
df1 + df2
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 2.0 | 4.0 | 6.0 | NaN |
1 | 9.0 | NaN | 13.0 | 15.0 | NaN |
2 | 18.0 | 20.0 | 22.0 | 24.0 | NaN |
3 | NaN | NaN | NaN | NaN | NaN |
df1.add(df2, fill_value=0) # 값이 없는 원소들은 0으로 채워넣고 연산을 진행
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 2.0 | 4.0 | 6.0 | 4.0 |
1 | 9.0 | 5.0 | 13.0 | 15.0 | 9.0 |
2 | 18.0 | 20.0 | 22.0 | 24.0 | 14.0 |
3 | 15.0 | 16.0 | 17.0 | 18.0 | 19.0 |
4. 함수 적용과 매핑
frame = pd.DataFrame(np.arange(12).reshape(4,3), columns= list('bde'),
index = ["Utah", "Ohio", "Texas", "Oregon"])
frame
b | d | e | |
---|---|---|---|
Utah | 0 | 1 | 2 |
Ohio | 3 | 4 | 5 |
Texas | 6 | 7 | 8 |
Oregon | 9 | 10 | 11 |
# 컬럼별 합계
frame.sum() # pandas에서는 numpy와 다르게 axis=0이 기본값으로 설정
# 0번축(행축)을 따라서 합계구하기
b 18
d 22
e 26
dtype: int64
frame.sum(axis=0)
b 18
d 22
e 26
dtype: int64
# Row-wise totals
frame.sum(axis='columns') # equivalent to frame.sum(axis=1): sums along the column axis, giving one total per row
Utah 3
Ohio 12
Texas 21
Oregon 30
dtype: int64
np.sum(frame, axis=1)
Utah 3
Ohio 12
Texas 21
Oregon 30
dtype: int64
frame.mean()
b 4.5
d 5.5
e 6.5
dtype: float64
frame.min()
b 0
d 1
e 2
dtype: int64
frame.max(axis=1)
Utah 2
Ohio 5
Texas 8
Oregon 11
dtype: int64
frame.max(axis=0) - frame.min(axis=0)
b 9
d 9
e 9
dtype: int64
- Dataframe에서 제공하지 않는 함수는 직접 만들어서 적용할 수 있음 (apply 함수 이용)
def range_f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    Intended for use with ``DataFrame.apply``, where ``x`` is one column
    (``axis=0``) or one row (``axis=1``) passed in as a Series.

    Args:
        x: Any object exposing ``max()`` and ``min()`` methods.

    Returns:
        The difference ``x.max() - x.min()``.
    """
    return x.max() - x.min()
frame.apply(range_f, axis=0)
b 9
d 9
e 9
dtype: int64
frame.apply(lambda x:x.max() - x.min(), axis=0)
b 9
d 9
e 9
dtype: int64
- Dataframe의 모든 원소에 어떤 함수를 적용할 때 applymap
frame
b | d | e | |
---|---|---|---|
Utah | 0 | 1 | 2 |
Ohio | 3 | 4 | 5 |
Texas | 6 | 7 | 8 |
Oregon | 9 | 10 | 11 |
x = 0.45678
'%.2f'%x
'0.46'
def fmt(x):
    """Render a number as text with exactly two decimal places.

    Args:
        x: A numeric value accepted by the ``%.2f`` format specifier.

    Returns:
        A string such as ``'0.46'`` for ``0.45678`` or ``'3.00'`` for ``3``.
    """
    return '%.2f' % x
frame.applymap(fmt)
b | d | e | |
---|---|---|---|
Utah | 0.00 | 1.00 | 2.00 |
Ohio | 3.00 | 4.00 | 5.00 |
Texas | 6.00 | 7.00 | 8.00 |
Oregon | 9.00 | 10.00 | 11.00 |
frame['b'].applymap(fmt) # Series 데이터에는 applymap을 적용할 수 없고
# 대신 map 함수 사용
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-296-9a0b524b5fa3> in <module>
----> 1 frame['b'].applymap(fmt)
/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py in __getattr__(self, name)
5485 ):
5486 return self[name]
-> 5487 return object.__getattribute__(self, name)
5488
5489 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'applymap'
frame['b'].map(fmt)
Utah 0.00
Ohio 3.00
Texas 6.00
Oregon 9.00
Name: b, dtype: object
- 누락된 값(NaN)이 있는지 bool 값으로 확인하기
frame.isnull()
b | d | e | |
---|---|---|---|
Utah | False | False | False |
Ohio | False | False | False |
Texas | False | False | False |
Oregon | False | False | False |
댓글남기기