
Python for Data Analysis (7): Data Wrangling: Clean, Transform, Merge, Reshape

A large amount of the programming work in data analysis and modeling is spent on data preparation: loading, cleaning, transforming, and reshaping.

pandas and the Python standard library provide a set of high-level, flexible, and efficient core functions and algorithms
that make it easy to wrangle data into the right form.
Notably, much of pandas's functionality grew out of needs encountered in real-world applications.

Data wrangling mainly covers: combining datasets, reshaping and pivoting, data transformation, and string manipulation.

Combining Datasets

Reading notes on "Python for Data Analysis".

Chapter 7, Section 1: Combining Datasets

All of the data used here can be downloaded from the author's GitHub.

%pylab inline
import pandas as pd
from pandas import Series, DataFrame
import json
Populating the interactive namespace from numpy and matplotlib

pandas provides several built-in ways to combine data (merge is explored in depth below; a quick sketch of the other two follows this list):

  • pandas.merge(): connects rows in different DataFrames based on one or more keys. This is the same as a database join in SQL.
  • pandas.concat(): stacks multiple objects together along an axis.
  • DataFrame.combine_first(): splices overlapping data together, using values from one object to fill in missing values in another.
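
A quick, minimal sketch of concat and combine_first (using throwaway Series defined here, not data from the book):

import numpy as np
import pandas as pd
from pandas import Series

s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3], index=['c', 'd'])
# concat stacks the two Series end to end along the row axis
pd.concat([s1, s2])        # a 0, b 1, c 2, d 3

s3 = Series([np.nan, 5.], index=['a', 'b'])
# combine_first takes s3's values, falling back to s1 where s3 is missing
s3.combine_first(s1)       # a 0.0, b 5.0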

Database-Style DataFrame Merges

Merge or join operations combine datasets by linking rows using one or more keys. These operations are central to relational databases.

merge() supports inner, left, right, and outer joins, specified via the how argument. The default is an inner join.

A many-to-many merge produces the Cartesian product of the matching rows: for each key value, every combination of the corresponding rows from the two objects. The join type only affects which key values appear in the result:

  • an inner join keeps only the key values present in both merge columns (the intersection)
  • a left join keeps all key values from the left object
  • a right join keeps all key values from the right object (demonstrated after the left-join example below)
  • an outer join keeps the union of the merge columns' key values, effectively combining the left and right joins
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df1
Out[12]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
df2
Out[13]:
data2 key
0 0 a
1 1 b
2 2 d
# By default, merge uses the overlapping column names as keys and keeps only the intersection of key values, discarding the rest (an inner join)
pd.merge(df1, df2)
Out[20]:
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
# It is better to specify the merge column explicitly
pd.merge(df1, df2, on='key')
Out[15]:
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
# When the key column names differ, they can be specified separately
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
Out[17]:
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
# Outer join:
pd.merge(df1,df2,how = 'outer')
Out[19]:
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
# New example frames for a many-to-many left join
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df1
Out[24]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 b
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})
df2
Out[22]:
data2 key
0 0 a
1 1 b
2 2 a
3 3 b
4 4 d
# Left join
pd.merge(df1, df2, on='key', how='left')
Out[26]:
data1 key data2
0 0 b 1.0
1 0 b 3.0
2 1 b 1.0
3 1 b 3.0
4 2 a 0.0
5 2 a 2.0
6 3 c NaN
7 4 a 0.0
8 4 a 2.0
9 5 b 1.0
10 5 b 3.0
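For completeness (the bullet list above promised it), a right join of the same two frames keeps every key value from df2:

# Right join: all of df2's keys survive. 'd' has no match in df1, so its
# data1 comes out NaN, while 'c' (present only in df1) is dropped.
# 11 rows in total: 4 for 'a', 6 for 'b', 1 for 'd'.
pd.merge(df1, df2, on='key', how='right')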
# To merge on multiple keys, pass a list of column names
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')
Out[27]:
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
# For overlapping non-key column names, pandas automatically appends suffixes
pd.merge(left,right,on = 'key1')
Out[30]:
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
# The suffixes option specifies the strings to append
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
Out[31]:
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7

Merging on Index

When the merge key(s) are found in the index, pass left_index=True and/or right_index=True to use the index as the merge key.

left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                  'value': range(6)})
left1
Out[32]:
key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
right1
Out[33]:
group_val
a 3.5
b 7.0
pd.merge(left1, right1, left_on='key', right_index=True)
Out[34]:
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
Out[35]:
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN
# With hierarchically indexed data
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
lefth
Out[36]:
data key1 key2
0 0.0 Ohio 2000
1 1.0 Ohio 2001
2 2.0 Ohio 2002
3 3.0 Nevada 2001
4 4.0 Nevada 2002
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
righth
Out[37]:
event1 event2
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11
# In this case, you must indicate the columns to merge on as a list (note the handling of duplicate index values)
# Note that the result's index comes from the left object
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
Out[39]:
data key1 key2 event1 event2
0 0.0 Ohio 2000 4 5
0 0.0 Ohio 2000 6 7
1 1.0 Ohio 2001 8 9
2 2.0 Ohio 2002 10 11
3 3.0 Nevada 2001 0 1
# Using the indexes of both sides of the merge
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
                 columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
Out[40]:
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
# DataFrame.join() is a more convenient way to merge by index
# It can also combine several DataFrames having the same or similar indexes, regardless of overlapping columns
# DataFrame's join method performs a left join on the join keys by default
left2.join(right2, how='outer')
Out[44]:
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
left1.join(right1, on='key')
Out[43]:
key value group_val
0 a 0 3.5
1 b 1 7.0
2 a 2 3.5
3 a 3 3.5
4 b 4 7.0
5 c 5 NaN
# join() also supports joining the passed DataFrame's index against a column of the calling DataFrame (much like merge's left_index-style parameters)
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
another
Out[45]:
New York Oregon
a 7.0 8.0
c 9.0 10.0
e 11.0 12.0
f 16.0 17.0
left2.join([right2, another])
Out[46]:
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
# 对于简单的索引合并,还可以向join传入多个DataFrame
left2.join([right2, another], how='outer')
Out[47]:
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
b NaN NaN 7.0 8.0 NaN NaN
c 3.0 4.0 9.0 10.0 9.0 10.0
d NaN NaN 11.0 12.0 NaN NaN
e 5.0 6.0 13.0 14.0 11.0 12.0
f NaN NaN NaN NaN 16.0 17.0

Combining Data with Missing Values

combine_first patches missing values in the calling object with values from the object passed in, aligning the data by index; element-wise, it amounts to an if-else that prefers the non-missing value.

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
a
Out[3]:
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
b
Out[4]:
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64
np.where(pd.isnull(a), b, a)
Out[5]:
array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])
b[:-2].combine_first(a[2:])
Out[6]:
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)
Out[7]:
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN
 

Reshaping and Pivoting

Reading notes on "Python for Data Analysis".

Chapter 7, Section 2: Reshaping and Pivoting

All of the data used here can be downloaded from the author's GitHub.

%pylab inline
import pandas as pd
from pandas import Series, DataFrame
import json
Populating the interactive namespace from numpy and matplotlib

pandas has a number of operations for rearranging tabular data, known as reshape and pivot operations.

Reshaping with Hierarchical Indexing

stack: "rotates" the columns of the data into the rows

unstack: "rotates" the rows of the data into the columns

data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
data
Out[10]:
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
result = data.stack()
result
Out[11]:
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64
result.unstack()
Out[12]:
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
# By default, unstack operates on the innermost level; to unstack a different level, pass a level number or name (the outermost level is number 0)

result.unstack(0)
Out[16]:
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
# The level can also be specified by name
result.unstack('state')
Out[17]:
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
# Now a case with missing data: unstack() introduces NaN for missing combinations
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2.unstack()
Out[18]:
a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0
# stack filters out missing data by default
data2.unstack().stack()
Out[20]:
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
# Keep the missing values
data2.unstack().stack(dropna=False)
Out[21]:
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
# When you unstack a DataFrame, the level unstacked becomes the lowest (innermost) level in the result
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
df
Out[22]:
side left right
state number
Ohio one 0 5
two 1 6
three 2 7
Colorado one 3 8
two 4 9
three 5 10
df.unstack('state')
Out[23]:
side left right
state Ohio Colorado Ohio Colorado
number
one 0 3 5 8
two 1 4 6 9
three 2 5 7 10
df.unstack('state').stack('side')
Out[24]:
state Ohio Colorado
number side
one left 0 3
right 5 8
two left 1 4
right 6 9
three left 2 5
right 7 10

pivot: converting "long" format to "wide" format
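
The example below uses macrodata.csv from the book's repository. If that file isn't at hand, here is a minimal self-contained sketch of the same long-to-wide conversion (the frame and its values are made up):

long_df = DataFrame({'date': ['2000Q1', '2000Q1', '2000Q2', '2000Q2'],
                     'item': ['gdp', 'unemp', 'gdp', 'unemp'],
                     'value': [100.0, 5.0, 102.0, 4.9]})
# Each (date, item) pair becomes one cell of a date x item table
long_df.pivot(index='date', columns='item', values='value')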

data = pd.read_csv('data/ch07/macrodata.csv')
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))

ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata[:10]
Out[32]:
date item value
0 1959-03-31 realgdp 2710.349
1 1959-03-31 infl 0.000
2 1959-03-31 unemp 5.800
3 1959-06-30 realgdp 2778.801
4 1959-06-30 infl 2.340
5 1959-06-30 unemp 5.100
6 1959-09-30 realgdp 2775.488
7 1959-09-30 infl 2.740
8 1959-09-30 unemp 5.300
9 1959-12-31 realgdp 2785.204
# Use date and item as the row and column labels, filling value into the two-dimensional table
# (newer pandas requires keyword arguments: ldata.pivot(index='date', columns='item', values='value'))
pivoted = ldata.pivot('date', 'item', 'value')
pivoted.head()
Out[33]:
item infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
1959-12-31 0.27 2785.204 5.6
1960-03-31 2.31 2847.699 5.2
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]
Out[34]:
date item value value2
0 1959-03-31 realgdp 2710.349 -0.535021
1 1959-03-31 infl 0.000 1.783525
2 1959-03-31 unemp 5.800 2.835291
3 1959-06-30 realgdp 2778.801 -1.444524
4 1959-06-30 infl 2.340 1.728538
5 1959-06-30 unemp 5.100 1.100782
6 1959-09-30 realgdp 2775.488 -1.371209
7 1959-09-30 infl 2.740 1.069021
8 1959-09-30 unemp 5.300 -0.658462
9 1959-12-31 realgdp 2785.204 -2.165827
pivoted = ldata.pivot('date', 'item')
pivoted[:5]
Out[35]:
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8 1.783525 -0.535021 2.835291
1959-06-30 2.34 2778.801 5.1 1.728538 -1.444524 1.100782
1959-09-30 2.74 2775.488 5.3 1.069021 -1.371209 -0.658462
1959-12-31 0.27 2785.204 5.6 -0.322622 -2.165827 -1.525572
1960-03-31 2.31 2847.699 5.2 -1.386987 -0.456043 -0.392422
pivoted['value'][:5]
Out[36]:
item infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
1959-12-31 0.27 2785.204 5.6
1960-03-31 2.31 2847.699 5.2
# pivot is really just a shortcut: it builds a hierarchical index with set_index and then reshapes with unstack
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]
Out[37]:
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8 1.783525 -0.535021 2.835291
1959-06-30 2.34 2778.801 5.1 1.728538 -1.444524 1.100782
1959-09-30 2.74 2775.488 5.3 1.069021 -1.371209 -0.658462
1959-12-31 0.27 2785.204 5.6 -0.322622 -2.165827 -1.525572
1960-03-31 2.31 2847.699 5.2 -1.386987 -0.456043 -0.392422
1960-06-30 0.14 2834.390 5.2 -2.086858 0.316907 -1.492590
1960-09-30 2.70 2839.022 5.6 1.509653 -0.776808 0.520116
 

Data Transformation

Reading notes on "Python for Data Analysis".

Chapter 7, Section 3: Data Transformation

All of the data used here can be downloaded from the author's GitHub.

%pylab inline
import pandas as pd
from pandas import Series, DataFrame
import json
Populating the interactive namespace from numpy and matplotlib

With rearranging covered, we turn to filtering, cleaning, and other data transformations.

Removing Duplicates

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data
Out[2]:
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
data.duplicated()
Out[3]:
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
# drop_duplicates returns a DataFrame with the duplicate rows removed; this is very commonly used
data.drop_duplicates()
Out[9]:
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
# You can specify a subset of columns on which to detect duplicates
data['v1'] = range(7)
data.drop_duplicates(['k1'])
Out[10]:
k1 k2 v1
0 one 1 0
3 two 3 3
# By default the first observed row is kept; keep='last' keeps the last one instead
data.drop_duplicates(['k1', 'k2'], keep='last')
Out[11]:
k1 k2 v1
1 one 1 1
2 one 2 2
4 two 3 4
6 two 4 6
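
drop_duplicates also accepts keep=False, which drops every row whose key combination occurs more than once; with the data above, only the unique ('one', 2) row survives:

# keep=False removes all copies of any duplicated (k1, k2) pair
data.drop_duplicates(['k1', 'k2'], keep=False)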

Transforming Data Using a Function or Mapping

This lets you perform transformations based on the values in an array, Series, or DataFrame column.

data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[12]:
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
# A mapping from each meat type to the kind of animal it comes from
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
# Series.map accepts a function or a dict-like object containing a mapping
# To make the mapping match, first normalize the case; this pattern is very common
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
Out[18]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
# The same thing can be done by passing a single function
data['food'].map(lambda x: meat_to_animal[x.lower()])
Out[19]:
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing Values

The replace() method offers a simpler way to substitute values.

data = Series([1., -999., 2., -999., -1000., 3.])
data
Out[20]:
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace(-999, np.nan)
Out[21]:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
data.replace([-999, -1000], np.nan)
Out[22]:
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
data.replace([-999, -1000], [np.nan, 0])
Out[23]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
data.replace({-999: np.nan, -1000: 0})
Out[24]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renaming Axis Indexes

Like values, axis labels can be transformed with a map() function.

data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data.index.map(str.upper)
Out[26]:
array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)
data.index = data.index.map(str.upper)
data
Out[27]:
one two three four
OHIO 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
# For axes, the analogue of replace() is rename()
data.rename(index=str.title, columns=str.upper)
Out[28]:
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})
Out[29]:
one two peekaboo four
INDIANA 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
# inplace=True modifies the data in place instead of creating a new structure
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
Out[31]:
one two three four
INDIANA 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11

Discretization and Binning

Continuous data is often discretized or otherwise split into bins (groups) to ease analysis.

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Use cut() to divide ages into bins: 18-25, 25-35, and so on
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
Out[38]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# The return value is a special Categorical object; you can treat it as an array of strings naming the bins.
# It contains a categories array of the distinct bin names and a codes attribute.
cats.codes
Out[39]:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.categories
Out[40]:
Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')
pd.value_counts(cats)
Out[41]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
# Intervals are open on the left and closed on the right by default; pass right=False to close them on the left and open them on the right
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
Out[43]:
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# Specify your own group labels (bin names)
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
Out[46]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
# Passing an integer number of bins computes equal-length bins from the data's min and max (not equal-count bins); precision=2 limits the bin edges to 2 decimal places
data = np.random.rand(20)
pd.cut(data, 4, precision=2)
Out[48]:
[(0.74, 0.98], (0.49, 0.74], (0.25, 0.49], (0.74, 0.98], (0.25, 0.49], ..., (0.00071, 0.25], (0.00071, 0.25], (0.74, 0.98], (0.00071, 0.25], (0.74, 0.98]]
Length: 20
Categories (4, object): [(0.00071, 0.25] < (0.25, 0.49] < (0.49, 0.74] < (0.74, 0.98]]
# qcut bins the data based on sample quantiles; here, cut into quartiles
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats
Out[49]:
[(0.0103, 0.682], [-3.378, -0.647], (0.0103, 0.682], (-0.647, 0.0103], (0.0103, 0.682], ..., (0.0103, 0.682], (-0.647, 0.0103], (-0.647, 0.0103], [-3.378, -0.647], (0.0103, 0.682]]
Length: 1000
Categories (4, object): [[-3.378, -0.647] < (-0.647, 0.0103] < (0.0103, 0.682] < (0.682, 3.119]]
pd.value_counts(cats)
Out[51]:
(0.682, 3.119]      250
(0.0103, 0.682]     250
(-0.647, 0.0103]    250
[-3.378, -0.647]    250
dtype: int64
# Bin according to your own quantiles
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
Out[52]:
[(0.0103, 1.294], [-3.378, -1.276], (0.0103, 1.294], (-1.276, 0.0103], (0.0103, 1.294], ..., (0.0103, 1.294], (-1.276, 0.0103], (-1.276, 0.0103], (-1.276, 0.0103], (0.0103, 1.294]]
Length: 1000
Categories (4, object): [[-3.378, -1.276] < (-1.276, 0.0103] < (0.0103, 1.294] < (1.294, 3.119]]

Detecting and Filtering Outliers

Filtering or transforming outliers is largely a matter of applying array operations.

np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()
Out[53]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528
# Rows of the fourth column whose absolute value exceeds 3
col = data[3]
col[np.abs(col) > 3]
Out[55]:
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
# All rows containing a value exceeding 3 or -3
data[(np.abs(data) > 3).any(1)]
Out[56]:
0 1 2 3
5 -0.539741 0.476985 3.248944 -1.021228
97 -0.774363 0.552936 0.106061 3.927528
102 -0.655054 -0.565230 3.176873 0.959533
305 -2.315555 0.457246 -0.025907 -3.399312
324 0.050188 1.951312 3.260383 0.963301
400 0.146326 0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990 1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586 0.275144 1.179227 -3.184377 1.369891
808 -0.362528 -3.548824 1.553205 -2.186301
900 3.366626 -2.372214 0.851010 1.332846
# Cap values outside the interval [-3, 3]
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
Out[58]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067623 0.068473 0.025153 -0.002081
std 0.995485 0.990253 1.003977 0.989736
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.000000 2.653656 3.000000 3.000000
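
If the goal is simply symmetric capping at +/-3, the same result can be obtained more directly with clip (a sketch, not from the book):

# clip bounds every value to the interval [-3, 3] in one call
data.clip(-3, 3).describe()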

Permutation and Random Sampling

df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
df
Out[59]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
# Returns a random permutation of the integers 0 through 4
sampler = np.random.permutation(5)
sampler
Out[62]:
array([1, 3, 0, 2, 4])
# Use that array in ix-based indexing or with the take function (newer pandas drops ix; iloc works the same way here)
df.take(sampler)
Out[63]:
0 1 2 3
1 4 5 6 7
3 12 13 14 15
0 0 1 2 3
2 8 9 10 11
4 16 17 18 19
# Truncate to select a random subset without replacement
df.take(np.random.permutation(len(df))[:3])
Out[64]:
0 1 2 3
1 4 5 6 7
0 0 1 2 3
4 16 17 18 19
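# To sample with replacement instead, draw random integer positions into the array of choices: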
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler
Out[66]:
array([3, 0, 4, 1, 1, 2, 3, 0, 1, 2])
draws = bag.take(sampler)
draws
Out[67]:
array([ 6,  5,  4,  7,  7, -1,  6,  5,  7, -1])

Computing Indicator/Dummy Variables

Another transformation commonly used for statistical modeling or machine learning is converting a categorical variable into a "dummy matrix" or "indicator matrix".

If a column in a DataFrame has k distinct values, you can derive a matrix or DataFrame with k columns of 1s and 0s.

This approach comes up again in the map example of the next chapter (Chapter 8).

df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
df
Out[70]:
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 b
# Get the dummy-variable DataFrame
pd.get_dummies(df['key'])
Out[71]:
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
# Add a prefix to the indicator columns
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
Out[72]:
data1 key_a key_b key_c
0 0 0 1 0
1 1 0 1 0
2 2 1 0 0
3 3 0 0 1
4 4 1 0 0
5 5 0 1 0
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/ch02/movielens/movies.dat', sep='::', header=None,
                        names=mnames,engine='python')
movies[:10]
Out[75]:
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
5 6 Heat (1995) Action|Crime|Thriller
6 7 Sabrina (1995) Comedy|Romance
7 8 Tom and Huck (1995) Adventure|Children's
8 9 Sudden Death (1995) Action
9 10 GoldenEye (1995) Action|Adventure|Thriller
# Split the `|`-delimited genres and collect the distinct genre names
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
genres
Out[77]:
['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']
# Build an all-zeros DataFrame, one column per genre
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
dummies.head()
Out[83]:
Action Adventure Animation Children's Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
# Set the indicator flags row by row
# (ix is removed in newer pandas; use dummies.loc[i, gen.split('|')] = 1 there)
for i, gen in enumerate(movies.genres):
    dummies.ix[i, gen.split('|')] = 1
# Join the indicators back onto movies
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic
Out[86]:
movie_id title genres Genre_Action Genre_Adventure Genre_Animation Genre_Children's Genre_Comedy Genre_Crime Genre_Documentary ... Genre_Fantasy Genre_Film-Noir Genre_Horror Genre_Musical Genre_Mystery Genre_Romance Genre_Sci-Fi Genre_Thriller Genre_War Genre_Western
0 1 Toy Story (1995) Animation|Children's|Comedy 0.0 0.0 1.0 1.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2 Jumanji (1995) Adventure|Children's|Fantasy 0.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 3 Grumpier Old Men (1995) Comedy|Romance 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
3 4 Waiting to Exhale (1995) Comedy|Drama 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 5 Father of the Bride Part II (1995) Comedy 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 6 Heat (1995) Action|Crime|Thriller 1.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
6 7 Sabrina (1995) Comedy|Romance 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
7 8 Tom and Huck (1995) Adventure|Children's 0.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 9 Sudden Death (1995) Action 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 10 GoldenEye (1995) Action|Adventure|Thriller 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
10 11 American President, The (1995) Comedy|Drama|Romance 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
11 12 Dracula: Dead and Loving It (1995) Comedy|Horror 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
12 13 Balto (1995) Animation|Children's 0.0 0.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13 14 Nixon (1995) Drama 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14 15 Cutthroat Island (1995) Action|Adventure|Romance 1.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
15 16 Casino (1995) Drama|Thriller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
16 17 Sense and Sensibility (1995) Drama|Romance 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
17 18 Four Rooms (1995) Thriller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
18 19 Ace Ventura: When Nature Calls (1995) Comedy 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
19 20 Money Train (1995) Action 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
20 21 Get Shorty (1995) Action|Comedy|Drama 1.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21 22 Copycat (1995) Crime|Drama|Thriller 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
22 23 Assassins (1995) Thriller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
23 24 Powder (1995) Drama|Sci-Fi 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
24 25 Leaving Las Vegas (1995) Drama|Romance 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
25 26 Othello (1995) Drama 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
26 27 Now and Then (1995) Drama 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
27 28 Persuasion (1995) Romance 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
28 29 City of Lost Children, The (1995) Adventure|Sci-Fi 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
29 30 Shanghai Triad (Yao a yao yao dao waipo qiao) ... Drama 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3853 3923 Return of the Fly (1959) Horror|Sci-Fi 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3854 3924 Pajama Party (1964) Comedy 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3855 3925 Stranger Than Paradise (1984) Comedy 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3856 3926 Voyage to the Bottom of the Sea (1961) Adventure|Sci-Fi 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3857 3927 Fantastic Voyage (1966) Adventure|Sci-Fi 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3858 3928 Abbott and Costello Meet Frankenstein (1948) Comedy|Horror 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3859 3929 Bank Dick, The (1940) Comedy 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3860 3930 Creature From the Black Lagoon, The (1954) Horror 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3861 3931 Giant Gila Monster, The (1959) Horror|Sci-Fi 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3862 3932 Invisible Man, The (1933) Horror|Sci-Fi 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3863 3933 Killer Shrews, The (1959) Horror|Sci-Fi 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3864 3934 Kronos (1957) Sci-Fi 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3865 3935 Kronos (1973) Horror 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3866 3936 Phantom of the Opera, The (1943) Drama|Thriller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
3867 3937 Runaway (1984) Sci-Fi|Thriller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0
3868 3938 Slumber Party Massacre, The (1982) Horror 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3869 3939 Slumber Party Massacre II, The (1987) Horror 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3870 3940 Slumber Party Massacre III, The (1990) Horror 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3871 3941 Sorority House Massacre (1986) Horror 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3872 3942 Sorority House Massacre II (1990) Horror 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3873 3943 Bamboozled (2000) Comedy 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3874 3944 Bootmen (2000) Comedy|Drama 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3875 3945 Digimon: The Movie (2000) Adventure|Animation|Children's 0.0 1.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3876 3946 Get Carter (2000) Action|Drama|Thriller 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
3877 3947 Get Carter (1971) Thriller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
3878 3948 Meet the Parents (2000) Comedy 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3879 3949 Requiem for a Dream (2000) Drama 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3880 3950 Tigerland (2000) Drama 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3881 3951 Two Family House (2000) Drama 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3882 3952 Contender, The (2000) Drama|Thriller 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

3883 rows × 21 columns

movies_windic.ix[0]
Out[88]:
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object
# For much bigger data, building the indicators this way is quite slow; a lower-level function that exploits the DataFrame internals would be needed
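These days pandas itself provides such a shortcut for the delimiter-separated strings above: Series.str.get_dummies (added well after the book was written) splits on a separator and builds the 0/1 indicator matrix in one vectorized call. A sketch equivalent in effect to the loop above:

# Split each genres string on '|' and build the indicator matrix directly
dummies = movies.genres.str.get_dummies('|')
movies_windic = movies.join(dummies.add_prefix('Genre_'))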
# A useful recipe for statistical applications: combine get_dummies with a discretization function such as cut
np.random.seed(12345)
values = np.random.rand(10)
values
Out[90]:
array([ 0.92961609,  0.31637555,  0.18391881,  0.20456028,  0.56772503,
        0.5955447 ,  0.96451452,  0.6531771 ,  0.74890664,  0.65356987])
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
Out[91]:
(0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1]
0 0 0 0 0 1
1 0 1 0 0 0
2 1 0 0 0 0
3 0 1 0 0 0
4 0 0 1 0 0
5 0 0 1 0 0
6 0 0 0 0 1
7 0 0 0 1 0
8 0 0 0 1 0
9 0 0 0 1 0
 

String Manipulation

Reading notes on "Python for Data Analysis".

Chapter 7, Section 4: String Manipulation

All of the data used here can be downloaded from the author's GitHub.

%pylab inline
import pandas as pd
from pandas import Series, DataFrame
Populating the interactive namespace from numpy and matplotlib

Python has simple, easy-to-use string and text processing facilities. Most text operations are built into string objects as methods, and regular expressions handle the more complex cases. pandas strengthens this further: string and regular-expression methods can be applied concisely to whole arrays of data, and the annoyance of missing data is handled gracefully.

String Object Methods

For most string munging, the built-in methods are sufficient.

Python's main string methods include (a few that the transcript below doesn't demonstrate are shown right after this list):

  • count
  • endswith, startswith
  • join
  • index
  • find
  • rfind
  • replace
  • strip, rstrip, lstrip
  • split
  • lower, upper
  • ljust, rjust
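
A few of the listed methods don't come up in the transcript below; for quick reference (return values shown in the comments):

val = 'a,b,  guido'
val.startswith('a')    # True
val.rfind(',')         # 3, the index of the last comma
val.upper()            # 'A,B,  GUIDO'
val.ljust(15, '*')     # 'a,b,  guido****'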
# split returns a list
val = 'a,b,  guido'
val.split(',')
Out[3]:
['a', 'b', '  guido']
# Strip the whitespace
pieces = [x.strip() for x in val.split(',')]
pieces
Out[4]:
['a', 'b', 'guido']
# Concatenate with +; note the tuple unpacking in the assignment
first, second, third = pieces
first + '::' + second + '::' + third
Out[5]:
'a::b::guido'
# That isn't very practical; a faster, more Pythonic style is str.join
'::'.join(pieces)
Out[6]:
'a::b::guido'
# Locating substrings: in, index, and find are the most common
'guido' in val
Out[7]:
True
val.index(',')
Out[8]:
1
val.find(':')
Out[9]:
-1
# index raises an exception when the substring is not found:
# val.index(':')
# count returns the number of occurrences
val.count(',')
Out[13]:
2
# Replace
val.replace(',', '::')
Out[14]:
'a::b::  guido'
# Delete, by replacing with the empty string
val.replace(',', '')
Out[15]:
'ab  guido'

Regular Expressions

Regular expressions (regex) provide a flexible way to search for and match string patterns in text. Python uses the re module.

The re module's functions fall into three categories: pattern matching, substitution, and splitting.

For Python's built-in regular expressions (the re module), see AstralWind's summary.

animalize also covers the more powerful third-party regex module.

The main re methods are:

  • findall, finditer
  • match
  • search
  • split
  • sub, subn
import re
text = "foo    bar\t baz  \tqux"

# The regex \s+ (one or more whitespace characters) is first compiled, and then split is called on the text
re.split('\s+', text)
Out[44]:
['foo', 'bar', 'baz', 'qux']
# Equivalent to the above
# If you will apply the same regex to many strings, compile it yourself first to save CPU time
regex = re.compile('\s+')
regex.split(text)
Out[45]:
['foo', 'bar', 'baz', 'qux']
# Find all substrings matching the regex (\s+)
regex.findall(text)
Out[46]:
['    ', '\t ', '  \t']
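findall returns the matched substrings themselves; finditer, listed above but not used in this transcript, instead yields match objects that carry position information. A quick sketch reusing the regex and text from the cells above:

# finditer yields one match object per run of whitespace
for m in regex.finditer(text):
    print(m.start(), m.end())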
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# The r prefix marks a raw string, so backslashes are not treated as escape characters
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
# findall returns all matches found in the string
regex.findall(text)
Out[41]:
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
# search returns only the first match
# The returned match object can only tell us the start and end position of the pattern in the original string
m = regex.search(text)
m
Out[51]:
<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
text[m.start():m.end()]
Out[49]:
'dave@google.com'
# match is stricter: it only matches at the very beginning of the string (here it returns None)
regex.match(text)
# sub replaces matches of the pattern with the given string and returns the new string
regex.sub('REDACTED', text)
Out[56]:
'Dave REDACTED\nSteve REDACTED\nRob REDACTED\nRyan REDACTED\n'
# To segment the matched pattern, wrap each part to extract in parentheses
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups() # returns a tuple of the groups
Out[60]:
('wesm', 'bright', 'net')
regex.findall(text) # returns a list of tuples
Out[61]:
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)) # returns the substituted string; \1, \2, \3 reference the groups
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com

regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()   # returns a simple dict of the named groups
Out[65]:
{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

Vectorized String Functions in pandas

These apply string methods or regular expressions to a whole series of data. Commonly used methods include:

  • cat
  • contains
  • count
  • endswith, startswith
  • findall
  • get
  • join
  • len
  • lower, upper
  • match
  • pad
  • center
  • repeat
  • replace
  • slice
  • split
  • strip, rstrip, lstrip

All string and regex methods can be applied to each value via data.map() (passing a lambda or other function), but this fails on NA values. Series, however, has string methods that skip NA values, accessible through its str attribute.

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
data
Out[69]:
Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object
data.isnull()
Out[70]:
Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool
data.str.contains('gmail')
Out[71]:
Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object
pattern
Out[72]:
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)
Out[73]:
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
/Users/holbrook/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:1: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
  if __name__ == '__main__':
Out[75]:
Dave     (dave, google, com)
Rob        (rob, gmail, com)
Steve    (steve, gmail, com)
Wes                      NaN
dtype: object
# There are two ways to do vectorized element retrieval: use str.get, or index into the str attribute
matches.str.get(1)
Out[77]:
Dave     google
Rob       gmail
Steve     gmail
Wes         NaN
dtype: object
matches.str[0]
Out[78]:
Dave      dave
Rob        rob
Steve    steve
Wes        NaN
dtype: object
# Slicing the strings
data.str[:5]
Out[79]:
Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object
 

Example: USDA Food Database

Reading notes on "Python for Data Analysis".

Chapter 7, Section 5: an example using the USDA food database

All of the data used here can be downloaded from the author's GitHub.

%pylab inline
import pandas as pd
from pandas import Series, DataFrame
import json
Populating the interactive namespace from numpy and matplotlib

Loading the Data

{ "id": 21441, "description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY, Wing, meat and skin with breading", "tags": ["KFC"], "manufacturer": "Kentucky Fried Chicken", "group": "Fast Foods", "portions": [ { "amount": 1, "unit": "wing, with skin", "grams": 68.0 }, ... ], "nutrients": [ { "value": 20.8, "units": "g", "description": "Protein", "group": "Composition" }, ... ] }
import json
db = json.load(open('data/ch07/foods-2011-10-03.json'))
# db is a list; each entry is a dict containing all the data for one food
len(db)
Out[5]:
6636
db[0].keys()
Out[3]:
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
# The value of 'nutrients' is a list of dicts about the food's nutrients, and it is long...
db[0]['nutrients'][0]
Out[6]:
{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}

Preparing the Data

# Turn the nutrients of one food into a DataFrame
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
Out[10]:
description group units value
0 Protein Composition g 25.18
1 Total lipid (fat) Composition g 29.20
2 Carbohydrate, by difference Composition g 3.06
3 Ash Other g 3.28
4 Energy Energy kcal 376.00
5 Water Composition g 39.28
6 Energy Energy kJ 1573.00
info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
info[:5]
Out[12]:
description group id manufacturer
0 Cheese, caraway Dairy and Egg Products 1008
1 Cheese, cheddar Dairy and Egg Products 1009
2 Cheese, edam Dairy and Egg Products 1018
3 Cheese, feta Dairy and Egg Products 1019
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
info.head()
Out[15]:
description group id manufacturer
0 Cheese, caraway Dairy and Egg Products 1008
1 Cheese, cheddar Dairy and Egg Products 1009
2 Cheese, edam Dairy and Egg Products 1018
3 Cheese, feta Dairy and Egg Products 1019
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
# Look at the distribution of food groups
pd.value_counts(info.group)[:10]
Out[14]:
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Fast Foods                           365
Legumes and Legume Products          365
Lamb, Veal, and Game Products        345
Sweets                               341
Pork Products                        328
Fruits and Fruit Juices              328
Name: group, dtype: int64

Assembling All the Nutrients into One Big Table

# Concatenate the list of DataFrames, like R's rbind: the rows are aligned and stacked together

nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)
nutrients.head()
Out[18]:
description group units value id
0 Protein Composition g 25.18 1008
1 Total lipid (fat) Composition g 29.20 1008
2 Carbohydrate, by difference Composition g 3.06 1008
3 Ash Other g 3.28 1008
4 Energy Energy kcal 376.00 1008

Dropping Duplicates

nutrients.duplicated().sum()
Out[19]:
14179
nutrients = nutrients.drop_duplicates()

Tidying Up

# nutrients and info share some column names, so rename the info columns
col_mapping = {'description' : 'food',
               'group'       : 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info.head()
Out[23]:
food fgroup id manufacturer
0 Cheese, caraway Dairy and Egg Products 1008
1 Cheese, cheddar Dairy and Egg Products 1009
2 Cheese, edam Dairy and Egg Products 1018
3 Cheese, feta Dairy and Egg Products 1019
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
col_mapping = {'description' : 'nutrient',
               'group' : 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients.head()
Out[25]:
nutrient nutgroup units value id
0 Protein Composition g 25.18 1008
1 Total lipid (fat) Composition g 29.20 1008
2 Carbohydrate, by difference Composition g 3.06 1008
3 Ash Other g 3.28 1008
4 Energy Energy kcal 376.00 1008

Data Transformation

ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata.head()
Out[29]:
nutrient nutgroup units value id food fgroup manufacturer
0 Protein Composition g 25.18 1008 Cheese, caraway Dairy and Egg Products
1 Total lipid (fat) Composition g 29.20 1008 Cheese, caraway Dairy and Egg Products
2 Carbohydrate, by difference Composition g 3.06 1008 Cheese, caraway Dairy and Egg Products
3 Ash Other g 3.28 1008 Cheese, caraway Dairy and Egg Products
4 Energy Energy kcal 376.00 1008 Cheese, caraway Dairy and Egg Products
ndata.ix[30000]
Out[30]:
nutrient                                       Glycine
nutgroup                                   Amino Acids
units                                                g
value                                             0.04
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer                                          
Name: 30000, dtype: object

Modeling and Computation

# Median value of each nutrient by food group; then plot the zinc medians as a bar chart
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].sort_values().plot(kind='barh')
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x115eed908>
# Find which food is most dense in each nutrient
by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

# make the food names a little shorter
max_foods.food = max_foods.food.str[:50]
max_foods.ix['Amino Acids']['food']
Out[34]:
nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object