%pylab inline
import pandas as pd
import json

Populating the interactive namespace from numpy and matplotlib

将数据加载到 DataFrame¶

records = [json.loads(line) for line in open('../data/datasets/bitly_usagov/example.txt')]
frame = pd.DataFrame(records)  # 从列表创建 DataFrame

数据准备¶

# 规整时区数据
frame['tz'].fillna('Missing')   #  填充 缺失值 NA
frame[frame['tz']=='']='Unknown' # 填充空值

数据转换¶

# 按值统计个数
tz_counts = frame['tz'].value_counts()

# 从USER_AGENT 数据中获取客户浏览器类型数据
results = pd.Series([x.split()[0] for x in frame.a.dropna()])

# 从USER_AGENT 数据中获取客户操作系统数据
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')

# 将时区按照操作系统分组
by_tz_os = cframe.groupby(['tz',operating_system])

# 按分组计数，然后用 unstack重塑 数据
agg_counts = by_tz_os.size().unstack().fillna(0)

# 以按行加和并排序的数据作为索引
indexer = agg_counts.sum(1).argsort()
#用  take 函数,  通过索引取出时区最多的值
count_subset = agg_counts.take(indexer)[-10:]

建模和计算¶

暂无

结果展示¶

# TOP10 时区
tz_counts[:10].plot(kind='barh')

<matplotlib.axes._subplots.AxesSubplot at 0x10b0ed550>

# TOP10 时区的操作系统数量（表）
count_subset

# TOP10 时区的操作系统数量（图）
count_subset.plot(kind = 'barh',stacked = True)

<matplotlib.axes._subplots.AxesSubplot at 0x10b2956a0>

# TOP10 时区的操作系统比例（图）
normed_subset = count_subset.div(count_subset.sum(1),axis = 0)
normed_subset.plot(kind = 'barh',stacked = True)

<matplotlib.axes._subplots.AxesSubplot at 0x10b4ac860>

%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib

将数据加载到 DataFrame¶

# 定义列名
unames = ['user_id','gender','age','occupation','zip']
rnames = ['user_id','movie_id','rating','timestamp']
mnames = ['movie_id','title','genres']

# 读取数据
users = pd.read_table('../data/datasets/movielens/users.dat',sep='::',header=None,names=unames,engine='python')
ratings = pd.read_table('../data/datasets/movielens/ratings.dat',sep='::',header=None,names=rnames,engine='python')
movies = pd.read_table('../data/datasets/movielens/movies.dat',sep='::',header=None,names=mnames,engine='python')

# 检验数据
users.head()
ratings.head()
movies.head()

合并数据¶

# 先合并 users 到ratings,  再合并movies
# pandas 会根据列名进行连接

data = pd.merge(pd.merge(ratings,users),movies)
data.head()

数据转换¶

#  按性别统计每部电影的得分
# pivot_table, 透视表, 是一个很常用的工具
mean_ratings = data.pivot_table('rating', index='title',columns=['gender'], aggfunc='mean')

# 另一种写法
#mean_ratings = pd.pivot_table(data,values=['rating'], index='title',columns=['gender'], aggfunc='mean')

mean_ratings.head()

数据筛选¶

# 筛选评分数据>250 条的电影
ratings_by_title = data.groupby('title').size() # 按照title对data分组并计数
active_titles = ratings_by_title.index[ratings_by_title >= 251]  # index返回的下标
mean_ratings = mean_ratings.ix[active_titles]  #  进行筛选。注意，这里原表和透视表的排序相同
mean_ratings.head()

/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:4: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.

结果展示¶

#  女性观众最喜欢的 TOP10 电影
top_female_ratings = mean_ratings.sort_values(by='F',ascending=False)[:10]
top_female_ratings

# 分歧最大的电影

mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

#  分歧最大且女性更喜欢的电影
mean_ratings.sort_values(by='diff')[:10]

#  分歧最大且男性更喜欢的电影
mean_ratings.sort_values(by='diff',ascending=False)[:10]

#  分歧最大的电影

rating_std_by_title = data.groupby('title')['rating'].std()
rating_std_by_title.sort_values(ascending=False)[:10]

title
Foreign Student (1994)                                             2.828427
Criminal Lovers (Les Amants Criminels) (1999)                      2.309401
Identification of a Woman (Identificazione di una donna) (1982)    2.121320
Sunset Park (1996)                                                 2.121320
Eaten Alive (1976)                                                 2.121320
Neon Bible, The (1995)                                             2.121320
Talk of Angels (1998)                                              2.121320
Tokyo Fist (1995)                                                  2.121320
Paralyzing Fear: The Story of Polio in America, A (1998)           2.121320
Better Living (1998)                                               2.121320
Name: rating, dtype: float64

%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib

将数据加载到 DataFrame，并合并¶

#pd.read_csv('data/ch02/names/yob1880.txt', names=['name', 'sex', 'births'])

years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    path = '../data/datasets/babynames/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)

    frame['year'] = year
    pieces.append(frame)

#  将多个 DataFrame 连接成一个
names = pd.concat(pieces, ignore_index=True)

聚合¶

# 按年度聚合，区分性别
total_births = names.pivot_table('births', index='year',
                                 columns='sex', aggfunc=sum)
total_births.head(10)

#  画张图看看
total_births.plot(title='Total births by sex and year')

<matplotlib.axes._subplots.AxesSubplot at 0x11b6407b8>

数据转换¶

# 计算名字占所有出生婴儿的比重

def add_prop(group):
    # Integer division floors
    births = group.births.astype(float)

    group['prop'] = births / births.sum()
    return group
names = names.groupby(['year', 'sex']).apply(add_prop)
names.head()

# 检查所有分组的 prop 之和是否近似=1
np.allclose(names.groupby(['year','sex']).prop.sum(),1)

True

数据筛选¶

取 Top1000的名字

def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()

结果展示¶

分析命名趋势¶

boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

# 按名字聚合，做透视表
total_births = top1000.pivot_table('births', index='year', columns='name',
                                   aggfunc=sum)

# 分析5个名字的趋势
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title="Number of births per year")

/usr/local/lib/python3.6/site-packages/pandas/core/reshape/pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)

array([<matplotlib.axes._subplots.AxesSubplot object at 0x10d992978>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x117dbd128>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x117d6e240>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1176ae898>], dtype=object)

评估名字的多样性¶

#  最流行的1000个名字的占比
table = top1000.pivot_table('prop', index='year',
                            columns='sex', aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

/usr/local/lib/python3.6/site-packages/pandas/core/reshape/pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/pivot.py:135: FutureWarning: 'sex' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)

<matplotlib.axes._subplots.AxesSubplot at 0x1183198d0>

# 占总出生人数前50%的不同名字的数量
def get_quantile_count(group, q=0.5):
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1  #numpy 的 searchsorted，查找 q 放在哪个索引合适

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)   # 按 year 分组，每个分组都执行一次
diversity = diversity.unstack('sex')  
diversity.head()

/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  
/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:6: FutureWarning: 'sex' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version

diversity.plot(title="Number of popular names in top 50%")

<matplotlib.axes._subplots.AxesSubplot at 0x116c2f828>

最后一个字母的变革¶

#  获取尾字母，并聚合
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc=sum)
# 选取有代表性的3个年份
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

#  规范化处理，计算比例
letter_prop = subtable / subtable.sum().astype(float)

#  绘图
plt.subplots_adjust(hspace=0.25)
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x119d4bf98>

<matplotlib.figure.Figure at 0x1190fc6a0>

# 进一步分析男孩的 d, n, y 三个字母
letter_prop = table / table.sum().astype(float)

dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
plt.close('all')
dny_ts.plot()

/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:4: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.

<matplotlib.axes._subplots.AxesSubplot at 0x120172ef0>

男孩名变成女孩名¶

# 合并重复的名字
all_names = top1000.name.unique()  

# 找到 lesl 开头的名字
mask = np.array(['lesl' in x.lower() for x in all_names])  
lesley_like = all_names[mask]
lesley_like

array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()

name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

table = filtered.pivot_table('births', index='year',
                             columns='sex', aggfunc='sum')
table = table.div(table.sum(1), axis=0)
table.tail()

/usr/local/lib/python3.6/site-packages/pandas/core/reshape/pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
/usr/local/lib/python3.6/site-packages/pandas/core/reshape/pivot.py:135: FutureWarning: 'sex' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)

plt.close('all')
table.plot(style={'M': 'k-', 'F': 'k--'})

<matplotlib.axes._subplots.AxesSubplot at 0x1203aada0>

	Not Windows	Windows
tz
America/Sao_Paulo	13.0	20.0
Europe/Madrid	16.0	19.0
Pacific/Honolulu	0.0	36.0
Asia/Tokyo	2.0	35.0
Europe/London	43.0	31.0
America/Denver	132.0	59.0
America/Los_Angeles	130.0	252.0
America/Chicago	115.0	285.0
Unknown	521.0	0.0
America/New_York	339.0	912.0

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

	user_id	movie_id	rating	timestamp	gender	age	occupation	zip	title	genres
0	1	1193	5	978300760	F	1	10	48067	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	1193	5	978298413	M	56	16	70072	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	1193	4	978220179	M	25	12	32793	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	1193	4	978199279	M	25	7	22903	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	1193	5	978158471	M	50	1	95350	One Flew Over the Cuckoo's Nest (1975)	Drama

gender	F	M
title
$1,000,000 Duck (1971)	3.375000	2.761905
'Night Mother (1986)	3.388889	3.352941
'Til There Was You (1997)	2.675676	2.733333
'burbs, The (1989)	2.793478	2.962085
...And Justice for All (1979)	3.828571	3.689024

gender	F	M
title
'burbs, The (1989)	2.793478	2.962085
10 Things I Hate About You (1999)	3.646552	3.311966
101 Dalmatians (1961)	3.791444	3.500000
101 Dalmatians (1996)	3.240000	2.911215
12 Angry Men (1957)	4.184397	4.328421

心内求法

利用Python进行数据分析(2)：引言

数据分析的一般步骤

例子：分析网站的用户访问数据

将数据加载到 DataFrame¶

数据准备¶

数据转换¶

建模和计算¶

结果展示¶

例子：电影评分数据分析

将数据加载到 DataFrame¶

合并数据¶

数据转换¶

数据筛选¶

结果展示¶

例子：全美婴儿姓名分析

将数据加载到 DataFrame，并合并¶

聚合¶

数据转换¶

数据筛选¶

结果展示¶

分析命名趋势¶

评估名字的多样性¶

最后一个字母的变革¶

男孩名变成女孩名¶

gender	F	M
title
Close Shave, A (1995)	4.644444	4.473795
Wrong Trousers, The (1993)	4.588235	4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)	4.572650	4.464589
Wallace & Gromit: The Best of Aardman Animation (1996)	4.563107	4.385075
Schindler's List (1993)	4.562602	4.491415
Shawshank Redemption, The (1994)	4.539075	4.560625
Grand Day Out, A (1992)	4.537879	4.293255
To Kill a Mockingbird (1962)	4.536667	4.372611
Creature Comforts (1990)	4.513889	4.272277
Usual Suspects, The (1995)	4.513317	4.518248

gender	F	M	diff
title
Dirty Dancing (1987)	3.790378	2.959596	-0.830782
Jumpin' Jack Flash (1986)	3.254717	2.578358	-0.676359
Grease (1978)	3.975265	3.367041	-0.608224
Little Women (1994)	3.870588	3.321739	-0.548849
Steel Magnolias (1989)	3.901734	3.365957	-0.535777
Anastasia (1997)	3.800000	3.281609	-0.518391
Rocky Horror Picture Show, The (1975)	3.673016	3.160131	-0.512885
Color Purple, The (1985)	4.158192	3.659341	-0.498851
Age of Innocence, The (1993)	3.827068	3.339506	-0.487561
Free Willy (1993)	2.921348	2.438776	-0.482573

gender	F	M	diff
title
Good, The Bad and The Ugly, The (1966)	3.494949	4.221300	0.726351
Kentucky Fried Movie, The (1977)	2.878788	3.555147	0.676359
Dumb & Dumber (1994)	2.697987	3.336595	0.638608
Longest Day, The (1962)	3.411765	4.031447	0.619682
Cable Guy, The (1996)	2.250000	2.863787	0.613787
Evil Dead II (Dead By Dawn) (1987)	3.297297	3.909283	0.611985
Hidden, The (1987)	3.137931	3.745098	0.607167
Rocky III (1982)	2.361702	2.943503	0.581801
Caddyshack (1980)	3.396135	3.969737	0.573602
For a Few Dollars More (1965)	3.409091	3.953795	0.544704

sex	F	M
year
1880	90993	110493
1881	91955	100748
1882	107851	113687
1883	112322	104632
1884	129021	114445
1885	133056	107802
1886	144538	110785
1887	145983	101412
1888	178631	120857
1889	178369	110590

	name	sex	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188

sex	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	108376.0	691247.0	670605.0	977.0	5204.0	28438.0
b	NaN	694.0	450.0	411.0	3912.0	38859.0
c	5.0	49.0	946.0	482.0	15476.0	23125.0
d	6750.0	3729.0	2607.0	22111.0	262112.0	44398.0
e	133569.0	435013.0	313833.0	28655.0	178823.0	129012.0

sex	F	M
year
2006	1.0	NaN
2007	1.0	NaN
2008	1.0	NaN
2009	1.0	NaN
2010	1.0	NaN