前回の続き。
select-from-where.hatenablog.com
挑戦する(したかった)コンペはこちら。
Recruit Restaurant Visitor Forecasting | Kaggle
最新のソースはGitHubで公開、随時更新していく。
github.com
今回は与えられたデータの確認2回目。
与えられたcsvデータどうしを結合して分析していきます。
8.Airリザーブの訪問データに予約データを結合
- キャンセル数はこのデータから把握することができない
air_visit_df = pd.read_csv('Data/csv/air_visit_data.csv')
air_visit_df.tail(3)
|
air_store_id |
visit_date |
visitors |
252105 |
air_24e8414b9b07decb |
2017-04-20 |
7 |
252106 |
air_24e8414b9b07decb |
2017-04-21 |
8 |
252107 |
air_24e8414b9b07decb |
2017-04-22 |
5 |
air_reserve_df = pd.read_csv('Data/csv/air_reserve.csv')
air_reserve_df['visit_datetime'] = air_reserve_df['visit_datetime'].str[:10]
air_reserve_df = air_reserve_df.rename(columns={'visit_datetime': 'visit_date'}).drop('reserve_datetime',axis=1)
air_reserve_df = air_reserve_df.groupby(['air_store_id','visit_date'])['reserve_visitors'].sum().reset_index()
air_reserve_df.tail(3)
|
air_store_id |
visit_date |
reserve_visitors |
29827 |
air_fea5dc9594450608 |
2017-04-28 |
3 |
29828 |
air_fea5dc9594450608 |
2017-05-20 |
6 |
29829 |
air_fee8dcf4d619598e |
2017-01-09 |
5 |
sample = pd.read_csv('Data/csv/sample_submission.csv')
sample_id = DataFrame(sample, columns=['air_store_id'])
sample_id['air_store_id'] = sample.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_id = sample_id.drop_duplicates().reset_index(drop=True)
sample_id.tail(3)
|
air_store_id |
818 |
air_fef9ccb3ba0da2f7 |
819 |
air_ffcc2d5087e1b476 |
820 |
air_fff68b929994bfbd |
air_df = pd.merge(air_visit_df,sample_id,on=['air_store_id'],how='inner')
air_df.tail(3)
|
air_store_id |
visit_date |
visitors |
250465 |
air_24e8414b9b07decb |
2017-04-20 |
7 |
250466 |
air_24e8414b9b07decb |
2017-04-21 |
8 |
250467 |
air_24e8414b9b07decb |
2017-04-22 |
5 |
air_df = pd.merge(air_visit_df,air_reserve_df,on=['air_store_id','visit_date'],how='left')
air_df.tail(3)
|
air_store_id |
visit_date |
visitors |
reserve_visitors |
252105 |
air_24e8414b9b07decb |
2017-04-20 |
7 |
NaN |
252106 |
air_24e8414b9b07decb |
2017-04-21 |
8 |
NaN |
252107 |
air_24e8414b9b07decb |
2017-04-22 |
5 |
NaN |
air_df.describe(include='all')
|
air_store_id |
visit_date |
visitors |
reserve_visitors |
count |
252108 |
252108 |
252108.000000 |
28064.000000 |
unique |
829 |
478 |
NaN |
NaN |
top |
air_5c817ef28f236bdf |
2017-03-17 |
NaN |
NaN |
freq |
477 |
799 |
NaN |
NaN |
mean |
NaN |
NaN |
20.973761 |
13.751283 |
std |
NaN |
NaN |
16.757007 |
17.284799 |
min |
NaN |
NaN |
1.000000 |
1.000000 |
25% |
NaN |
NaN |
9.000000 |
5.000000 |
50% |
NaN |
NaN |
17.000000 |
10.000000 |
75% |
NaN |
NaN |
29.000000 |
18.000000 |
max |
NaN |
NaN |
877.000000 |
1633.000000 |
8–1.日次の訪問予定者数、訪問者数、予約訪問者率(訪問予定者数 / 訪問者数)
air_df_daily = air_df.copy()
air_df_daily['visit_date'] = air_df_daily.visit_date.map(lambda x: datetime.strptime(x,'%Y-%m-%d'))
air_df_daily = air_df_daily.drop('air_store_id',axis=1)
air_df_daily = air_df_daily.groupby(['visit_date'])['reserve_visitors','visitors'].sum().reset_index()
air_df_daily.tail(3)
|
visit_date |
reserve_visitors |
visitors |
475 |
2017-04-20 |
1488.0 |
13990 |
476 |
2017-04-21 |
3413.0 |
19624 |
477 |
2017-04-22 |
2861.0 |
21245 |
air_df_daily['reserve_rate'] = air_df_daily['reserve_visitors'] / air_df_daily['visitors'] * 100
air_df_daily[air_df_daily['reserve_rate'] > 0].head(3)
|
visit_date |
reserve_visitors |
visitors |
reserve_rate |
0 |
2016-01-01 |
17.0 |
1033 |
1.645692 |
1 |
2016-01-02 |
128.0 |
1764 |
7.256236 |
2 |
2016-01-03 |
53.0 |
2368 |
2.238176 |
air_df_daily.describe(include='all')
|
visit_date |
reserve_visitors |
visitors |
reserve_rate |
count |
478 |
392.000000 |
478.000000 |
392.000000 |
unique |
478 |
NaN |
NaN |
NaN |
top |
2016-09-08 00:00:00 |
NaN |
NaN |
NaN |
freq |
1 |
NaN |
NaN |
NaN |
first |
2016-01-01 00:00:00 |
NaN |
NaN |
NaN |
last |
2017-04-22 00:00:00 |
NaN |
NaN |
NaN |
mean |
NaN |
984.479592 |
11062.035565 |
8.140861 |
std |
NaN |
947.290374 |
5079.505034 |
4.490445 |
min |
NaN |
2.000000 |
1033.000000 |
0.010993 |
25% |
NaN |
274.000000 |
5894.500000 |
5.103207 |
50% |
NaN |
672.500000 |
11429.500000 |
8.117165 |
75% |
NaN |
1341.000000 |
14286.000000 |
10.751376 |
max |
NaN |
4619.000000 |
23982.000000 |
21.465458 |
line_height1 = air_df_daily.reserve_visitors.as_matrix()
line_height2 = air_df_daily.visitors.as_matrix()
line_height3 = air_df_daily.reserve_rate.as_matrix()
left = air_df_daily.visit_date
fig, ax1 = plt.subplots(figsize=(18, 10))
p1 = plt.plot(left, line_height1, color="deepskyblue")
p2 = plt.plot(left, line_height2, color="navy")
ax1.set_ylabel('Num of people')
ax2 = ax1.twinx()
p3 = ax2.plot(left, line_height3, color="red")
ax2.set_ylabel('Reserve Rate')
ax2.set_ylim(0, 50)
plt.legend((p1[0], p2[0], p3[0]), ("reserve", "visit", "reserve_rate"), loc='upper right', bbox_to_anchor=(0.7, 0.5, 0.5, 0.5))
plt.title("daily_visitors")
plt.xlabel("visit_date")
plt.grid(True)
8–2.気づき
訪問データありに限ると、予約データが欠損している期間がある
- 2016年8,9,10月は欠損しているものと思われる
期間によって、取り扱いを変える必要がありそう
- 下記期間でトレンドの変化が見られる
- 2016年1月〜7月
- 2016年8月〜10月
- 2016年11月〜12月
- 2017年1月〜4月
- こちらもキャンセル数はこのデータから把握することができない
store_id_df = pd.read_csv('Data/csv/store_id_relation.csv')
air_df = pd.merge(air_df,store_id_df,on=['air_store_id'],how='left')
air_df.tail(3)
|
air_store_id |
visit_date |
visitors |
reserve_visitors |
hpg_store_id |
252105 |
air_24e8414b9b07decb |
2017-04-20 |
7 |
NaN |
NaN |
252106 |
air_24e8414b9b07decb |
2017-04-21 |
8 |
NaN |
NaN |
252107 |
air_24e8414b9b07decb |
2017-04-22 |
5 |
NaN |
NaN |
hpg_reserve_df = pd.read_csv('Data/csv/hpg_reserve.csv')
hpg_reserve_df['visit_datetime'] = hpg_reserve_df['visit_datetime'].str[:10]
hpg_reserve_df = hpg_reserve_df.rename(columns={'visit_datetime': 'visit_date'}).drop('reserve_datetime',axis=1)
hpg_reserve_df = hpg_reserve_df.groupby(['hpg_store_id','visit_date'])['reserve_visitors'].sum().reset_index()
hpg_reserve_df.tail(3)
|
hpg_store_id |
visit_date |
reserve_visitors |
1355050 |
hpg_fffc097dce87af3e |
2017-03-27 |
5 |
1355051 |
hpg_fffc097dce87af3e |
2017-04-21 |
3 |
1355052 |
hpg_fffc097dce87af3e |
2017-04-23 |
8 |
air_hpg_df = pd.merge(air_df,hpg_reserve_df,on=['hpg_store_id','visit_date'],how='left')
air_hpg_df.tail(3)
|
air_store_id |
visit_date |
visitors |
reserve_visitors_x |
hpg_store_id |
reserve_visitors_y |
252105 |
air_24e8414b9b07decb |
2017-04-20 |
7 |
NaN |
NaN |
NaN |
252106 |
air_24e8414b9b07decb |
2017-04-21 |
8 |
NaN |
NaN |
NaN |
252107 |
air_24e8414b9b07decb |
2017-04-22 |
5 |
NaN |
NaN |
NaN |
air_hpg_df = DataFrame(air_hpg_df, columns=['air_store_id','hpg_store_id','visit_date','visitors','reserve_visitors','reserve_visitors_x','reserve_visitors_y'])
air_hpg_df['reserve_visitors'] = air_hpg_df['reserve_visitors_x'].add(air_hpg_df['reserve_visitors_y'],fill_value=0)
air_hpg_df = air_hpg_df.rename(columns={'reserve_visitors_x': 'reserve_visitors_air', 'reserve_visitors_y': 'reserve_visitors_hpg'})
air_hpg_df.tail(3)
|
air_store_id |
hpg_store_id |
visit_date |
visitors |
reserve_visitors |
reserve_visitors_air |
reserve_visitors_hpg |
252105 |
air_24e8414b9b07decb |
NaN |
2017-04-20 |
7 |
NaN |
NaN |
NaN |
252106 |
air_24e8414b9b07decb |
NaN |
2017-04-21 |
8 |
NaN |
NaN |
NaN |
252107 |
air_24e8414b9b07decb |
NaN |
2017-04-22 |
5 |
NaN |
NaN |
NaN |
9–1.月次の訪問予定者数、訪問者数、予約訪問者率(訪問予定者数 / 訪問者数)
air_hpg_monthly = air_hpg_df.loc[:,['visit_date','visitors','reserve_visitors','reserve_visitors_air','reserve_visitors_hpg']]
air_hpg_monthly['visit_date'] = air_hpg_monthly.visit_date.str[:7]
air_hpg_monthly = air_hpg_monthly.rename(columns={'visit_date': 'visit_month'})
air_hpg_monthly = air_hpg_monthly.groupby(['visit_month'])['visitors','reserve_visitors','reserve_visitors_air','reserve_visitors_hpg'].sum().reset_index()
air_hpg_monthly['reserve_rate'] = air_hpg_monthly['reserve_visitors'] / air_hpg_monthly['visitors'] * 100
air_hpg_monthly.tail(3)
|
visit_month |
visitors |
reserve_visitors |
reserve_visitors_air |
reserve_visitors_hpg |
reserve_rate |
13 |
2017-02 |
401462 |
55158.0 |
47963.0 |
7195.0 |
13.739283 |
14 |
2017-03 |
495487 |
72358.0 |
61065.0 |
11293.0 |
14.603410 |
15 |
2017-04 |
338236 |
43843.0 |
37493.0 |
6350.0 |
12.962251 |
air_hpg_monthly.describe(include='all')
|
visit_month |
visitors |
reserve_visitors |
reserve_visitors_air |
reserve_visitors_hpg |
reserve_rate |
count |
16 |
16.00000 |
16.000000 |
16.000000 |
16.000000 |
16.000000 |
unique |
16 |
NaN |
NaN |
NaN |
NaN |
NaN |
top |
2017-01 |
NaN |
NaN |
NaN |
NaN |
NaN |
freq |
1 |
NaN |
NaN |
NaN |
NaN |
NaN |
mean |
NaN |
330478.31250 |
29881.437500 |
24119.750000 |
5761.687500 |
8.644849 |
std |
NaN |
131525.06374 |
27187.080937 |
23720.900285 |
4189.623543 |
5.328091 |
min |
NaN |
152924.00000 |
5569.000000 |
2.000000 |
1418.000000 |
1.363948 |
25% |
NaN |
181608.50000 |
10555.750000 |
6841.750000 |
2355.750000 |
3.665765 |
50% |
NaN |
394155.00000 |
14780.000000 |
12612.000000 |
5829.000000 |
8.625267 |
75% |
NaN |
420050.50000 |
48613.750000 |
42463.000000 |
6528.000000 |
13.046896 |
max |
NaN |
497617.00000 |
93584.000000 |
75556.000000 |
18028.000000 |
18.806431 |
height1 = air_hpg_monthly.reserve_visitors_air.as_matrix()
height2 = air_hpg_monthly.reserve_visitors_hpg.as_matrix()
height3 = air_hpg_monthly.visitors.as_matrix()
line_height = air_hpg_monthly.reserve_rate.as_matrix()
w1 = 0.4
w2 = w1 / 2
left1 = np.arange(len(height1))
left2 = np.arange(len(height3))
fig, ax1 = plt.subplots(figsize=(14, 8))
p1 = ax1.bar(left1, height1, align='center', width=w1, tick_label=air_hpg_monthly.visit_month, color='deepskyblue')
p2 = plt.bar(left1, height2, align="center", width=w1, bottom=height1, color='pink')
p3 = ax1.bar(left2 + w1, height3, align='center', width=w1, tick_label=air_hpg_monthly.visit_month, color='navy')
ax1.set_ylabel('Num of people')
ax2 = ax1.twinx()
p4 = ax2.plot(left1 + w2, line_height, color="red")
ax2.set_ylabel('Reserve Rate')
ax2.set_ylim(0, np.nanmax(line_height)*1.75)
plt.legend((p1[0], p2[0], p3[0], p4[0]), ("air_reserve", "hpg_reserve", "visit", "reserve_rate"), loc='upper right', bbox_to_anchor=(0.7, 0.5, 0.5, 0.5))
9–2.日次の訪問予定者数、訪問者数、予約訪問者率(訪問予定者数 / 訪問者数)
air_hpg_df_daily = air_hpg_df.copy()
air_hpg_df_daily['visit_date'] = air_hpg_df_daily.visit_date.map(lambda x: datetime.strptime(x,'%Y-%m-%d'))
air_hpg_df_daily = air_hpg_df_daily.drop('air_store_id',axis=1)
air_hpg_df_daily = air_hpg_df_daily.groupby(['visit_date'])['reserve_visitors','visitors'].sum().reset_index()
air_hpg_df_daily.tail(3)
|
visit_date |
reserve_visitors |
visitors |
475 |
2017-04-20 |
1650.0 |
13990 |
476 |
2017-04-21 |
4262.0 |
19624 |
477 |
2017-04-22 |
3427.0 |
21245 |
air_hpg_df_daily['reserve_rate'] = air_hpg_df_daily['reserve_visitors'] / air_hpg_df_daily['visitors'] * 100
air_hpg_df_daily[air_hpg_df_daily['reserve_rate'] > 0].head(3)
|
visit_date |
reserve_visitors |
visitors |
reserve_rate |
0 |
2016-01-01 |
21.0 |
1033 |
2.032914 |
1 |
2016-01-02 |
139.0 |
1764 |
7.879819 |
2 |
2016-01-03 |
75.0 |
2368 |
3.167230 |
air_hpg_df_daily.describe(include='all')
|
visit_date |
reserve_visitors |
visitors |
reserve_rate |
count |
478 |
478.000000 |
478.000000 |
478.000000 |
unique |
478 |
NaN |
NaN |
NaN |
top |
2016-09-08 00:00:00 |
NaN |
NaN |
NaN |
freq |
1 |
NaN |
NaN |
NaN |
first |
2016-01-01 00:00:00 |
NaN |
NaN |
NaN |
last |
2017-04-22 00:00:00 |
NaN |
NaN |
NaN |
mean |
NaN |
1000.215481 |
11062.035565 |
8.175872 |
std |
NaN |
1111.148722 |
5079.505034 |
5.752625 |
min |
NaN |
21.000000 |
1033.000000 |
0.477484 |
25% |
NaN |
238.250000 |
5894.500000 |
2.828093 |
50% |
NaN |
513.500000 |
11429.500000 |
8.044074 |
75% |
NaN |
1344.750000 |
14286.000000 |
11.572778 |
max |
NaN |
6222.000000 |
23982.000000 |
29.372503 |
line_height1 = air_hpg_df_daily.reserve_visitors.as_matrix()
line_height2 = air_hpg_df_daily.visitors.as_matrix()
line_height3 = air_hpg_df_daily.reserve_rate.as_matrix()
left = air_hpg_df_daily.visit_date
fig, ax1 = plt.subplots(figsize=(18, 10))
p1 = plt.plot(left, line_height1, color="deepskyblue")
p2 = plt.plot(left, line_height2, color="navy")
ax1.set_ylabel('Num of people')
ax2 = ax1.twinx()
p3 = ax2.plot(left, line_height3, color="red")
ax2.set_ylabel('Reserve Rate')
ax2.set_ylim(0, 50)
plt.legend((p1[0], p2[0], p3[0]), ("reserve", "visit", "reserve_rate"), loc='upper right', bbox_to_anchor=(0.7, 0.5, 0.5, 0.5))
plt.title("daily_visitors")
plt.xlabel("visit_date")
plt.grid(True)
9–3.気づき
ホットペッパーの訪問データを足すことによる、発見したトレンドへの影響はほとんどない
10.レストラン情報を結合
10–1.Airリザーブのレストラン情報を結合
air_store_df = pd.read_csv('Data/csv/air_store_info.csv')
air_store_df.head(3)
|
air_store_id |
air_genre_name |
air_area_name |
latitude |
longitude |
0 |
air_0f0cdeee6c9bf3d7 |
Italian/French |
Hyōgo-ken Kōbe-shi Kumoidōri |
34.695124 |
135.197852 |
1 |
air_7cc17a324ae5c7dc |
Italian/French |
Hyōgo-ken Kōbe-shi Kumoidōri |
34.695124 |
135.197852 |
2 |
air_fee8dcf4d619598e |
Italian/French |
Hyōgo-ken Kōbe-shi Kumoidōri |
34.695124 |
135.197852 |
airi_hpg_df = pd.merge(air_hpg_df,air_store_df,on=['air_store_id'],how='left')
airi_hpg_df = airi_hpg_df.rename(columns={'latitude': 'latitude_air', 'longitude': 'longitude_air'})
airi_hpg_df.tail(3)
|
air_store_id |
hpg_store_id |
visit_date |
visitors |
reserve_visitors |
reserve_visitors_air |
reserve_visitors_hpg |
air_genre_name |
air_area_name |
latitude_air |
longitude_air |
252105 |
air_24e8414b9b07decb |
NaN |
2017-04-20 |
7 |
NaN |
NaN |
NaN |
Other |
Tōkyō-to Shibuya-ku Higashi |
35.653217 |
139.711036 |
252106 |
air_24e8414b9b07decb |
NaN |
2017-04-21 |
8 |
NaN |
NaN |
NaN |
Other |
Tōkyō-to Shibuya-ku Higashi |
35.653217 |
139.711036 |
252107 |
air_24e8414b9b07decb |
NaN |
2017-04-22 |
5 |
NaN |
NaN |
NaN |
Other |
Tōkyō-to Shibuya-ku Higashi |
35.653217 |
139.711036 |
10–2.ホットペッパーグルメのレストラン情報を結合
hpg_store_df = pd.read_csv('Data/csv/hpg_store_info.csv')
hpg_store_df.head(3)
|
hpg_store_id |
hpg_genre_name |
hpg_area_name |
latitude |
longitude |
0 |
hpg_6622b62385aec8bf |
Japanese style |
Tōkyō-to Setagaya-ku Taishidō |
35.643675 |
139.668221 |
1 |
hpg_e9e068dd49c5fa00 |
Japanese style |
Tōkyō-to Setagaya-ku Taishidō |
35.643675 |
139.668221 |
2 |
hpg_2976f7acb4b3a3bc |
Japanese style |
Tōkyō-to Setagaya-ku Taishidō |
35.643675 |
139.668221 |
airi_hpgi_df = pd.merge(airi_hpg_df,hpg_store_df,on=['hpg_store_id'],how='left')
airi_hpgi_df = airi_hpgi_df.rename(columns={'latitude': 'latitude_hpg', 'longitude': 'longitude_hpg'})
airi_hpgi_df.tail(3)
|
air_store_id |
hpg_store_id |
visit_date |
visitors |
reserve_visitors |
reserve_visitors_air |
reserve_visitors_hpg |
air_genre_name |
air_area_name |
latitude_air |
longitude_air |
hpg_genre_name |
hpg_area_name |
latitude_hpg |
longitude_hpg |
252105 |
air_24e8414b9b07decb |
NaN |
2017-04-20 |
7 |
NaN |
NaN |
NaN |
Other |
Tōkyō-to Shibuya-ku Higashi |
35.653217 |
139.711036 |
NaN |
NaN |
NaN |
NaN |
252106 |
air_24e8414b9b07decb |
NaN |
2017-04-21 |
8 |
NaN |
NaN |
NaN |
Other |
Tōkyō-to Shibuya-ku Higashi |
35.653217 |
139.711036 |
NaN |
NaN |
NaN |
NaN |
252107 |
air_24e8414b9b07decb |
NaN |
2017-04-22 |
5 |
NaN |
NaN |
NaN |
Other |
Tōkyō-to Shibuya-ku Higashi |
35.653217 |
139.711036 |
NaN |
NaN |
NaN |
NaN |
airi_hpgi_df.describe(include='all')
|
air_store_id |
hpg_store_id |
visit_date |
visitors |
reserve_visitors |
reserve_visitors_air |
reserve_visitors_hpg |
air_genre_name |
air_area_name |
latitude_air |
longitude_air |
hpg_genre_name |
hpg_area_name |
latitude_hpg |
longitude_hpg |
count |
252108 |
46019 |
252108 |
252108.000000 |
35698.000000 |
28064.000000 |
13550.000000 |
252108 |
252108 |
252108.000000 |
252108.000000 |
20098 |
20098 |
20098.000000 |
20098.000000 |
unique |
829 |
150 |
478 |
NaN |
NaN |
NaN |
NaN |
14 |
103 |
NaN |
NaN |
16 |
33 |
NaN |
NaN |
top |
air_5c817ef28f236bdf |
hpg_f83413cde79ac5e7 |
2017-03-17 |
NaN |
NaN |
NaN |
NaN |
Izakaya |
Fukuoka-ken Fukuoka-shi Daimyō |
NaN |
NaN |
Japanese style |
Tōkyō-to Shibuya-ku None |
NaN |
NaN |
freq |
477 |
473 |
799 |
NaN |
NaN |
NaN |
NaN |
62052 |
19775 |
NaN |
NaN |
6770 |
1709 |
NaN |
NaN |
mean |
NaN |
NaN |
NaN |
20.973761 |
13.392991 |
13.751283 |
6.803469 |
NaN |
NaN |
35.613121 |
137.357865 |
NaN |
NaN |
36.316416 |
137.161974 |
std |
NaN |
NaN |
NaN |
16.757007 |
17.141707 |
17.284799 |
7.686077 |
NaN |
NaN |
2.044473 |
3.671577 |
NaN |
NaN |
3.009303 |
3.881765 |
min |
NaN |
NaN |
NaN |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
NaN |
NaN |
33.211967 |
130.195555 |
NaN |
NaN |
33.556881 |
130.392801 |
25% |
NaN |
NaN |
NaN |
9.000000 |
4.000000 |
5.000000 |
2.000000 |
NaN |
NaN |
34.692337 |
135.341564 |
NaN |
NaN |
34.669514 |
134.847642 |
50% |
NaN |
NaN |
NaN |
17.000000 |
9.000000 |
10.000000 |
4.000000 |
NaN |
NaN |
35.658068 |
139.670038 |
NaN |
NaN |
35.659214 |
139.060024 |
75% |
NaN |
NaN |
NaN |
29.000000 |
18.000000 |
18.000000 |
8.000000 |
NaN |
NaN |
35.694003 |
139.751599 |
NaN |
NaN |
35.711353 |
139.737998 |
max |
NaN |
NaN |
NaN |
877.000000 |
1633.000000 |
1633.000000 |
157.000000 |
NaN |
NaN |
44.020632 |
144.273398 |
NaN |
NaN |
43.768033 |
142.359664 |
10–3.都道府県ごとの訪問者数
airi_hpgi_df_area = airi_hpgi_df.copy()
airi_hpgi_df_area['air_area_name'] = airi_hpgi_df_area.air_area_name.map(lambda x: ' '.join(x.split(' ')[:1]).translate(str.maketrans('Ōōū', 'Oou')))
airi_hpgi_df_area['visit_date'] = airi_hpgi_df_area.visit_date.map(lambda x: datetime.strptime(x,'%Y-%m-%d'))
airi_hpgi_df_area = airi_hpgi_df_area.groupby(['visit_date','air_area_name'])['visitors'].sum().reset_index()
airi_hpgi_df_area.head(3)
|
visit_date |
air_area_name |
visitors |
0 |
2016-01-01 |
Fukuoka-ken |
272 |
1 |
2016-01-01 |
Hiroshima-ken |
42 |
2 |
2016-01-01 |
Hokkaido |
20 |
line_height1 = airi_hpgi_df_area[airi_hpgi_df_area['air_area_name'] == 'Tokyo-to'].visitors.as_matrix()
line_height2 = airi_hpgi_df_area[airi_hpgi_df_area['air_area_name'] == 'Osaka-fu'].visitors.as_matrix()
line_height3 = airi_hpgi_df_area[airi_hpgi_df_area['air_area_name'] == 'Fukuoka-ken'].visitors.as_matrix()
left = airi_hpgi_df_area.visit_date.drop_duplicates()
fig, (ax1,ax2,ax3) = plt.subplots(nrows=3, figsize=(18,10))
ax1.plot(left, line_height1, color="orange")
ax1.set_title('Tokyo')
ax1.grid(True)
ax2.plot(left, line_height2, color="black")
ax2.set_title('Osaka')
ax2.set_ylabel('Number_of_people')
ax2.grid(True)
ax3.plot(left, line_height3, color="yellow")
ax3.set_title('Fukuoka')
ax3.set_xlabel('visit_date')
ax3.grid(True)
10–4.ジャンルごとの訪問者数
airi_hpgi_df_genre = airi_hpgi_df.copy()
airi_hpgi_df_genre['air_area_name'] = airi_hpgi_df_genre.air_area_name.map(lambda x: ' '.join(x.split(' ')[:1]).translate(str.maketrans('Ōōū', 'Oou')))
airi_hpgi_df_genre['visit_date'] = airi_hpgi_df_genre.visit_date.map(lambda x: datetime.strptime(x,'%Y-%m-%d'))
airi_hpgi_df_genre = airi_hpgi_df_genre.groupby(['visit_date','air_genre_name'])['visitors'].sum().reset_index()
airi_hpgi_df_genre.head(3)
|
visit_date |
air_genre_name |
visitors |
0 |
2016-01-01 |
Bar/Cocktail |
136 |
1 |
2016-01-01 |
Cafe/Sweets |
266 |
2 |
2016-01-01 |
Creative cuisine |
7 |
line_height1 = airi_hpgi_df_genre[airi_hpgi_df_genre['air_genre_name'] == 'Izakaya'].visitors.as_matrix()
line_height2 = airi_hpgi_df_genre[airi_hpgi_df_genre['air_genre_name'] == 'Cafe/Sweets'].visitors.as_matrix()
line_height3 = airi_hpgi_df_genre[airi_hpgi_df_genre['air_genre_name'] == 'Dining bar'].visitors.as_matrix()
left = airi_hpgi_df_genre.visit_date.drop_duplicates()
fig, (ax1,ax2,ax3) = plt.subplots(nrows=3, figsize=(18,10))
ax1.plot(left, line_height1, color="c")
ax1.set_title('Izakaya')
ax1.grid(True)
ax2.plot(left, line_height2, color="r")
ax2.set_title('Cafe/Sweets')
ax2.set_ylabel('Number_of_people')
ax2.grid(True)
ax3.plot(left, line_height3, color="b")
ax3.set_title('Dining bar')
ax3.set_xlabel('visit_date')
ax3.grid(True)
10–5.気づき
エリアやジャンルによって、トレンドに目立った違いは見られない
11.カレンダー情報を結合
calendar_df = pd.read_csv('Data/csv/date_info.csv')
calendar_df.head()
|
calendar_date |
day_of_week |
holiday_flg |
0 |
2016-01-01 |
Friday |
1 |
1 |
2016-01-02 |
Saturday |
1 |
2 |
2016-01-03 |
Sunday |
1 |
3 |
2016-01-04 |
Monday |
0 |
4 |
2016-01-05 |
Tuesday |
0 |
calendar_df = calendar_df.rename(columns={'calendar_date': 'visit_date'})
comp_df = pd.merge(airi_hpgi_df,calendar_df,on=['visit_date'],how='left')
comp_df.head(3)
|
air_store_id |
hpg_store_id |
visit_date |
visitors |
reserve_visitors |
reserve_visitors_air |
reserve_visitors_hpg |
air_genre_name |
air_area_name |
latitude_air |
longitude_air |
hpg_genre_name |
hpg_area_name |
latitude_hpg |
longitude_hpg |
day_of_week |
holiday_flg |
0 |
air_ba937bf13d40fb24 |
NaN |
2016-01-13 |
25 |
NaN |
NaN |
NaN |
Dining bar |
Tōkyō-to Minato-ku Shibakōen |
35.658068 |
139.751599 |
NaN |
NaN |
NaN |
NaN |
Wednesday |
0 |
1 |
air_ba937bf13d40fb24 |
NaN |
2016-01-14 |
32 |
NaN |
NaN |
NaN |
Dining bar |
Tōkyō-to Minato-ku Shibakōen |
35.658068 |
139.751599 |
NaN |
NaN |
NaN |
NaN |
Thursday |
0 |
2 |
air_ba937bf13d40fb24 |
NaN |
2016-01-15 |
29 |
NaN |
NaN |
NaN |
Dining bar |
Tōkyō-to Minato-ku Shibakōen |
35.658068 |
139.751599 |
NaN |
NaN |
NaN |
NaN |
Friday |
0 |