🙆♀️
データサイエンス100本ノック(構造化データ加工編)をRで解く 31 - 40
R-031
# Sample standard deviation of `amount` for each `store_cd`, sorted from
# largest to smallest; only the first 10 rows are shown.
df_receipt %>%
  group_by(store_cd) %>%
  summarise(sd_amount = sd(amount)) %>%
  arrange(desc(sd_amount)) %>%
  head(n = 10)
R-032
# Quartiles (0%, 25%, 50%, 75%, 100%) of `amount` over all receipt rows.
quantile(df_receipt[["amount"]], probs = seq(0, 1, by = 0.25))
R-033
# Mean `amount` per `store_cd`, keeping only stores whose mean is at least 330.
df_receipt %>%
  group_by(store_cd) %>%
  summarise(avg_amount = mean(amount)) %>%
  dplyr::filter(avg_amount >= 330)
R-034
# Mean `amount` per `customer_id`, after dropping rows whose `customer_id`
# starts with "Z".
df_receipt %>%
  dplyr::filter(str_detect(customer_id, "^Z.*", negate = TRUE)) %>%
  group_by(customer_id) %>%
  summarise(avg_amount = mean(amount))
R-035
# Variant 1: the threshold is the mean of the per-customer means.
# Rows whose `customer_id` starts with "Z" are dropped before aggregating.
df_receipt %>%
  dplyr::filter(str_detect(customer_id, "^Z.*", negate = TRUE)) %>%
  group_by(customer_id) %>%
  summarise(avg_amount = mean(amount)) %>%
  dplyr::filter(avg_amount >= mean(avg_amount)) %>%
  head(n = 10)

# Variant 2: the threshold is the mean of `amount` over every row of
# `df_receipt`. Note that the threshold is taken from the unfiltered table,
# so rows with a "Z" `customer_id` still contribute to it.
df_receipt %>%
  dplyr::filter(str_detect(customer_id, "^Z.*", negate = TRUE)) %>%
  group_by(customer_id) %>%
  summarise(avg_amount = mean(amount)) %>%
  dplyr::filter(avg_amount >= mean(df_receipt[["amount"]])) %>%
  head(n = 10)
R-036
# Attach the columns of `df_store` to each receipt row by matching `store_cd`
# (receipt rows without a matching store are dropped); show the first 10 rows.
inner_join(df_receipt, df_store, by = "store_cd") %>%
  head(n = 10)
R-037
# Attach the columns of `df_category` to each product by matching all three
# category code columns; show the first 10 rows.
inner_join(
  df_product,
  df_category,
  by = c("category_major_cd", "category_medium_cd", "category_small_cd")
) %>%
  head(n = 10)
R-038
# Total `amount` per `customer_id`, computed from the receipt rows.
total_amount_by_customer <- df_receipt %>%
  group_by(customer_id) %>%
  summarise(total_amount = sum(amount))

# Customers whose `gender` is "女性" and whose ID begins with a character
# other than "Z", together with their total amount. The left join keeps
# customers that have no receipt rows; their missing total is replaced by 0.
df_customer %>%
  dplyr::filter(
    gender == "女性",
    str_detect(customer_id, regex("^[^Z].*"))
  ) %>%
  left_join(total_amount_by_customer, by = "customer_id") %>%
  mutate(total_amount = replace_na(total_amount, 0)) %>%
  select(customer_id, gender, total_amount) %>%
  head(n = 10)
R-039
# Per customer (IDs starting with "Z" removed): the number of distinct
# `sales_ymd` values and the summed `amount`. Both are ranked in descending
# order, with ties broken by row order, and a customer is kept when it is in
# the top 20 of either ranking.
df_receipt %>%
  dplyr::filter(str_starts(customer_id, "Z", negate = TRUE)) %>%
  group_by(customer_id) %>%
  summarise(
    days = n_distinct(sales_ymd),
    total_amount = sum(amount)
  ) %>%
  mutate(
    rank_days = rank(-days, ties.method = "first"),
    rank_amount = rank(-total_amount, ties.method = "first")
  ) %>%
  dplyr::filter(rank_days <= 20 | rank_amount <= 20)
R-040
# Number of rows in the Cartesian product of `df_store` and `df_product`.
# `nrow()` returns integers, and integer multiplication yields NA (with a
# warning) once the result exceeds .Machine$integer.max, so both counts are
# converted to double before multiplying.
as.numeric(nrow(df_store)) * as.numeric(nrow(df_product))
Discussion