🙆‍♀️

データサイエンス100本ノック（構造化データ加工編）をRで解く 31 - 40

2021/11/21に公開

R-031

# Standard deviation of amount per store (sd() is the sample SD, n - 1
# denominator), sorted from largest to smallest; show the first ten stores.
df_receipt %>%
  group_by(store_cd) %>%
  summarise(sd_amount = sd(amount)) %>%
  arrange(desc(sd_amount)) %>%
  head(n = 10)

R-032

# Quartiles of amount: quantile() with default probs returns the
# 0%, 25%, 50%, 75% and 100% points.
quantile(df_receipt[["amount"]])

R-033

# Mean amount per store, keeping only the stores whose mean is 330 or more.
df_receipt %>%
  group_by(store_cd) %>%
  summarise(avg_amount = mean(amount)) %>%
  dplyr::filter(avg_amount >= 330)

R-034

# Exercise P-034 asks for the total amount per customer and then the mean of
# those totals over all customers, with customer IDs starting with "Z"
# excluded. The first summarise() builds one total per customer; the second
# collapses those totals into a single mean.
df_receipt %>%
  dplyr::filter(
    !str_detect(customer_id, "^Z.*")
  ) %>%
  group_by(customer_id) %>%
  summarise(total_amount = sum(amount)) %>%
  summarise(avg_total_amount = mean(total_amount))

R-035

# Exercise P-035: customers whose total amount is at or above the mean of the
# per-customer totals (customer IDs starting with "Z" excluded); show ten.
# Inside filter(), mean(total_amount) is evaluated over the whole summarised
# table, i.e. it is the mean across customers.
df_receipt %>%
  dplyr::filter(
    !str_detect(customer_id, "^Z.*")
  ) %>%
  group_by(customer_id) %>%
  summarise(total_amount = sum(amount)) %>%
  dplyr::filter(
    total_amount >= mean(total_amount)
  ) %>%
  head(10)

# Alternative reading: compare each customer's mean line amount with the mean
# line amount over all receipt lines. The threshold is computed from the same
# filtered rows (customer IDs starting with "Z" removed) as the per-customer
# means, so the excluded receipts no longer shift the cut-off.
member_receipt <- df_receipt %>%
  dplyr::filter(
    !str_detect(customer_id, "^Z.*")
  )

member_receipt %>%
  group_by(customer_id) %>%
  summarise(avg_amount = mean(amount)) %>%
  dplyr::filter(
    avg_amount >= mean(member_receipt$amount)
  ) %>%
  head(10)

R-036

# Exercise P-036: every receipt column plus the store name only. df_store is
# narrowed to the join key and store_name before joining so that no other
# store column ends up in the result.
df_receipt %>%
  inner_join(
    df_store %>% select(store_cd, store_name),
    by = "store_cd"
  ) %>%
  head(10)

R-037

# Exercise P-037: every product column plus the small-category name only.
# df_category is narrowed to the three join keys and category_small_name
# before joining so the major/medium name columns are not carried along.
df_product %>%
  inner_join(
    df_category %>%
      select(
        category_major_cd,
        category_medium_cd,
        category_small_cd,
        category_small_name
      ),
    by = c("category_major_cd", "category_medium_cd", "category_small_cd")
  ) %>%
  head(10)

R-038

# Total amount per customer over all receipt lines.
total_amount_by_customer <- df_receipt %>%
  group_by(customer_id) %>%
  summarise(total_amount = sum(amount))

# Keep customers whose gender is "女性" and whose ID matches "^[^Z].*"
# (first character is not "Z"), attach their totals, and show 0 where a
# customer has no receipt rows (NA after the left join). Ten rows are shown.
df_customer %>%
  dplyr::filter(
    gender == "女性",
    str_detect(customer_id, "^[^Z].*")
  ) %>%
  left_join(total_amount_by_customer, by = "customer_id") %>%
  select(customer_id, gender, total_amount) %>%
  mutate(total_amount = replace_na(total_amount, 0)) %>%
  head(10)

R-039

# For customer IDs not starting with "Z": count distinct sales dates and sum
# the amount per customer, rank both in descending order (ties broken by row
# order), and keep every customer ranked in the top 20 on either measure.
df_receipt %>%
  dplyr::filter(!str_starts(customer_id, "Z")) %>%
  group_by(customer_id) %>%
  summarise(
    days = n_distinct(sales_ymd),
    total_amount = sum(amount)
  ) %>%
  mutate(
    rank_days = rank(desc(days), ties.method = "first"),
    rank_amount = rank(desc(total_amount), ties.method = "first")
  ) %>%
  dplyr::filter(rank_days <= 20 | rank_amount <= 20)

R-040

# Row count of the store x product cross join, obtained from the two table
# sizes without materialising the join.
nrow(df_product) * nrow(df_store)

次の問題：41-50

R-031

R-032

R-033

R-034

R-035

R-036

R-037

R-038

R-039

R-040

Discussion