😽

データサイエンス100本ノック（構造化データ加工編）をRで解く 41 - 50

2021/11/21に公開

R-041

# Total sales per sales_ymd, plus the change from the preceding row once the
# rows are sorted by date. The first row has no predecessor and is given 0.
df_receipt %>%
  group_by(sales_ymd) %>%
  summarise(total_amount = sum(amount), .groups = "drop") %>%
  arrange(sales_ymd) %>%
  mutate(diff_yesterday = c(0, diff(total_amount))) %>%
  head(10)

R-042

# Total sales per sales_ymd, with the totals of the 1, 2 and 3 preceding rows
# (sorted by date) attached as extra columns. Rows without enough history
# get NA in the corresponding lag column.
df_receipt %>%
  group_by(sales_ymd) %>%
  summarise(total_amount = sum(amount), .groups = "drop") %>%
  arrange(sales_ymd) %>%
  mutate(
    lag_1day = dplyr::lag(total_amount, n = 1),
    lag_2day = dplyr::lag(total_amount, n = 2),
    lag_3day = dplyr::lag(total_amount, n = 3)
  ) %>%
  head(10)

# Variant that computes differences instead of attaching the lagged values.
# diff(x, lag = k) is k elements shorter than x, so each column is padded
# with k leading zeros to keep it aligned with the rows.
df_receipt %>%
  group_by(sales_ymd) %>%
  summarise(total_amount = sum(amount), .groups = "drop") %>%
  arrange(sales_ymd) %>%
  mutate(
    diff_1day = c(0, diff(total_amount, lag = 1)),
    diff_2day = c(0, 0, diff(total_amount, lag = 2)),
    diff_3day = c(0, 0, 0, diff(total_amount, lag = 3))
  ) %>%
  head(10)

R-043

#' Map ages to 10-year bracket labels.
#'
#' @param x Numeric vector of ages.
#' @return Character vector the same length as `x` holding one of "00-09",
#'   "10-19", ..., "80-89", "90-". Every age below 10 maps to "00-09" and
#'   every age of 90 or more maps to "90-". A missing age (`NA`) yields
#'   `NA` rather than a bogus label.
get_generation <- function(x) {
  lower_bounds <- seq(10, 90, by = 10)
  labels <- c(
    "00-09", "10-19", "20-29", "30-39", "40-49",
    "50-59", "60-69", "70-79", "80-89", "90-"
  )
  # findInterval() counts how many lower bounds are <= x: 0 for x < 10 up to
  # 9 for x >= 90. Adding 1 turns that count into an index into `labels`.
  # NA input gives an NA index and therefore an NA label.
  labels[findInterval(x, lower_bounds) + 1L]
}

# Total sales by gender and age bracket, spread so that each age bracket
# becomes its own column (one row per gender).
# The left join keeps receipts that have no matching customer; their gender
# and generation are NA.
df_receipt %>%
  left_join(
    df_customer %>%
      select(customer_id, gender_cd, gender, age) %>%
      mutate(generation = get_generation(age)) %>%
      select(-age),
    by = "customer_id"
  ) %>%
  group_by(gender, generation) %>%
  summarise(total_amount = sum(amount)) %>%
  ungroup() %>%
  pivot_wider(names_from = generation, values_from = total_amount)

R-044

# Total sales by gender and age bracket, spread so that each gender becomes
# its own column (one row per age bracket).
# The left join keeps receipts that have no matching customer; their gender
# and generation are NA.
df_receipt %>%
  left_join(
    df_customer %>%
      select(customer_id, gender_cd, gender, age) %>%
      mutate(generation = get_generation(age)) %>%
      select(-age),
    by = "customer_id"
  ) %>%
  group_by(gender, generation) %>%
  summarise(total_amount = sum(amount)) %>%
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = total_amount)

R-045

# Show each customer's birth_day next to two derived forms: parsed with the
# "%Y-%m-%d" format (birth_day2) and as text with the hyphens removed
# (birth_day3).
df_customer %>%
  select(customer_id, birth_day) %>%
  mutate(
    birth_day2 = lubridate::parse_date_time(birth_day, "%Y-%m-%d"),
    birth_day3 = str_remove_all(birth_day, "-")
  ) %>%
  head(10)

R-046

# NOTE(review): this answer is the same computation as the R-045 snippet
# above; confirm that R-046 is really meant to reuse it.
# Show each customer's birth_day next to two derived forms: parsed with the
# "%Y-%m-%d" format (birth_day2) and as text with the hyphens removed
# (birth_day3).
df_customer %>%
  select(customer_id, birth_day) %>%
  mutate(
    birth_day2 = lubridate::parse_date_time(birth_day, "%Y-%m-%d"),
    birth_day3 = str_remove_all(birth_day, "-")
  ) %>%
  head(10)

R-047

# Receipt keys with sales_ymd parsed using the "%Y%m%d" format.
# The conversion overwrites sales_ymd in place, so the column order is the
# one given to select().
df_receipt %>%
  select(sales_ymd, receipt_no, receipt_sub_no) %>%
  mutate(sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d")) %>%
  head(10)

R-048

# Receipt keys with sales_epoch converted to a date-time in the Asia/Tokyo
# time zone. The column is renamed to `date` while selecting and then
# converted in place, which keeps `date` as the first column.
df_receipt %>%
  select(date = sales_epoch, receipt_no, receipt_sub_no) %>%
  mutate(date = lubridate::as_datetime(date, tz = "Asia/Tokyo")) %>%
  head(10)

R-049

# Receipt keys with the calendar year of sales_epoch, read in the Asia/Tokyo
# time zone. The intermediate date-time is not kept as a column.
df_receipt %>%
  mutate(
    year = sales_epoch %>%
      lubridate::as_datetime(tz = "Asia/Tokyo") %>%
      lubridate::year()
  ) %>%
  select(year, receipt_no, receipt_sub_no) %>%
  head(10)

R-050

# Receipt keys with the calendar year and month of sales_epoch, read in the
# Asia/Tokyo time zone. `sold_at` is only a helper column and is dropped by
# the select() below.
df_receipt %>%
  mutate(
    sold_at = lubridate::as_datetime(sales_epoch, tz = "Asia/Tokyo"),
    year = lubridate::year(sold_at),
    month = lubridate::month(sold_at)
  ) %>%
  select(year, month, receipt_no, receipt_sub_no) %>%
  head(10)

次の問題：51-60

R-041

R-042

R-043

R-044

R-045

R-046

R-047

R-048

R-049

R-050

Discussion