📘

データサイエンス100本ノック（構造化データ加工編）をRで解く 71 - 80

2021/11/21に公開

R-071

df_receipt %>%
    left_join(
        df_customer %>% select(customer_id, application_date),
        by="customer_id",
    ) %>% 
    mutate(
        sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
        application_date = lubridate::parse_date_time(application_date, "%Y%m%d"),
        member_interval = lubridate::interval(start=application_date, end=sales_ymd, tz="Asia/Tokyo"),
        days_since_application = month(sales_ymd) - month(application_date),
        months_since_application = member_interval %/% months(1)
    ) %>%
    select(
        customer_id,
        sales_ymd,
        application_date,
        months_since_application
    ) %>%
    head(10)

R-072

df_receipt %>%
    left_join(
        df_customer %>% select(customer_id, application_date),
        by="customer_id",
    ) %>% 
    mutate(
        sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
        application_date = lubridate::parse_date_time(application_date, "%Y%m%d"),
        member_interval = lubridate::interval(start=application_date, end=sales_ymd, tz="Asia/Tokyo"),
        days_since_application = member_interval %/% days(1),
        months_since_application = member_interval %/% months(1),
        years_since_application = member_interval %/% years(1)
    ) %>%
    select(
        customer_id,
        sales_ymd,
        application_date,
        days_since_application,
        months_since_application,
        years_since_application
    ) %>%
    head(10)

R-073

df_receipt %>%
    left_join(
        df_customer %>% select(customer_id, application_date),
        by="customer_id",
    ) %>% 
    mutate(
        sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
        application_date = lubridate::parse_date_time(application_date, "%Y%m%d"),
        member_interval = lubridate::interval(start=application_date, end=sales_ymd, tz="Asia/Tokyo"),
        days_since_application = member_interval %/% days(1),
        months_since_application = member_interval %/% months(1),
        years_since_application = member_interval %/% years(1),
        secs_since_application = member_interval %/% seconds(1),
    ) %>%
    select(
        customer_id,
        sales_ymd,
        application_date,
        days_since_application,
        months_since_application,
        years_since_application,
        secs_since_application
    ) %>%
    head(10)

R-074

df_receipt %>%
    mutate(
        sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
        week_start = lubridate::floor_date(sales_ymd, "weeks"),
        days_since_the_first_of_the_week = lubridate::interval(start=week_start, end=sales_ymd, tz="Asia/Tokyo") %/% days(1)
    ) %>%
    select(
        customer_id,
        sales_ymd,
        week_start,
        days_since_the_first_of_the_week
    ) %>%
    head(10)

R-075

set.seed(111)
n <- nrow(df_customer)
n_1percent <- ceiling(n * 0.01)

df_customer %>%
    slice_sample(n=n_1percent, weight_by = rep(1/n, n)) %>%
    head(10)

R-076

set.seed(100)

df_customer %>%
    group_by(gender_cd) %>%
    slice_sample(prop=0.01) %>%
    group_by(gender_cd) %>%
    summarise(n=n())

R-077

df_receipt %>%
    dplyr::filter(!str_starts(customer_id, "Z")) %>%
    group_by(customer_id) %>%
    summarise(amount = sum(amount)) %>%
    ungroup() %>%
    drop_na() %>%
    dplyr::filter(
        abs(amount - mean(amount)) >= 3*sd(amount)
    ) %>%
    head(10)

R-078

df_receipt %>%
    dplyr::filter(!str_starts(customer_id, "Z")) %>%
    group_by(customer_id) %>%
    summarise(amount = sum(amount)) %>%
    ungroup() %>%
    drop_na() %>%
    dplyr::filter(
          {amount < quantile(amount, probs = 0.25) - 1.5 * (quantile(amount, probs = 0.75) - quantile(amount, probs = 0.25))}
        | {amount > quantile(amount, probs = 0.74) + 1.5 * (quantile(amount, probs = 0.75) - quantile(amount, probs = 0.25))}
    ) %>%
    head(10)

R-079

n_na <- function(x){
    return(sum(is.na(x)))
}

df_product %>% 
    summarise(across(everything(), n_na))

R-080

df_product_1 <- df_product %>%
    drop_na()
    
df_product %>% dim() %>% print()
df_product_1 %>% dim() %>% print()

次の問題：81 - 90

R-071

R-072

R-073

R-074

R-075

R-076

R-077

R-078

R-079

R-080

Discussion