😸

データサイエンス100本ノック（構造化データ加工編）をRで解く 51 - 60

2021/11/21に公開

R-051

# Turn the numeric sales_epoch column into a datetime in the Asia/Tokyo time
# zone, split it into year / month / day, and show the first 10 rows together
# with the receipt identifiers.
df_receipt %>%
    mutate(date = lubridate::as_datetime(sales_epoch, tz = "Asia/Tokyo")) %>%
    # transmute() keeps only the listed columns, in the listed order.
    transmute(
        year = lubridate::year(date),
        month = lubridate::month(date),
        day = lubridate::day(date),
        receipt_no,
        receipt_sub_no
    ) %>%
    head(n = 10)

R-052

# Sum the purchase amount per customer (skipping customer IDs that start with
# "Z") and add a 0/1 flag telling whether the total is over 2,000.
df_receipt %>%
    dplyr::filter(
        !str_starts(customer_id, "Z")
    ) %>%
    group_by(customer_id) %>%
    summarise(total_amount = sum(amount)) %>%
    mutate(
        # Strictly greater than: a total of exactly 2,000 is "2,000 or less"
        # and must map to 0, as the column name ("over 2000") implies.
        is_amount_over_2000 = ifelse(total_amount > 2000, 1, 0)
    ) %>%
    head(10)

R-053

# Flag customers whose postal code begins with a three-digit prefix between
# "100" and "209", attach each customer's total purchase amount, keep only
# customers with a non-zero total, and count customers / sum totals per flag.
df_customer %>%
    mutate(
        # str_extract() returns the first match, i.e. the first run of 3
        # (respectively 4) consecutive digits found in postal_cd.
        postal_first3digit = stringr::str_extract(postal_cd, regex("[0-9]{3}")),
        postal_last4digit = stringr::str_extract(postal_cd, regex("[0-9]{4}")),
        # The prefix is compared as a string; 1 when inside the range, else 0.
        is_tokyo = as.numeric(
            ("100" <= postal_first3digit) & (postal_first3digit <= "209")
        )
    ) %>%
    left_join(
        summarise(
            group_by(df_receipt, customer_id),
            total_amount = sum(amount)
        ),
        by = "customer_id"
    ) %>%
    # Customers without any receipt get NA from the left join; treat as 0 ...
    mutate(total_amount = replace_na(total_amount, 0)) %>%
    # ... and then drop everybody whose total is 0.
    dplyr::filter(total_amount != 0) %>%
    group_by(is_tokyo) %>%
    summarise(n = n(), total_amount = sum(total_amount)) %>%
    head(n = 10)

R-054

# Map each address string to a numeric prefecture code.
#
# x: character vector of addresses.
# Returns a numeric vector of the same length: 11, 12, 13 or 14 when the
# address contains the corresponding prefecture name, 0 when none is found.
# If several names occur in one address, the one listed last wins.
pref_code <- function(x) {
    pref_names <- c("埼玉県", "千葉県", "東京都", "神奈川県")
    pref_codes <- c(11, 12, 13, 14)

    res <- numeric(length(x))
    for (i in seq_along(pref_names)) {
        res[str_detect(x, pref_names[i])] <- pref_codes[i]
    }
    return(res)
}

# Show customer ID, address and the prefecture code derived from the address
# for the first 10 customers.
df_customer %>%
    transmute(
        customer_id,
        address,
        pref_cd = pref_code(address)
    ) %>%
    head(n = 10)

R-055

# Assign each value of x to a quartile group.
#
# x: numeric vector.
# Returns a numeric vector of the same length with
#   1 for values below the 25% quantile,
#   2 for values in [25%, 50%),
#   3 for values in [50%, 75%),
#   4 for values at or above the 75% quantile,
#   NA for missing values.
# The quantiles are computed from the non-missing values only, because
# quantile() stops with an error when it is handed NA / NaN.
cd_qtl <- function(x) {
    res <- rep(NA_real_, length(x))
    observed <- !is.na(x)
    # Nothing to rank (empty or all-missing input): return early.
    if (!any(observed)) {
        return(res)
    }
    vals <- x[observed]
    cuts <- quantile(vals, probs = c(0.25, 0.5, 0.75), names = FALSE)
    # The group number is 1 plus the number of cut points reached.
    res[observed] <- 1 + (vals >= cuts[1]) + (vals >= cuts[2]) + (vals >= cuts[3])
    return(res)
}

# Total purchase amount per customer, labelled with the quartile group
# (1-4) that the total falls into; first 10 rows only.
df_receipt %>%
    group_by(customer_id) %>%
    summarise(total_amount = sum(amount)) %>%
    mutate(grp_qtl = cd_qtl(total_amount)) %>%
    head(n = 10)

R-056

# Convert ages to ten-year generation labels.
#
# x: numeric vector of ages.
# Returns a character vector of the same length: "00-09" for ages below 10,
# "10-19", ..., "50-59" for the following decades, and "60-" for 60 and
# above. Missing ages yield NA (not a placeholder string).
get_generation <- function(x) {
    labels <- c("00-09", "10-19", "20-29", "30-39", "40-49", "50-59", "60-")
    # Lower bounds of every bucket except the first one.
    lower_bounds <- c(10, 20, 30, 40, 50, 60)
    # findInterval() counts how many bounds are <= x (0 for x < 10), which is
    # the zero-based bucket index; NA ages stay NA and index to NA_character_.
    res <- labels[findInterval(x, lower_bounds) + 1L]
    return(res)
}

# Show customer ID, age, the generation label derived from the age, and the
# birth day for the first 10 customers.
df_customer %>%
    transmute(
        customer_id,
        age,
        generation = get_generation(age),
        birth_day
    ) %>%
    head(n = 10)

R-057

# Build a combined "<generation>_<gender>" category for each customer and
# show it next to its source columns for the first 10 customers.
df_customer %>%
    transmute(
        customer_id,
        age,
        generation = get_generation(age),
        gender,
        # Uses the generation column created just above.
        generation_gender = str_c(generation, gender, sep = "_"),
        birth_day
    ) %>%
    head(n = 10)

R-058

# Expand gender_cd into three 0/1 indicator columns (one per code 0, 1, 9).
df_customer %>%
    mutate(
        gender_0 = if_else(gender_cd == 0, 1L, 0L),
        gender_1 = if_else(gender_cd == 1, 1L, 0L),
        gender_9 = if_else(gender_cd == 9, 1L, 0L)
    ) %>%
    # starts_with("gender_") picks up every column with that prefix, so the
    # source column gender_cd is shown alongside the new indicators.
    select(customer_id, starts_with("gender_")) %>%
    head(n = 10)

R-059

# Standardize each customer's total purchase amount (subtract the mean,
# divide by the standard deviation). Customer IDs starting with "Z" and
# customers without any receipt are left out before the statistics are taken.
df_customer %>%
    dplyr::filter(!str_starts(customer_id, "Z")) %>%
    left_join(
        summarise(
            group_by(df_receipt, customer_id),
            total_amount = sum(amount)
        ),
        by = "customer_id"
    ) %>%
    # The left join leaves NA for customers with no receipts; drop them.
    dplyr::filter(!is.na(total_amount)) %>%
    transmute(
        customer_id,
        std_amount = (total_amount - mean(total_amount, na.rm = TRUE)) /
            sd(total_amount, na.rm = TRUE)
    ) %>%
    head(n = 10)

R-060

# Standardize a numeric vector: subtract its mean and divide by its
# standard deviation (as computed by sd()).
#
# x: numeric vector.
# Returns a numeric vector of the same length as x.
standardize <- function(x) {
    centered <- x - mean(x)
    spread <- sd(x)
    return(centered / spread)
}

# Min-max scale a numeric vector onto [0, 1].
#
# x: numeric vector.
# Returns a numeric vector of the same length: (x - min(x)) / (max(x) - min(x)).
# When every value is identical the range is zero; the divisor is then taken
# as 1 so the result is all zeros rather than NaN from 0 / 0.
regularize <- function(x){
    bounds <- range(x)
    span <- bounds[2] - bounds[1]
    # is.na() guard: with missing values the span is NA and must not reach
    # the `==` test inside if().
    if (!is.na(span) && span == 0) {
        span <- 1
    }
    res <- (x - bounds[1]) / span
    return(res)
}

# Total purchase amount per customer, restricted to customers present in
# df_customer whose ID does not start with "Z", shown together with its
# standardized and min-max scaled versions (first 10 rows).
df_receipt %>%
    group_by(customer_id) %>%
    summarise(total_amount = sum(amount)) %>%
    inner_join(
        dplyr::filter(df_customer, !str_starts(customer_id, "Z")),
        by = "customer_id"
    ) %>%
    # Keep only the key and the total, then append the two scaled columns.
    transmute(
        customer_id,
        total_amount,
        std_amount = standardize(total_amount),
        rgl_amount = regularize(total_amount)
    ) %>%
    head(n = 10)

次の問題：61-70

R-051

R-052

R-053

R-054

R-055

R-056

R-057

R-058

R-059

R-060

Discussion