😸
データサイエンス100本ノック(構造化データ加工編)をRで解く 51 - 60
R-051
# Convert the epoch-seconds column to a date-time in the Asia/Tokyo time zone,
# then pull the calendar year, month and day out of it next to the receipt keys.
df_receipt %>%
  mutate(date = lubridate::as_datetime(sales_epoch, tz = "Asia/Tokyo")) %>%
  mutate(
    year = lubridate::year(date),
    month = lubridate::month(date),
    day = lubridate::day(date)
  ) %>%
  select(year, month, day, receipt_no, receipt_sub_no) %>%
  head(10)
R-052
# Total the purchase amount per customer and flag totals strictly above 2000.
# Customer IDs starting with "Z" are dropped before aggregating.
df_receipt %>%
  dplyr::filter(!str_starts(customer_id, "Z")) %>%
  group_by(customer_id) %>%
  summarise(total_amount = sum(amount)) %>%
  mutate(
    # "over 2000" is a strict comparison: a total of exactly 2000 is 0, not 1.
    is_amount_over_2000 = ifelse(total_amount > 2000, 1, 0)
  ) %>%
  head(10)
R-053
# Extract the first 3-digit and first 4-digit runs of the postal code, set
# is_tokyo to 1 when the 3-digit part lies between "100" and "209", and then
# count customers and sum their spend per flag value, keeping only customers
# whose receipt total is non-zero.
df_customer %>%
  mutate(
    postal_first3digit = stringr::str_extract(postal_cd, regex("[0-9]{3}")),
    postal_last4digit = stringr::str_extract(postal_cd, regex("[0-9]{4}")),
    # Character comparison; both bounds are inclusive.
    is_tokyo = ifelse(
      (postal_first3digit >= "100") & (postal_first3digit <= "209"),
      1,
      0
    )
  ) %>%
  left_join(
    df_receipt %>%
      group_by(customer_id) %>%
      summarise(total_amount = sum(amount)),
    by = "customer_id"
  ) %>%
  # Customers without receipts leave the join with NA; turn that into 0 so the
  # filter right after removes them together with genuine zero totals.
  mutate(total_amount = replace_na(total_amount, 0)) %>%
  dplyr::filter(total_amount != 0) %>%
  group_by(is_tokyo) %>%
  summarise(n = n(), total_amount = sum(total_amount)) %>%
  head(10)
R-054
# Map each address to a numeric code according to the prefecture name it
# contains: 11, 12, 13 or 14 for the four names listed below, 0 when none of
# them is found.
#
# x: character vector of addresses.
# Returns a numeric vector the same length as x.
pref_code <- function(x) {
  pref_names <- c("埼玉県", "千葉県", "東京都", "神奈川県")
  pref_values <- c(11, 12, 13, 14)
  out <- rep(0, length(x))
  # Applied in list order, so a later name wins if an address matches several.
  for (i in seq_along(pref_names)) {
    out[str_detect(x, pref_names[i])] <- pref_values[i]
  }
  out
}
# Derive pref_cd from each customer's address and show it with the ID and
# address for the first 10 rows.
df_customer %>%
  mutate(pref_cd = pref_code(address)) %>%
  select(customer_id, address, pref_cd) %>%
  head(10)
R-055
# Assign each value of x to a quartile group 1-4 using the default quantile()
# cut points (0%, 25%, 50%, 75%, 100%). A value equal to a cut point belongs
# to the higher group; values at or above the 75% point are group 4.
#
# x: numeric vector.
# Returns a numeric vector of group numbers the same length as x.
cd_qtl <- function(x) {
  cuts <- quantile(x, names = FALSE)
  grp <- rep(0, length(x))
  # Walk the four lower bounds in ascending order so the last bound a value
  # reaches decides its group.
  for (k in 1:4) {
    grp[x >= cuts[k]] <- k
  }
  grp
}
# Total the amount per customer and label each total with its quartile group.
df_receipt %>%
  group_by(customer_id) %>%
  summarise(total_amount = sum(amount)) %>%
  mutate(grp_qtl = cd_qtl(total_amount)) %>%
  head(10)
R-056
# Label each age with its ten-year band: "00-09", "10-19", ..., "50-59", and
# "60-" for 60 and above. Ages below 10 (including negatives) fall in "00-09".
# Missing ages yield NA rather than the stray "0" string that a numeric
# placeholder would leak into the character result.
#
# x: numeric vector of ages.
# Returns a character vector the same length as x.
get_generation <- function(x) {
  labels <- c("00-09", "10-19", "20-29", "30-39", "40-49", "50-59", "60-")
  # findInterval() counts how many lower bounds (10, 20, ..., 60) each age has
  # reached; that count plus one is the position of the matching label.
  labels[findInterval(x, seq(10, 60, by = 10)) + 1L]
}
# Add the age band for every customer and list it next to the ID, age and
# birth date for the first 10 rows.
df_customer %>%
  mutate(generation = get_generation(age)) %>%
  select(customer_id, age, generation, birth_day) %>%
  head(10)
R-057
# Build a combined "<generation>_<gender>" category per customer and show it
# together with the columns it was derived from.
df_customer %>%
  mutate(generation = get_generation(age)) %>%
  mutate(generation_gender = str_c(generation, gender, sep = "_")) %>%
  select(
    customer_id, age, generation, gender, generation_gender, birth_day
  ) %>%
  head(10)
R-058
# Turn gender_cd into three 0/1 indicator columns, one per code value
# (0, 1, 9), and show every column whose name starts with "gender_".
df_customer %>%
  mutate(
    gender_0 = as.integer(gender_cd == 0),
    gender_1 = as.integer(gender_cd == 1),
    gender_9 = as.integer(gender_cd == 9)
  ) %>%
  select(customer_id, starts_with("gender_")) %>%
  head(10)
R-059
# Standardize each customer's total amount (mean 0, sample sd 1). Customer IDs
# starting with "Z" and customers with no receipt total are excluded first.
df_customer %>%
  dplyr::filter(!str_starts(customer_id, "Z")) %>%
  left_join(
    df_receipt %>%
      group_by(customer_id) %>%
      summarise(total_amount = sum(amount)),
    by = "customer_id"
  ) %>%
  # Rows that found no match in the join carry NA here.
  dplyr::filter(!is.na(total_amount)) %>%
  mutate(
    std_amount =
      (total_amount - mean(total_amount, na.rm = TRUE)) /
        sd(total_amount, na.rm = TRUE)
  ) %>%
  select(customer_id, std_amount) %>%
  head(10)
R-060
# Standardize x to z-scores: subtract the mean and divide by the sample
# standard deviation (sd(), n - 1 denominator).
#
# x: numeric vector.
# na.rm: drop NA values when computing the mean and standard deviation? NA
#   elements of x still come back as NA. Defaults to FALSE, in which case any
#   NA in x makes the whole result NA.
# Returns a numeric vector the same length as x.
standardize <- function(x, na.rm = FALSE) {
  (x - mean(x, na.rm = na.rm)) / sd(x, na.rm = na.rm)
}
# Min-max scale x so that its smallest value maps to 0 and its largest to 1.
#
# x: numeric vector.
# na.rm: ignore NA values when finding the minimum and maximum? NA elements of
#   x still come back as NA. Defaults to FALSE, in which case any NA in x
#   makes the whole result NA.
# Returns a numeric vector the same length as x. When all values are equal the
# range is zero and the result is NaN.
regularize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Total the amount per customer, keep only customers present in df_customer
# whose ID does not start with "Z", and add both the standardized and the
# min-max scaled total.
df_receipt %>%
  group_by(customer_id) %>%
  summarise(total_amount = sum(amount)) %>%
  inner_join(
    df_customer %>% dplyr::filter(!str_starts(customer_id, "Z")),
    by = "customer_id"
  ) %>%
  select(customer_id, total_amount) %>%
  mutate(
    std_amount = standardize(total_amount),
    rgl_amount = regularize(total_amount)
  ) %>%
  head(10)
Discussion