😽
データサイエンス100本ノック(構造化データ加工編)をRで解く 41 - 50
R-041
# Daily sales totals together with the change from the previous sales day.
# The earliest day has nothing to be compared against, so its difference is
# NA rather than a padded 0 (a 0 would read as "no change from yesterday").
df_receipt %>%
  group_by(sales_ymd) %>%
  summarise(total_amount = sum(amount)) %>%
  ungroup() %>%
  # lag() looks at the previous row, so the rows must be in date order.
  arrange(sales_ymd) %>%
  mutate(
    diff_yesterday = total_amount - dplyr::lag(total_amount, n = 1)
  ) %>%
  head(10)
R-042
# Daily sales totals, each row carrying the totals of the one, two and three
# preceding sales days. Rows without that many predecessors get NA.
df_receipt %>%
  group_by(sales_ymd) %>%
  summarise(total_amount = sum(amount)) %>%
  ungroup() %>%
  # Lagging is positional, so order by date before shifting.
  arrange(sales_ymd) %>%
  mutate(
    lag_1day = dplyr::lag(total_amount, 1),
    lag_2day = dplyr::lag(total_amount, 2),
    lag_3day = dplyr::lag(total_amount, 3)
  ) %>%
  head(n = 10)
# Variant for when the differences themselves are wanted: change in the
# daily total versus one, two and three sales days earlier.
# Rows with no earlier day to compare against are NA instead of a padded 0.
# Subtracting lag() always yields one value per row, whereas
# c(rep(0, k), diff(x, lag = k)) has length max(n, k) and makes mutate() fail
# when there are fewer than k rows.
df_receipt %>%
  group_by(sales_ymd) %>%
  summarise(total_amount = sum(amount)) %>%
  ungroup() %>%
  arrange(sales_ymd) %>%
  mutate(
    diff_1day = total_amount - dplyr::lag(total_amount, n = 1),
    diff_2day = total_amount - dplyr::lag(total_amount, n = 2),
    diff_3day = total_amount - dplyr::lag(total_amount, n = 3)
  ) %>%
  head(10)
R-043
# Map numeric ages to ten-year age-band labels.
#
# x: numeric vector of ages.
# Returns a character vector the same length as `x`:
#   below 10 -> "00-09", 10-19 -> "10-19", ..., 80-89 -> "80-89",
#   90 and above -> "90-". Missing ages yield NA (previously they came back
#   as the string "0" because the result was initialised with numeric zeros).
get_generation <- function(x) {
  band_labels <- c(
    "00-09", "10-19", "20-29", "30-39", "40-49",
    "50-59", "60-69", "70-79", "80-89", "90-"
  )
  # Lower bounds of every band except the first; findInterval() counts how
  # many of them are <= x, giving 0 for x < 10 up to 9 for x >= 90.
  lower_bounds <- seq(10, 90, by = 10)
  band_labels[findInterval(x, lower_bounds) + 1L]
}
# Total sales per gender (one row each) with one column per age band.
# Receipts are left-joined to the customers, so a receipt whose customer_id
# has no match keeps NA for gender and generation and is summed under those.
df_receipt %>%
  left_join(
    df_customer %>%
      select(customer_id, gender_cd, gender, age) %>%
      # Derive the age band, then drop the raw age from the lookup table.
      mutate(generation = get_generation(age), age = NULL),
    by = "customer_id"
  ) %>%
  group_by(gender, generation) %>%
  summarise(total_amount = sum(amount)) %>%
  ungroup() %>%
  tidyr::pivot_wider(
    names_from = generation,
    values_from = total_amount
  )
R-044
# Total sales per age band (one row each) with one column per gender.
# Same aggregation as a gender x generation summary, but spread by gender.
df_receipt %>%
  left_join(
    df_customer %>%
      select(customer_id, gender_cd, gender, age) %>%
      # Replace the raw age with its ten-year band label.
      mutate(generation = get_generation(age), age = NULL),
    by = "customer_id"
  ) %>%
  group_by(gender, generation) %>%
  summarise(total_amount = sum(amount)) %>%
  ungroup() %>%
  tidyr::pivot_wider(
    names_from = gender,
    values_from = total_amount
  )
R-045
# Show birth_day next to two conversions of it:
#   birth_day2 - parsed into a date-time with the year-month-day order
#   birth_day3 - the same value as text with the hyphens stripped
df_customer %>%
  select(customer_id, birth_day) %>%
  mutate(
    birth_day2 = lubridate::parse_date_time(birth_day, orders = "%Y-%m-%d"),
    birth_day3 = stringr::str_replace_all(birth_day, "-", "")
  ) %>%
  head(n = 10)
R-046
# Convert birth_day both ways: to a parsed date-time (birth_day2) and to a
# hyphen-free string (birth_day3), shown for the first ten customers.
# NOTE(review): this performs the same birth_day conversion as the R-045
# snippet; confirm that R-046 was not meant to target a different column.
df_customer %>%
  select(customer_id, birth_day) %>%
  mutate(
    birth_day2 = lubridate::parse_date_time(birth_day, orders = "%Y-%m-%d"),
    birth_day3 = stringr::str_replace_all(birth_day, "-", "")
  ) %>%
  head(n = 10)
R-047
# Parse sales_ymd (read with the year-month-day order, no separators) into a
# date-time in place and show it with the receipt keys.
df_receipt %>%
  select(sales_ymd, receipt_no, receipt_sub_no) %>%
  mutate(
    sales_ymd = lubridate::parse_date_time(sales_ymd, orders = "%Y%m%d")
  ) %>%
  head(n = 10)
R-048
# Convert sales_epoch to a date-time in the Asia/Tokyo time zone and show it
# with the receipt keys.
# NOTE(review): assumes sales_epoch holds numeric epoch seconds - confirm
# against the data.
df_receipt %>%
  select(sales_epoch, receipt_no, receipt_sub_no) %>%
  mutate(date = lubridate::as_datetime(sales_epoch, tz = "Asia/Tokyo")) %>%
  select(date, receipt_no, receipt_sub_no) %>%
  head(n = 10)
R-049
# Extract the calendar year (Asia/Tokyo time) from sales_epoch and show it
# with the receipt keys.
df_receipt %>%
  mutate(
    year = lubridate::year(
      lubridate::as_datetime(sales_epoch, tz = "Asia/Tokyo")
    )
  ) %>%
  select(year, receipt_no, receipt_sub_no) %>%
  head(n = 10)
R-050
# Extract the calendar year and month (Asia/Tokyo time) from sales_epoch and
# show them with the receipt keys.
df_receipt %>%
  select(sales_epoch, receipt_no, receipt_sub_no) %>%
  # Convert once, then derive both parts from the same date-time.
  mutate(date = lubridate::as_datetime(sales_epoch, tz = "Asia/Tokyo")) %>%
  mutate(
    year = lubridate::year(date),
    month = lubridate::month(date)
  ) %>%
  select(year, month, receipt_no, receipt_sub_no) %>%
  head(n = 10)
Discussion