📘
データサイエンス100本ノック(構造化データ加工編)をRで解く 71 - 80
R-071
df_receipt %>%
left_join(
df_customer %>% select(customer_id, application_date),
by="customer_id",
) %>%
mutate(
sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
application_date = lubridate::parse_date_time(application_date, "%Y%m%d"),
member_interval = lubridate::interval(start=application_date, end=sales_ymd, tz="Asia/Tokyo"),
days_since_application = month(sales_ymd) - month(application_date),
months_since_application = member_interval %/% months(1)
) %>%
select(
customer_id,
sales_ymd,
application_date,
months_since_application
) %>%
head(10)
R-072
df_receipt %>%
left_join(
df_customer %>% select(customer_id, application_date),
by="customer_id",
) %>%
mutate(
sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
application_date = lubridate::parse_date_time(application_date, "%Y%m%d"),
member_interval = lubridate::interval(start=application_date, end=sales_ymd, tz="Asia/Tokyo"),
days_since_application = member_interval %/% days(1),
months_since_application = member_interval %/% months(1),
years_since_application = member_interval %/% years(1)
) %>%
select(
customer_id,
sales_ymd,
application_date,
days_since_application,
months_since_application,
years_since_application
) %>%
head(10)
R-073
df_receipt %>%
left_join(
df_customer %>% select(customer_id, application_date),
by="customer_id",
) %>%
mutate(
sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
application_date = lubridate::parse_date_time(application_date, "%Y%m%d"),
member_interval = lubridate::interval(start=application_date, end=sales_ymd, tz="Asia/Tokyo"),
days_since_application = member_interval %/% days(1),
months_since_application = member_interval %/% months(1),
years_since_application = member_interval %/% years(1),
secs_since_application = member_interval %/% seconds(1),
) %>%
select(
customer_id,
sales_ymd,
application_date,
days_since_application,
months_since_application,
years_since_application,
secs_since_application
) %>%
head(10)
R-074
df_receipt %>%
mutate(
sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
week_start = lubridate::floor_date(sales_ymd, "weeks"),
days_since_the_first_of_the_week = lubridate::interval(start=week_start, end=sales_ymd, tz="Asia/Tokyo") %/% days(1)
) %>%
select(
customer_id,
sales_ymd,
week_start,
days_since_the_first_of_the_week
) %>%
head(10)
R-075
set.seed(111)
n <- nrow(df_customer)
n_1percent <- ceiling(n * 0.01)
df_customer %>%
slice_sample(n=n_1percent, weight_by = rep(1/n, n)) %>%
head(10)
R-076
set.seed(100)
df_customer %>%
group_by(gender_cd) %>%
slice_sample(prop=0.01) %>%
group_by(gender_cd) %>%
summarise(n=n())
R-077
df_receipt %>%
dplyr::filter(!str_starts(customer_id, "Z")) %>%
group_by(customer_id) %>%
summarise(amount = sum(amount)) %>%
ungroup() %>%
drop_na() %>%
dplyr::filter(
abs(amount - mean(amount)) >= 3*sd(amount)
) %>%
head(10)
R-078
df_receipt %>%
dplyr::filter(!str_starts(customer_id, "Z")) %>%
group_by(customer_id) %>%
summarise(amount = sum(amount)) %>%
ungroup() %>%
drop_na() %>%
dplyr::filter(
{amount < quantile(amount, probs = 0.25) - 1.5 * (quantile(amount, probs = 0.75) - quantile(amount, probs = 0.25))}
| {amount > quantile(amount, probs = 0.74) + 1.5 * (quantile(amount, probs = 0.75) - quantile(amount, probs = 0.25))}
) %>%
head(10)
R-079
n_na <- function(x){
return(sum(is.na(x)))
}
df_product %>%
summarise(across(everything(), n_na))
R-080
df_product_1 <- df_product %>%
drop_na()
df_product %>% dim() %>% print()
df_product_1 %>% dim() %>% print()
Discussion