🚀
データサイエンス100本ノック(構造化データ加工編)をRで解く 81 - 90
R-081
df_product_2 <- df_product %>%
mutate(
unit_price = replace_na(df_product_2$unit_price, mean(df_product$unit_price, na.rm=TRUE) %>% round()),
unit_cost = replace_na(df_product_2$unit_cost, mean(df_product$unit_cost, na.rm=TRUE) %>% round())
)
df_product_2 %>%
summarise(across(everything(), n_na))
R-082
df_product_3 <- df_product %>%
mutate(
unit_price = replace_na(df_product_2$unit_price, median(df_product$unit_price, na.rm=TRUE) %>% round()),
unit_cost = replace_na(df_product_2$unit_cost, median(df_product$unit_cost, na.rm=TRUE) %>% round())
)
df_product_3 %>%
summarise(across(everything(), n_na))
R-083
df_product %>%
group_by(category_small_cd) %>%
mutate(
unit_price_2 = replace_na(unit_price, mean(unit_price, na.rm=TRUE))
) %>%
dplyr::filter(is.na(unit_price))
R-084
df_receipt %>%
mutate(
sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
year = year(sales_ymd),
) %>%
select(
customer_id,
year,
amount
) %>%
group_by(customer_id) %>%
summarise(amount_all = sum(amount), amount_2019 = sum(amount[year!=2019])) %>%
ungroup() %>%
mutate(
rate_2019 = amount_2019 / amount_all
) -> res
res %>% head(10)
R-085
df_customer %>%
left_join(
df_geocode %>% select(postal_cd, longitude, latitude),
by = "postal_cd"
) %>%
group_by(customer_id) %>%
summarise(
longitude = mean(longitude, na.rm=TRUE),
latitude = mean(latitude, na.rm=TRUE)
) -> mst_geo
df_customer_1 <- df_customer %>%
left_join(
mst_geo,
by = "customer_id"
)
df_customer_1 %>% head(10)
R-086
dL <- function(lat1, lat2, long1, long2, const=6378){
lat1 <- lat1 * pi / 180
lat2 <- lat2 * pi / 180
long1 <- long1 * pi / 180
long2 <- long2 * pi / 180
l <- const * acos( sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(long1 - long2))
return(l)
}
df_customer_1 %>%
select(application_store_cd,
address,
latitude,
longitude) %>%
rename(store_cd = application_store_cd,
customer_address = address,
customer_latitude = latitude,
customer_longitude = longitude) %>%
left_join(
df_store %>%
select(store_cd, address, latitude, longitude) %>%
rename(store_address = address,
store_latitude = latitude,
store_longitude = longitude
),
by = "store_cd"
) %>%
mutate(
distance = dL(customer_latitude, store_latitude, customer_longitude, store_longitude)
) %>%
head(10)
R-087
# 売上実績の結合
df_customer %>%
select(
customer_id,
customer_name,
postal_cd
) %>%
left_join(
df_receipt %>%
group_by(customer_id) %>%
summarise(amount = sum(amount)),
by = "customer_id"
) -> joined
# 売上実績のない顧客の処理
joined %>%
dplyr::filter(is.na(amount)) %>%
group_by(customer_name, postal_cd) %>%
summarise(customer_id = min(customer_id)) %>%
ungroup() %>%
select(
customer_id,
customer_name,
postal_cd
) %>%
mutate(
amount = 0
) -> customer_has_no_amount
# 売上実績が同一の顧客の処理
joined %>%
dplyr::filter(!is.na(amount)) %>%
group_by(customer_name, postal_cd, amount) %>%
summarise(n = n()) %>%
dplyr::filter(n > 1) %>%
head(10) # そのような顧客はいない
# 売上実績が異なる顧客の処理
joined %>%
dplyr::filter(!is.na(amount), !customer_id %in% customer_has_no_amount$customer_id) %>%
group_by(customer_name, postal_cd) %>%
summarise(amount = max(amount)) %>%
ungroup() %>%
left_join(
joined %>% select(customer_id, customer_name, postal_cd, amount),
by = c("customer_name", "postal_cd", "amount")
) %>%
select(
customer_id,
customer_name,
postal_cd,
amount
) -> customer_has_amount
df_customer_u <- bind_rows(
customer_has_amount,
customer_has_no_amount
) %>% arrange(customer_id)
R-088
df_customer_n <- bind_rows(
customer_has_amount,
customer_has_no_amount
) %>% arrange(customer_id)
df_customer_n %>% dim()
R-089
set.seed(100)
df_split <- initial_split(customer_has_amount, prop=0.8)
train <- training(df_split)
test <- testing(df_split)
R-090
df_receipt %>%
mutate(
sales_ymd = lubridate::ymd(sales_ymd),
year = year(sales_ymd),
month = month(sales_ymd),
ym = paste(year, str_pad(month, 2, "left", 0), sep="-"),
train_1 = ifelse({ym >= "2017-01"} & {ym <= "2017-12"}, 1, 0),
test_1 = ifelse({ym >= "2018-01"} & {ym <= "2018-06"}, 1, 0),
train_2 = ifelse({ym >= "2017-06"} & {ym <= "2018-05"}, 1, 0),
test_2 = ifelse({ym >= "2018-06"} & {ym <= "2018-12"}, 1, 0),
train_3 = ifelse({ym >= "2018-10"} & {ym <= "2019-03"}, 1, 0),
test_3 = ifelse({ym >= "2019-04"} & {ym <= "2019-10"}, 1, 0)
) %>%
head(5)
Discussion