🚀

データサイエンス100本ノック（構造化データ加工編）をRで解く 81 - 90

2021/11/21に公開

R-081

df_product_2 <- df_product %>%
    mutate(
        unit_price = replace_na(df_product_2$unit_price, mean(df_product$unit_price, na.rm=TRUE) %>% round()),
        unit_cost = replace_na(df_product_2$unit_cost,  mean(df_product$unit_cost, na.rm=TRUE) %>% round())
    ) 

df_product_2 %>% 
    summarise(across(everything(), n_na))

R-082

df_product_3 <- df_product %>%
    mutate(
        unit_price = replace_na(df_product_2$unit_price, median(df_product$unit_price, na.rm=TRUE) %>% round()),
        unit_cost = replace_na(df_product_2$unit_cost,  median(df_product$unit_cost, na.rm=TRUE) %>% round())
    ) 

df_product_3 %>% 
    summarise(across(everything(), n_na))

R-083

df_product %>% 
    group_by(category_small_cd) %>%
    mutate(
        unit_price_2 = replace_na(unit_price, mean(unit_price, na.rm=TRUE))
    ) %>%
    dplyr::filter(is.na(unit_price))

R-084

df_receipt %>%
    mutate(
        sales_ymd = lubridate::parse_date_time(sales_ymd, "%Y%m%d"),
        year = year(sales_ymd),
    ) %>%
    select(
        customer_id,
        year,
        amount
    ) %>%
    group_by(customer_id) %>%
    summarise(amount_all = sum(amount), amount_2019 = sum(amount[year!=2019])) %>%
    ungroup() %>%
    mutate(
        rate_2019 = amount_2019 / amount_all
    ) -> res

res %>% head(10)

R-085

df_customer %>%
    left_join(
        df_geocode %>% select(postal_cd, longitude, latitude),
        by = "postal_cd"
    ) %>%
    group_by(customer_id) %>%
    summarise(
        longitude = mean(longitude, na.rm=TRUE),
        latitude  = mean(latitude, na.rm=TRUE)
    ) -> mst_geo

df_customer_1 <- df_customer %>%
    left_join(
        mst_geo,
        by = "customer_id"
    ) 

df_customer_1 %>% head(10)

R-086

dL <- function(lat1, lat2, long1, long2, const=6378){
    lat1 <- lat1 * pi / 180
    lat2 <- lat2 * pi / 180
    long1 <- long1 * pi / 180
    long2 <- long2 * pi / 180
    l <- const * acos( sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(long1 - long2))
    return(l)
}

df_customer_1 %>%
    select(application_store_cd,
           address,
           latitude,
           longitude) %>%
    rename(store_cd = application_store_cd,
           customer_address = address,
           customer_latitude = latitude,
           customer_longitude = longitude) %>%
    left_join(
        df_store %>% 
            select(store_cd, address, latitude, longitude) %>%
            rename(store_address = address,
                   store_latitude = latitude,
                   store_longitude = longitude
                   ),
        by = "store_cd"
    ) %>%
    mutate(
        distance = dL(customer_latitude, store_latitude, customer_longitude, store_longitude)
    ) %>%
    head(10)

R-087

# 売上実績の結合
df_customer %>%
    select(
        customer_id,
        customer_name,
        postal_cd
    ) %>%
    left_join(
        df_receipt %>%
            group_by(customer_id) %>%
            summarise(amount = sum(amount)),
        by = "customer_id"
    ) -> joined
    
# 売上実績のない顧客の処理
joined %>%
    dplyr::filter(is.na(amount)) %>%
    group_by(customer_name, postal_cd) %>% 
    summarise(customer_id = min(customer_id)) %>%
    ungroup() %>%
    select(
        customer_id,
        customer_name,
        postal_cd
    ) %>%
    mutate(
        amount = 0
    ) -> customer_has_no_amount
    
# 売上実績が同一の顧客の処理
joined %>%
    dplyr::filter(!is.na(amount)) %>%
    group_by(customer_name, postal_cd, amount) %>%
    summarise(n = n()) %>%
    dplyr::filter(n > 1) %>%
    head(10) # そのような顧客はいない

# 売上実績が異なる顧客の処理  
joined %>%
    dplyr::filter(!is.na(amount), !customer_id %in% customer_has_no_amount$customer_id) %>%
    group_by(customer_name, postal_cd) %>%
    summarise(amount = max(amount)) %>%
    ungroup() %>%
    left_join(
        joined %>% select(customer_id, customer_name, postal_cd, amount),
        by = c("customer_name", "postal_cd", "amount")
    ) %>%
    select(
        customer_id,
        customer_name,
        postal_cd,
        amount
    ) -> customer_has_amount
    
df_customer_u <- bind_rows(
    customer_has_amount,
    customer_has_no_amount
) %>% arrange(customer_id)

R-088

df_customer_n <- bind_rows(
    customer_has_amount,
    customer_has_no_amount
) %>% arrange(customer_id)

df_customer_n %>% dim()

R-089

set.seed(100)
df_split <- initial_split(customer_has_amount, prop=0.8)
train <- training(df_split)
test  <- testing(df_split)

R-090

df_receipt %>% 
    mutate(
        sales_ymd = lubridate::ymd(sales_ymd),
        year = year(sales_ymd),
        month = month(sales_ymd),
        ym = paste(year, str_pad(month, 2, "left", 0), sep="-"),
        train_1 = ifelse({ym >= "2017-01"} & {ym <= "2017-12"}, 1, 0),
        test_1  = ifelse({ym >= "2018-01"} & {ym <= "2018-06"}, 1, 0),
        train_2 = ifelse({ym >= "2017-06"} & {ym <= "2018-05"}, 1, 0),
        test_2  = ifelse({ym >= "2018-06"} & {ym <= "2018-12"}, 1, 0),
        train_3 = ifelse({ym >= "2018-10"} & {ym <= "2019-03"}, 1, 0),
        test_3  = ifelse({ym >= "2019-04"} & {ym <= "2019-10"}, 1, 0)
    ) %>%
    head(5)

次の問題：91 - 100

R-081

R-082

R-083

R-084

R-085

R-086

R-087

R-088

R-089

R-090

Discussion