このチャプターの目次

Code

今までのCodeをまとめます。

import os
import pickle

import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

######### 関数群 #########
def extract_honorific(name):
    """Return the honorific (title) embedded in a passenger name.

    The name is expected to look like ``"Braund, Mr. Owen Harris"``: the
    honorific is the text between the first comma and the next period.

    Args:
        name: Full name string with a comma in front of the honorific.

    Returns:
        The honorific with surrounding whitespace removed, e.g. ``"Mr"``.

    Raises:
        IndexError: If ``name`` contains no comma.
    """
    after_surname = name.split(",")[1]
    # Strip whitespace rather than blindly dropping the first character, so
    # "Braund,Mr. Owen" yields "Mr" instead of "r".
    return after_surname.split(".")[0].strip()


# Load the Titanic training set.
data = pd.read_csv("titanic/train.csv")

######### Select the features to use #########
feature_columns = [
    "Survived", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
]
# Keep only the chosen columns and discard every row with a missing value.
data = data[feature_columns].dropna()
print(data.shape)

# Split off the label column; what remains are the model inputs.
target = data["Survived"].values
data = data.drop(columns="Survived")

######### Preprocessing (derive the honorific from each name) #########
name_lst = data["Name"].tolist()
print(name_lst[0])

# One honorific per row, in row order.
honorific_lst = [extract_honorific(full_name) for full_name in name_lst]
print(len(honorific_lst))

# Swap the raw name column for the extracted honorific.
data["Honorific"] = honorific_lst
data = data.drop(columns="Name")

######### Reorder Pclass #########
# Flip the class codes: 1 <-> 3, while 2 stays 2.
pclass_remap = {1: 3, 2: 2, 3: 1}
data["Pclass"] = data["Pclass"].replace(pclass_remap)

######### Preprocessing (category_encoders) #########
# One-hot encode the categorical columns with a single fitted encoder.
categorical_columns = ["Sex", "Embarked", "Honorific"]
ce_ohe = ce.OneHotEncoder(cols=categorical_columns)
data = ce_ohe.fit_transform(data)

######### RandomForest #########
# Make sure the output directory exists before spending time on training;
# otherwise open() below fails with FileNotFoundError on a fresh checkout.
model_path = 'models/clf_ohe.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(data.values, target)

######### Save the model #########
# The classifier and the fitted encoder are pickled together as a two-item
# list, in the order [clf, ce_ohe].
with open(model_path, 'wb') as f:
    pickle.dump([clf, ce_ohe], f)