# murohi@AIチャンネル
#
# 一流のAIエンジニア目指して奮闘中
#
# PG公開最終スコア(0.78468)

# Import libraries (NumPy, pandas)
import numpy as np
import pandas as pd
from pandas import DataFrame


# Load the training data.
train = pd.read_csv("./train.csv")

# Embarked: encode the port of embarkation as an integer. Only this column is
# mapped; a frame-wide replace would also rewrite any other cell that happens
# to equal "S", "C" or "Q" and depends on pandas silently downcasting the
# column to a numeric dtype. The original author noted that over 90% of the
# rows they looked at were "S", so missing values are assumed to be "S" (0).
train["Embarked"] = train["Embarked"].map({"S": 0, "C": 1, "Q": 2}).fillna(0)

# `combine` is kept as a module-level name for any later code that still
# iterates over it; it holds the single training frame.
combine = [train]

# Salutation: extract the title (a word preceded by a space and followed by a
# period) from Name. The pattern is a raw string so that `\.` is a regex
# escape rather than an invalid Python string escape.
train["Salutation"] = train["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

# Collapse uncommon titles into one "Rare" bucket and fold the French/variant
# spellings into their common equivalents.
rare_titles = [
    "Lady", "Countess", "Capt", "Col", "Don", "Dr",
    "Major", "Rev", "Sir", "Jonkheer", "Dona",
]
train["Salutation"] = train["Salutation"].replace(rare_titles, "Rare")
train["Salutation"] = train["Salutation"].replace(
    {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}
)
del train["Name"]

# Encode the title as an integer; titles outside the mapping (or rows where
# no title was found) become 0.
Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train["Salutation"] = train["Salutation"].map(Salutation_mapping).fillna(0)

# Age: fill missing values with the median age of the training set.
# NOTE(review): the original comment described imputing by salutation
# (0-8 years for "Master", otherwise 29), but the code only uses the median.
train["Age"] = train["Age"].fillna(train["Age"].median())


def male_feamale_child_old(passenger):
    """Classify a passenger as 'child', 'elder', or by their sex.

    Args:
        passenger: A two-item iterable ``(age, sex)``, e.g. one row of
            ``df[['Age', 'Sex']]`` when used with ``DataFrame.apply(axis=1)``.

    Returns:
        ``'child'`` when age is below 9, ``'elder'`` when age is above 65,
        otherwise the ``sex`` value unchanged. A NaN age fails both
        comparisons and therefore also yields ``sex``.
    """
    age, sex = passenger
    if age < 9:
        return 'child'
    if age > 65:
        return 'elder'
    return sex

# Person category ('child' / 'elder' / sex), derived row by row from Age and Sex.
train['person'] = train[['Age', 'Sex']].apply(male_feamale_child_old, axis=1)

# Family size: siblings/spouses + parents/children + the passenger themself.
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1

# Cabin: keep only the first character of the cabin string; "N" stands in
# for a missing cabin.
train['Cabin'] = train['Cabin'].fillna("N").str[0]

# Flag passengers travelling alone (family size of exactly 1).
train['IsAlone'] = (train['FamilySize'] == 1).astype(int)

# Flag passengers in large families (family size greater than 5).
train['More5'] = (train['FamilySize'] > 5).astype(int)

# Ticket is not used as a feature. One-hot encode the remaining string
# columns; dtype=int keeps the indicator columns numeric (newer pandas
# versions would otherwise produce booleans).
train = train.drop('Ticket', axis=1)
train = pd.get_dummies(train, dtype=int)


# Load the test data and apply the same feature engineering as for train.
test = pd.read_csv("./test.csv")

# Embarked: integer-encode this column only (not the whole frame); anything
# missing or unmapped falls back to 0 ("S"), matching the training data.
test["Embarked"] = test["Embarked"].map({"S": 0, "C": 1, "Q": 2}).fillna(0)

# Fill missing Age and Fare with the medians of the test set.
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Person category ('child' / 'elder' / sex) from Age and Sex.
test['person'] = test[['Age', 'Sex']].apply(male_feamale_child_old, axis=1)

# Family size: siblings/spouses + parents/children + the passenger themself.
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1

# Salutation: extract the title from Name (raw-string regex so `\.` is a
# regex escape), bucket uncommon titles as "Rare", and normalise variants.
test['Salutation'] = test["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
test['Salutation'] = test['Salutation'].replace(rare_titles, 'Rare')
test['Salutation'] = test['Salutation'].replace(
    {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
)
del test['Name']

# Encode the title as an integer; unmapped or missing titles become 0.
Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
test['Salutation'] = test['Salutation'].map(Salutation_mapping).fillna(0)

# Cabin: keep only the first character; "N" stands in for a missing cabin.
test['Cabin'] = test['Cabin'].fillna("N").str[0]

# Flag passengers travelling alone. The original assigned this flag on
# `train` instead of `test`, which left IsAlone at 0 for every test row.
test['IsAlone'] = (test['FamilySize'] == 1).astype(int)

# Flag passengers in large families (family size greater than 5).
test['More5'] = (test['FamilySize'] > 5).astype(int)

# Ticket is not used as a feature; drop it exactly once, then one-hot encode
# the remaining string columns with numeric (int) indicator columns.
test = test.drop('Ticket', axis=1)
test = pd.get_dummies(test, dtype=int)


# Import scikit-learn's random forest classifier.
from sklearn.ensemble import RandomForestClassifier

# Feature columns fed to the model; defined once so the training and test
# matrices are guaranteed to use the same columns in the same order.
FEATURE_COLUMNS = [
    "Pclass", "SibSp", "Fare", "Embarked", "Salutation", "Age", "Parch",
    "FamilySize",
    "Cabin_A", "Cabin_B", "Cabin_C", "Cabin_D", "Cabin_E", "Cabin_G", "Cabin_N",
    "person_child", "person_elder", "person_female", "person_male",
    "IsAlone", "More5",
]

# Split `train` into the target vector and the feature matrix.
target = train["Survived"].values
features_one = train[FEATURE_COLUMNS].values

# Declare the random forest model. max_features="sqrt" is what the former
# "auto" setting meant for classifiers; "auto" is no longer accepted by
# current scikit-learn releases.
random_forest = RandomForestClassifier(
    bootstrap=True, class_weight=None, criterion='gini',
    max_depth=25, max_features='sqrt', max_leaf_nodes=None,
    min_samples_leaf=1, min_samples_split=15,
    min_weight_fraction_leaf=0.0, n_estimators=51, n_jobs=4,
    oob_score=False, random_state=0, verbose=0, warm_start=False,
)

random_forest = random_forest.fit(features_one, target)

# Build the test feature matrix and predict with the fitted forest.
test_features = test[FEATURE_COLUMNS].values
my_prediction = random_forest.predict(test_features)

# Collect the passenger ids to use as the submission index.
PassengerId = np.array(test["PassengerId"]).astype(int)
# Put the predictions into a frame indexed by PassengerId.
my_solution = pd.DataFrame(my_prediction, index=PassengerId, columns=["Survived"])
# Write the submission file.
my_solution.to_csv("submissionrandom.csv", index_label=["PassengerId"])