import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Load the training and test CSV files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Summary statistics for every column, numeric and non-numeric alike.
train.describe(include='all')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
train.info()
全データ数891件に対する欠測値の数は、以下のとおり。
Ageが177件でおよそ20%の欠測
Cabinが687件でおよそ77%の欠測
Embarkedが2件の欠測であった。
Cabinは欠測率が高い => 削除した方がよさそう
Ageは欠測率が比較的低い => 補完する（ただし以降のコードでは補完は行わず、Age列は学習時に削除している）
Embarkedは欠測値が少ないので、補完する
# Total number of rows in the training set (the denominator for the
# missing-value rates), taken from the data instead of being hard-coded.
print("count=", len(train))

# Missing values per column, for the training and the test set.
train.isnull().sum()
test.isnull().sum()

# Keep the passenger IDs aside, then drop the column from the training frame.
p_id = train["PassengerId"]
train = train.drop(["PassengerId"], axis=1)
生存フラグ
0:死亡
1:生存
# Bar chart of the number of rows for each value of Survived.
sns.countplot(data=train, x="Survived")
Ticket Class
1:1st
2:2nd
3:3rd
# Distinct ticket classes and how many of them there are.
print('unique_val:', train['Pclass'].unique())
print('Number of Pclass unique:', train['Pclass'].nunique())

# Number of passengers in each class.
print('unique_val_counts')
print(train['Pclass'].value_counts())

# Survived counts split by class, followed by the mean of Survived per class.
sns.countplot(x="Survived", hue='Pclass', data=train)
print(train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())
Nameは固有値が891個（全件がユニーク）あり、そのままでは分類に使いにくい
# Count the distinct names and list how often each one occurs.
name_col = train['Name']
print('Number of Name unique:', name_col.nunique())
print(name_col.value_counts())
female or male
# Distinct values of Sex, how many there are, and the count of each.
print('unique_val:', train['Sex'].unique())
print('Number of Sex unique:', train['Sex'].nunique())
print('unique_val_counts')
print(train['Sex'].value_counts())

# Survived: 0 = died, 1 = survived.
sns.countplot(x="Survived", hue="Sex", data=train)
print(train[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean())

# Encode Sex as an integer: male -> 0, female -> 1.
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
# Each histogram gets its own figure. Without that, running these statements
# as one unit draws all three on the same axes and every title overwrites the
# previous one.

# Age distribution of all passengers.
plt.figure()
train['Age'].hist()

# Age distribution of the rows with Survived == 0.
plt.figure()
plt.title('Age Distribution by Dead')
train[train['Survived'] == 0]['Age'].hist()

# Age distribution of the rows with Survived == 1.
plt.figure()
plt.title('Age Distribution by Survived')
train[train['Survived'] == 1]['Age'].hist()
兄弟・配偶者の人数
# Histogram of SibSp. The plot kind must be passed by keyword: current pandas
# rejects positional arguments to Series.plot().
train["SibSp"].plot(kind="hist")

# Distinct values, how many there are, and the count of each.
print('unique_val:', train['SibSp'].unique())
print('Number of SibSp unique:', train['SibSp'].nunique())
print('unique_val_counts')
print(train['SibSp'].value_counts())

# Survived counts split by SibSp.
sns.countplot(x="Survived", hue='SibSp', data=train)
親・子供の人数
# Histogram of Parch. The plot kind must be passed by keyword: current pandas
# rejects positional arguments to Series.plot().
train["Parch"].plot(kind='hist')

# Distinct values, how many there are, and the count of each.
print('unique_val:', train['Parch'].unique())
print('Number of Parch unique:', train['Parch'].nunique())
print('unique_val_counts')
print(train['Parch'].value_counts())

# Survived counts split by Parch.
sns.countplot(x="Survived", hue='Parch', data=train)
# NOTE: listing every distinct Ticket value and drawing a per-ticket countplot
# were left disabled in the original, presumably because the column has too
# many distinct values to display usefully.
print('Number of Ticket unique:', train['Ticket'].nunique())
print('unique_val_counts')
print(train['Ticket'].value_counts())

# Survived breakdown for a few specific ticket numbers.
for ticket in ('CA. 2343', '1601', '347082', 'CA 2144'):
    print(train[train['Ticket'] == ticket]['Survived'].value_counts())
料金
# NOTE: listing every distinct Fare value and drawing a per-fare countplot
# were left disabled in the original, presumably because the column has too
# many distinct values to display usefully.
print('Number of Fare unique:', train['Fare'].nunique())
print('unique_val_counts')
print(train['Fare'].value_counts().head())

# Histogram of Fare. The plot kind must be passed by keyword: current pandas
# rejects positional arguments to Series.plot().
train['Fare'].plot(kind='hist')
plt.show()
Cabin number
# NOTE: listing every distinct Cabin value and drawing a per-cabin countplot
# were left disabled in the original, presumably because the column has too
# many distinct values to display usefully.
print('Number of Cabin unique:', train['Cabin'].nunique())
print('unique_val_counts')
print(train['Cabin'].value_counts().head(3))
出発した港
C = Cherbourg
Q = Queenstown
S = Southampton
# Distinct embarkation ports, how many there are, and the count of each.
print('unique_val:', train['Embarked'].unique())
print('Number of Embarked unique:', train['Embarked'].nunique())
print('unique_val_counts')
print(train['Embarked'].value_counts().head(3))

# Survived counts split by embarkation port.
sns.countplot(x="Survived", hue='Embarked', data=train)
# Pclass == 1 passengers, split by the encoded Sex column (1 = female, 0 = male).
pclass_1 = train[train['Pclass'] == 1]
p1_f = pclass_1[pclass_1['Sex'] == 1]
p1_m = pclass_1[pclass_1['Sex'] == 0]

# Create the two side-by-side axes in one call. Calling plt.subplots() with no
# grid and then plt.subplot(1, 2, k) leaves an extra full-figure axes behind.
fig, (ax_female, ax_male) = plt.subplots(1, 2, figsize=(16, 5))
sns.countplot(x='Survived', data=p1_f, ax=ax_female)
ax_female.set_title('Pclass=1 and Sex = female')
sns.countplot(x='Survived', data=p1_m, ax=ax_male)
# The right-hand panel shows the male subset; it was mislabelled "female".
ax_male.set_title('Pclass=1 and Sex = male')
# Pclass == 2 passengers, split by the encoded Sex column (1 = female, 0 = male).
pclass_2 = train[train['Pclass'] == 2]
p2_f = pclass_2[pclass_2['Sex'] == 1]
p2_m = pclass_2[pclass_2['Sex'] == 0]

# Create the two side-by-side axes in one call. Calling plt.subplots() with no
# grid and then plt.subplot(1, 2, k) leaves an extra full-figure axes behind.
fig, (ax_female, ax_male) = plt.subplots(1, 2, figsize=(16, 5))
sns.countplot(x='Survived', data=p2_f, ax=ax_female)
ax_female.set_title('Pclass=2 and Sex = female')
sns.countplot(x='Survived', data=p2_m, ax=ax_male)
# The right-hand panel shows the male subset; it was mislabelled "female".
ax_male.set_title('Pclass=2 and Sex = male')
# Pclass == 3 passengers, split by the encoded Sex column (1 = female, 0 = male).
pclass_3 = train[train['Pclass'] == 3]
p3_f = pclass_3[pclass_3['Sex'] == 1]
p3_m = pclass_3[pclass_3['Sex'] == 0]

# Create the two side-by-side axes in one call. Calling plt.subplots() with no
# grid and then plt.subplot(1, 2, k) leaves an extra full-figure axes behind.
fig, (ax_female, ax_male) = plt.subplots(1, 2, figsize=(16, 5))
sns.countplot(x='Survived', data=p3_f, ax=ax_female)
ax_female.set_title('Pclass=3 and Sex = female')
sns.countplot(x='Survived', data=p3_m, ax=ax_male)
ax_male.set_title('Pclass=3 and Sex = male')
# Annotated heatmap of the pairwise correlations between numeric columns.
# The frame still holds string columns at this point, and pandas 2.x raises
# when DataFrame.corr() meets them, so restrict to numeric columns explicitly.
# Older pandas silently dropped non-numeric columns, so the plotted matrix is
# the same there.
fig, axes = plt.subplots(figsize=(9, 6))
numeric_cols = train.select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(), annot=True)
# Fill the missing Embarked values with 'S'.
# The original first did this through a drop-and-concat detour and then
# repeated the same fill directly; a single in-place assignment is enough.
train['Embarked'] = train['Embarked'].fillna('S')

# Check the remaining missing values per column.
train.isnull().sum()
Embarkedを数値に変換する（S→0, C→1, Q→2）。ダミー変数（one-hot）化ではなく、整数ラベルへの置き換えである。
# Replace the port letters in Embarked with integer codes.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(embarked_codes)

# y is the Survived column; X is every remaining column except the ones
# listed in drop_col.
drop_col = ['Survived', 'Name', 'Ticket', 'Cabin', 'Age']
y = train['Survived']
X = train.drop(columns=drop_col)
# The passenger IDs are used for the submission file, so keep them before the
# column is dropped.
p_id = test['PassengerId']
drop_col_test = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Age']

# Fill gaps: Embarked with 'S', Fare with the median Fare of the test set.
test = test.fillna({'Embarked': 'S', 'Fare': test['Fare'].median()})

# Apply the same integer encodings that the training frame received.
test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

# Drop the columns that are not used as features, then check for leftover gaps.
test = test.drop(columns=drop_col_test)
test.isnull().sum()
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for validation; random_state fixes the shuffle.
split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = split
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit on the training split and predict the validation split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)

# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
acc_logreg = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_logreg)
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the validation score and the submission built from this
# model are reproducible from run to run (same seed as the train/val split).
randomforest = RandomForestClassifier(random_state=1)
randomforest.fit(X_train, y_train)
y_pred = randomforest.predict(X_val)

# accuracy_score is documented as (y_true, y_pred).
acc_randomforest = round(accuracy_score(y_val, y_pred) * 100, 2)
print(acc_randomforest)
# Predict the test set with the random forest and write the submission file.
pred = randomforest.predict(test)
submit = pd.DataFrame({'PassengerId': p_id, 'Survived': pred})

# index is documented as a bool; pass False explicitly instead of relying on
# None being treated as falsy. The row index is left out of the file either way.
submit.to_csv('submit_randomforest.csv', index=False)