Skip to content

Fixes: #6551, #6956

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions machine_learning/xgboostclassifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import matplotlib.pyplot as plt

# https://xgboost.readthedocs.io/en/stable/
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBClassifier

# Load the Titanic train and test splits.
training = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

# Tag every row with its origin so the two splits can be separated again
# after the shared feature engineering.
training["train_test"] = 1
test["train_test"] = 0
# The test split has no labels. `np.nan` is used because the `np.NaN` alias
# was removed in NumPy 2.0.
test["Survived"] = np.nan
all_data = pd.concat([training, test])

# Number of space-separated cabin entries for the passenger (0 if missing).
all_data["cabin_mul"] = all_data.Cabin.apply(
    lambda cabin: 0 if pd.isna(cabin) else len(cabin.split(" "))
)
# First character of the cabin value; a missing cabin is stringified first,
# so NaN ends up as "n".
all_data["cabin_adv"] = all_data.Cabin.apply(lambda cabin: str(cabin)[0])
# Text between the first comma and the following period of the name,
# stripped of surrounding whitespace.
all_data["name_title"] = all_data.Name.apply(
    lambda name: name.split(",")[1].split(".")[0].strip()
)
# Fill missing ages and fares with the medians of the training split.
age_median = training.Age.median()
fare_median = training.Fare.median()
all_data["Age"] = all_data["Age"].fillna(age_median)
all_data["Fare"] = all_data["Fare"].fillna(fare_median)

# Rows without an embarkation port are discarded.
all_data = all_data.dropna(subset=["Embarked"])

# Log-scaled fare; the +1 keeps a fare of 0 finite.
all_data["norm_fare"] = np.log(all_data["Fare"] + 1)
# Passenger class is turned into a string so it is one-hot encoded below.
all_data["Pclass"] = all_data["Pclass"].astype(str)
# Ages are truncated to whole numbers.
all_data["Age"] = all_data["Age"].astype(np.int64)

# Columns passed to the one-hot encoder; `train_test` is carried along so the
# splits can be recovered afterwards.
feature_columns = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "norm_fare",
    "Embarked",
    "cabin_adv",
    "cabin_mul",
    "name_title",
    "train_test",
]
all_dummies = pd.get_dummies(all_data[feature_columns])

from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns on a copy, leaving `all_dummies` untouched.
# The scaler is fitted on the combined train and test rows.
numeric_columns = ["Age", "SibSp", "Parch", "norm_fare"]
scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()
scaled_values = scale.fit_transform(all_dummies_scaled[numeric_columns])
all_dummies_scaled[numeric_columns] = scaled_values
# Notebook leftover: the preview is computed and discarded when run as a script.
all_dummies_scaled.head()

# Split the scaled features back into the original partitions and drop the
# helper flag so it is not used as a predictor.
x_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(
    ["train_test"], axis=1
)
x_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(
    ["train_test"], axis=1
)

# Labels of the training rows that survived the earlier row filtering.
y_train = all_data[all_data.train_test == 1].Survived

# Fit a classifier with default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(x_train_scaled, y_train)

# Predict the test partition and write the predictions as a two-column CSV.
y_hat_base_vc = xgb.predict(x_test_scaled).astype(int)
basic_submission = {"PassengerId": test.PassengerId, "Survived": y_hat_base_vc}
base_submission = pd.DataFrame(data=basic_submission)
base_submission.to_csv("xgb_submission.csv", index=False)
164 changes: 164 additions & 0 deletions machine_learning/xgboostregressor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import os

import matplotlib.pyplot as plt

# https://xgboost.readthedocs.io/en/stable/
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor

# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk("/kaggle/input"):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# Load the Ames housing data and the two house-prices competition files.
train_ames = pd.read_csv("/kaggle/input/ames-housing-dataset/AmesHousing.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)

# Remove spaces from the Ames column names and rename "YearRemod/Add" so the
# Ames columns line up with the competition files when concatenated.
train_ames.columns = train_ames.columns.str.replace(" ", "")
train_ames = train_ames.rename(columns={"YearRemod/Add": "YearRemodAdd"})

data = pd.concat([train_ames, train, test], axis=0, sort=False)
print("Size of the Housing Dataset", len(data))

# Identifier columns and the label are ignored when looking for duplicates.
useless = ["Id", "PID", "Order", "SalePrice"]
data = data.drop(useless, axis=1)
# keep="last" flags every occurrence of a repeated row except the last one.
duplicate = data[data.duplicated(keep="last")].index

# NOTE(review): only the first 390 flagged labels are dropped from the Ames
# frame; the constant is hard-coded and presumably matches the number of Ames
# rows repeated in the competition files. Confirm against the data.
duplicate = duplicate[0:390]
train_ames = train_ames.drop(duplicate, axis=0)

# Training frame: de-duplicated Ames rows followed by the competition train rows.
training = pd.concat([train_ames, train], axis=0, sort=False)
useless = ["Id", "PID", "Order"]
training = training.drop(useless, axis=1)

# Set the label and the submission ids aside, then remove them from the
# predictor frames.
target = training["SalePrice"]
test_id = test["Id"]
test = test.drop(columns=["Id"])
training2 = training.drop(columns=["SalePrice"])

# Stack training and test predictors so both receive the same preprocessing.
train_test = pd.concat([training2, test], axis=0, sort=False)

# Fill categorical NaN values whose replacement is known from the data
# description file.
fixed_fill = {"Functional": "Typ", "Electrical": "SBrkr", "KitchenQual": "TA"}
for column, value in fixed_fill.items():
    train_test[column] = train_test[column].fillna(value)

# Columns filled with their most frequent value.
for column in ("Exterior1st", "Exterior2nd", "SaleType"):
    most_common = train_test[column].mode()[0]
    train_test[column] = train_test[column].fillna(most_common)

# Columns where a missing entry is recorded as the literal string "None".
none_columns = (
    "PoolQC",
    "Alley",
    "FireplaceQu",
    "Fence",
    "MiscFeature",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
)
for column in none_columns:
    train_test[column] = train_test[column].fillna("None")

# Numeric columns where a missing entry is replaced with zero.
zero_columns = (
    "GarageArea",
    "GarageCars",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtFullBath",
    "BsmtHalfBath",
    "MasVnrArea",
    "BsmtUnfSF",
    "TotalBsmtSF",
)
for column in zero_columns:
    train_test[column] = train_test[column].fillna(0)

# Lot frontage uses the median of the competition training file only.
lot_frontage_median = train["LotFrontage"].median()
train_test["LotFrontage"] = train_test["LotFrontage"].fillna(lot_frontage_median)

# For every column that still contains NaN, print `train_test[col][1]` as a
# quick sample of that column.
for col in train_test.columns:
    has_missing = train_test[col].isna().any()
    if has_missing:
        print(train_test[col][1])

# Convert non-numeric predictors stored as numbers into strings so they are
# one-hot encoded later. OverallQual and OverallCond are converted further
# down, after the numeric quality score has been derived from them.
for column in ("MSSubClass", "YrSold", "MoSold"):
    train_test[column] = train_test[column].apply(str)

# Living area per room, counting bathrooms and kitchens as rooms.
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / (
    train_test["TotRmsAbvGrd"]
    + train_test["FullBath"]
    + train_test["HalfBath"]
    + train_test["KitchenAbvGr"]
)

# Sum of the two ratings. This must run while both columns are still numeric:
# once they are strings, `+` concatenates ("7" + "5" -> "75") instead of adding.
train_test["Total_Home_Quality"] = train_test["OverallQual"] + train_test["OverallCond"]
for column in ("OverallQual", "OverallCond"):
    train_test[column] = train_test[column].apply(str)

# Bathroom count with half baths weighted 0.5, basement included.
train_test["Total_Bathrooms"] = (
    train_test["FullBath"]
    + (0.5 * train_test["HalfBath"])
    + train_test["BsmtFullBath"]
    + (0.5 * train_test["BsmtHalfBath"])
)

# Combined first- and second-floor area.
train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
# NOTE(review): this adds the two year columns; confirm a sum (rather than a
# difference or a flag) is what "renovated" is meant to hold.
train_test["renovated"] = train_test["YearRemodAdd"] + train_test["YearBuilt"]

# Remove the columns that are not used as predictors.
useless = ["GarageYrBlt", "YearRemodAdd"]
train_test = train_test.drop(useless, axis=1)
# Creating dummy variables from categorical features

from scipy.stats import skew

# One-hot encode the remaining categorical columns.
train_test_dummy = pd.get_dummies(train_test)

# Skewness of every non-object column, sorted from largest to smallest.
numeric_features = train_test_dummy.dtypes[train_test_dummy.dtypes != object].index
skewed_features = (
    train_test_dummy[numeric_features].apply(skew).sort_values(ascending=False)
)
# Columns whose skewness exceeds 0.5 are log-transformed below.
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

# Reduce the skew with log(1 + x).
for column in skew_index:
    train_test_dummy[column] = np.log1p(train_test_dummy[column])

# The label gets the same transform; predictions are mapped back with expm1.
target_log = np.log1p(target)

from xgboost import XGBRegressor

# Train/test separation: the training rows were stacked first, so the first
# len(target) rows are the training rows and the rest are the test rows.
# The length is derived from the label instead of a hard-coded row count.
n_train = len(target)
x_train = train_test_dummy.iloc[:n_train]
x_test = train_test_dummy.iloc[n_train:]

# Fit a regressor with default hyper-parameters on the log-transformed label.
xgb = XGBRegressor()
xgb.fit(x_train, target_log)

# Undo the log1p transform so predictions are on the original price scale.
test_pred = np.expm1(xgb.predict(x_test))

submission = pd.DataFrame(test_id, columns=["Id"])
submission["SalePrice"] = test_pred
submission.to_csv("xgb.csv", index=False, header=True)