Predicting Legendary Pokemon Using a Dataset of Stats

The dataset covers 721 distinct Pokemon across roughly 800 rows of stats, including Normal and Legendary Pokemon as well as Mega Evolutions.

import pandas as pd

dataset = pd.read_csv("Pokemon(copy).csv")
print('Datasets:', dataset.shape)

Datasets: (800, 9)
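A quick look at the first few rows and column types is a useful sanity check before any cleaning (just a peek at the raw data, nothing model-related yet):

print(dataset.head())
print(dataset.dtypes)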

Let's drop the Name column, since it isn't useful for prediction.

dataset.drop("Name", axis=1, inplace=True)


import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(x="Legendary", data=dataset, label="Count")


dataset.columns
Index(['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',
       'Legendary'],
      dtype='object')


dataset.describe()

        Total      HP         Attack     Defense    Sp. Atk    Sp. Def    Speed      Legendary
count   800.0000   800.0000   800.0000   800.0000   800.0000   800.0000   800.0000   800.0000
mean    435.1025   69.25875   79.00125   73.8425    72.8200    71.9025    68.2775    0.08125
std     119.96304  25.534669  32.457366  31.183501  32.722294  27.828916  29.060474  0.27339
min     180.0000   1.0000     5.0000     5.0000     10.0000    20.0000    5.0000     0.0000
25%     330.0000   50.0000    55.0000    50.0000    49.7500    50.0000    45.0000    0.0000
50%     450.0000   65.0000    75.0000    70.0000    65.0000    70.0000    65.0000    0.0000
75%     515.0000   80.0000    100.0000   90.0000    95.0000    90.0000    90.0000    0.0000
max     780.0000   255.0000   190.0000   230.0000   194.0000   230.0000   180.0000   1.0000
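Note that the mean of Legendary is 0.08125, so only about 8% of the rows are Legendary: the classes are heavily imbalanced, and a classifier that always answers "not Legendary" would already score around 92%. A quick way to see the split (a minimal check on the loaded dataset):

# Fraction of Legendary vs. non-Legendary rows
print(dataset["Legendary"].value_counts(normalize=True))

Keep that baseline in mind when reading the accuracy tables below.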

stats = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']


corr = dataset[stats].corr()  # .corr() computes pairwise correlations between the stat columns
plt.figure(figsize=(14, 14))
sns.heatmap(corr, cbar=True, square=True, annot=True, fmt='.2f', annot_kws={'size': 15},
            xticklabels=stats, yticklabels=stats,
            cmap='coolwarm')

color_function = {0: "blue", 1: "red"}  # non-Legendary blue, Legendary red
colors = dataset["Legendary"].map(lambda x: color_function.get(x))
pd.plotting.scatter_matrix(dataset[stats], c=colors, alpha=0.6, figsize=(12, 12));


Predicting Legendary Pokemon from Stats

# Classifiers compared below
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Train = pd.read_csv("PokemonTrain.csv")
Test = pd.read_csv("PokemonTest.csv")
pokemons1 = Train[:600]   # first 600 rows for training
pokemons2 = Test[600:]    # remaining rows held out for prediction

X_train = pokemons1.drop("Legendary", axis=1)
Y_train = pokemons1["Legendary"]
X_test = pokemons2        # assumed to carry the same stat columns, minus Legendary
X_train.shape, Y_train.shape, X_test.shape

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 'Random Forest',
              'Naive Bayes', 'Perceptron', 'Stochastic Gradient Descent', 'Linear SVC',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)
                         Model  Score
0      Support Vector Machines  99.50
3                Random Forest  99.50
8                Decision Tree  99.50
1                          KNN  98.17
2          Logistic Regression  93.67
4                  Naive Bayes  92.83
7                   Linear SVC  92.83
6  Stochastic Gradient Descent  92.67
5                   Perceptron  87.17
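These scores are computed on the same data the models were trained on, so they flatter anything that can memorize; the near-perfect Decision Tree and Random Forest are a tell. A more honest number comes from holding out part of the labeled data. A minimal sketch using scikit-learn's train_test_split (the variable names here are illustrative, not from the original notebook):

from sklearn.model_selection import train_test_split

# Hold out 25% of the labeled rows for evaluation, preserving class balance
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, Y_train, test_size=0.25, random_state=42, stratify=Y_train)

tree = DecisionTreeClassifier()
tree.fit(X_tr, y_tr)
print("Held-out accuracy:", round(tree.score(X_val, y_val) * 100, 2))

The K-fold cross validation below does the same thing more systematically, averaging over several such splits.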


K-Fold Cross Validation

from sklearn import metrics
from sklearn.model_selection import KFold
import numpy as np

def classification_model(model, data, prediction_input, output):
    # Fit the model on the full dataset and report training accuracy
    model.fit(data[prediction_input], data[output])
    predictions = model.predict(data[prediction_input])
    accuracy = metrics.accuracy_score(predictions, data[output])
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    # 5-fold cross validation; the running mean is printed after each fold
    kf = KFold(n_splits=5)
    error = []
    for train, test in kf.split(data):
        train_X = data[prediction_input].iloc[train, :]
        train_y = data[output].iloc[train]
        model.fit(train_X, train_y)

        test_X = data[prediction_input].iloc[test, :]
        test_y = data[output].iloc[test]
        error.append(model.score(test_X, test_y))
        print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

model = DecisionTreeClassifier()
data = dataset
prediction_var = stats
outcome_var = "Legendary"
classification_model(model, data, prediction_var, outcome_var)

Accuracy : 99.625%
Cross-Validation Score : 96.875%
Cross-Validation Score : 96.562%
Cross-Validation Score : 95.417%
Cross-Validation Score : 95.000%
Cross-Validation Score : 94.625%

model=LogisticRegression()
classification_model(model,data,prediction_var,outcome_var)

Accuracy : 93.625%
Cross-Validation Score : 96.250%
Cross-Validation Score : 96.250%
Cross-Validation Score : 95.000%
Cross-Validation Score : 94.219%
Cross-Validation Score : 92.875%

model = RandomForestClassifier(n_estimators=100)
classification_model(model,data,prediction_var,outcome_var)

Accuracy : 99.625%
Cross-Validation Score : 96.250%
Cross-Validation Score : 96.250%
Cross-Validation Score : 95.417%
Cross-Validation Score : 95.000%
Cross-Validation Score : 94.375%
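For reference, scikit-learn can produce the same kind of estimate in a single call; a minimal sketch with cross_val_score (shuffle=True randomizes fold assignment, so the numbers may differ slightly from the loop above):

from sklearn.model_selection import cross_val_score, KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(RandomForestClassifier(n_estimators=100),
                         dataset[stats], dataset["Legendary"], cv=cv)
print("Mean CV accuracy : {0:.3%}".format(scores.mean()))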
