Thanks for dropping by my page!
I'm currently a Master of Data Science student, graduating in 2022.
I really enjoy learning new tools, exploring data to help solve problems, and helping others understand ideas and data.
Naive Bayes Classifier Demo in Python
November 2020 – Me
Overview:
This is a Naive Bayes classifier I wrote from scratch in Python, using only the pandas and NumPy libraries.
With small adjustments to the code, it can perform NB classification on any dataset, using only the numerical features (binary features are fine).
It is currently configured to run NB on a sample League of Legends dataset containing around 10k records (games). The goal is to predict the game outcome (blue or red team victory) from data available after the first 10 minutes of gameplay (each game typically lasts around 20-50 minutes).
Please follow the link to the GitHub repo for a more in-depth explanation, along with data and resource citations.
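For context, the code preview further down assumes a train/test split and lists of class labels and feature names are already defined. Here's a minimal sketch of that setup; the file name, the 'gameId' ID column, and the 80/20 split are illustrative assumptions, not the exact code from the repo:

import numpy as np
import pandas as pd

np.random.seed(0)  # seed used for the ~75% accuracy noted in the preview

data = pd.read_csv('league_games_10min.csv')  # hypothetical file name

# keep only numerical features; drop the target and the assumed ID column
features = [c for c in data.columns if c not in ('gameId', 'blueWins')]
classes = sorted(data['blueWins'].unique())  # [0, 1]

# illustrative 80/20 random train/test split
mask = np.random.rand(len(data)) < 0.8
train, test = data[mask], data[~mask]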
Tools:
NumPy: matrix operations
Pandas: data handling
Language used: Python
See the repo here. :-)
Here’s a preview of the code:
import numpy as np
import pandas as pd

# ===================================================================
# Learn from train set
# ===================================================================
# Assumes `train`, `test`, `classes`, and `features` are defined
# (see the setup sketch above).
# Y = (y1, y2, ..., yk) classes
# X = (x1, x2, ..., xn) features
P_Y = []
distributions_XGivenY = []
for classtype in classes:
    class_data = train[train['blueWins'] == classtype]  # grab data for yj
    P_Y.append(len(class_data) / len(train))  # get P(yj)
    class_feature_distributions = []  # temp storage for feature distributions for yj
    for feature in features:
        feature_data = class_data[feature]  # grab data for xi|yj
        class_feature_distributions.append([np.mean(feature_data), np.std(feature_data)])
    # each class gets its own array of feature distributions
    distributions_XGivenY.append(class_feature_distributions)

# convert feature|class distribution data to a pandas DataFrame
# in the form: columns=classes, rows=features, cells=[mu, sigma]
distributions_XGivenY = pd.DataFrame(distributions_XGivenY, columns=features, index=classes).transpose()
# ===================================================================
# Model validation using test set
# ===================================================================
validation_array = [] # store if model was correct or incorrect
validation_score = 0 # mean of validation array
def likelihood(value, mu, sigma):
    # Gaussian probability density function:
    # f(x) = 1 / sqrt(2*pi*sigma^2) * exp(-0.5 * ((x - mu) / sigma)^2)
    # image: https://wikimedia.org/api/rest_v1/media/math/render/svg/c9167a4f19898b676d4d1831530a8ff1246d33ab
    a = 2 * np.pi * sigma ** 2
    b = (value - mu) / sigma
    return 1 / np.sqrt(a) * np.exp(-0.5 * b ** 2)
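# Sanity check (illustrative): at value == mu the density peaks at
# 1 / sqrt(2*pi*sigma^2), so likelihood(0.0, 0.0, 1.0) ~= 0.3989.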
def validate_result(guess, index):
    # compares guess with the actual value from the data
    return 1 if guess == test['blueWins'][index] else 0
for ind in test.index:
    # initialize
    P_yjGivenX = -np.inf  # best log P(yj|X) found so far (up to a constant)
    prediction = 0  # argmax(j=1..k) [ P(yj) * PROD(i=1..n) P(xi|yj) ]
    for j, classtype in enumerate(classes):
        class_distribution_data = distributions_XGivenY[classtype]  # grab feature distribution data for yj
        P_xiGivenyj_array = []  # temp array to store log(P(xi|yj)) values
        for feature in features:
            feature_mu, feature_sigma = class_distribution_data[feature]  # grab xi distribution data
            feature_value = test[feature][ind]  # grab instance xi data
            # calculate log(P(xi|yj))
            P_xiGivenyj_array.append(np.log(likelihood(feature_value, feature_mu, feature_sigma)))
        # calculate log( P(yj) * PROD(i=1..n) P(xi|yj) )
        temp_P = np.log(P_Y[j]) + np.sum(P_xiGivenyj_array)
        if temp_P > P_yjGivenX:
            # running argmax(j=1..k) [ P(yj) * PROD(i=1..n) P(xi|yj) ]
            P_yjGivenX = temp_P
            prediction = classtype
    validation_array.append(validate_result(prediction, ind))  # model validation

validation_score = np.mean(validation_array)  # prediction success rate; ~75% with np.random.seed(0)
print(validation_score)
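One design note on the snippet above: the per-feature probabilities are combined in log space (summing log(P(xi|yj)) rather than multiplying the raw densities), which avoids floating-point underflow when many small likelihoods are chained together. Since the log is monotonic, the argmax over classes is unchanged.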