machine learning – Coding the Prediction part of the Naive Bayes Algorithm

as a task we were told to code the naive Bayes algorithm (in this case determining of a review is a positive or negative review). I managed to construct the model but the issue comes in when I’m coding the prediction function.

This function here samples the data given

def sample_data(data,label,training_sample_size):
"""
    splits the data into 2 samples which will be used for training and testing
    
    input
        data                 : the data that will be split
        label                : the label associated with each row in data
        training_sample_size : the size of the training sample data
        
    output
        training_data        : returns the sample data that will be used for training model
        training_labels      : returns the labels associated with the training data
        testing_data         : returns the sample data that will be used for testing
        testing_label        : returns the labels associated with testing_data
"""

length = list(range(0,len(label)))
training_index = random.sample(length,training_sample_size)
test_index = (i for i in length if i not in training_index)

training_data = (data(i) for i in training_index)
training_labels = (label(i) for i in training_index)

testing_data = (data(i) for i in test_index)
testing_labels = (label(i) for i in test_index)


return (training_data,training_labels,testing_data,testing_labels)

this function here constructs the model

def Naive_Bayes_Model(Data,label):
"""
    Creates a model based off the given data and label provided
    
    input
        Data              : the data that will be used in training the model
        label             : the label associated with the data
    
    output
        model             : the model that will be used for predicting
        prior_probability : the probability of each category, i.e. probability it is ham or spam
"""

outcomes = np.unique(label)
prior_probability = list(np.zeros(len(outcomes)))
prior_data = ()
words = ()
words_probability = ()

#splitting words by associated label and getting prior probability
for index,value in enumerate(outcomes):

    temp = (Data(i) for i,x in enumerate(label) if x == value)
    prior_probability(index) = len(temp)/len(label)
    prior_data.append(temp)
    
    


#constructing probabilities of words
for i in range(0,len(outcomes)):
    
    category_data = prior_data(i)
    
    for j in range(0,len(category_data)):
        
        sentence = category_data(j)
        
        for k in range(0,len(sentence),1):
            
            x = sentence(k)
            
            if x in words:
                index = words.index(x)
                words_probability(index)(i)+= (1/len(category_data))
            elif x!='':
                words.append(x)
                words_probability.append(np.zeros(len(outcomes)))
                words_probability(-1)(i) = (1/len(category_data))
       


words_probability = np.array(words_probability)

#creating model
model = pd.DataFrame()

for i in range(0,len(outcomes),1):
    model(str(outcomes(i))) = words_probability(:,i)


model.index = words

return model,prior_probability

This function here is where I’m attempting to get the probability of it being positive or negative review (doesn’t work)

def Naive_Bayes_Prediction(model,prior_probability,data,label):
"""
    uses the model given to predictions on the data given
    
    Inputs
        model : model that will be used to make prediction
        prior : probability of each label class
        data  : sample data that will be testing on
        label : label of associated data that will be used to check if model wrong or right
"""


for i in range (0,len(data),1):   #looping through each row in dataset
    
    sent = data(i)
    prob = ()
    for j in model.columns:       #looping through each outcome
        
        p = 1
        
        for k in model.index:
            
            if k in sent:
                
                p*= model(j)(k)
            else:
                p*= 1 - model(j)(k)
        
        prob.append(p)
        
        
        deno = 0
        for n in range (0,len(prob),1):
            deno+= prob(n)*prior_probability(n)
            
        for m in range(0,len(prob),1):
            prob(m) = prob(m) / deno
          
    
    print(' '.join(sent) + ': ' + ';'.join((str(x) for x in prob)))

and this is how I combine all the above functions:

file = open('simple-food-reviews.txt','r+')

data = () 
label =()

 for line in file:

     line = line.rstrip("n")
     x = line.split(' ')
     label.append(int(x(0)))
     data.append(x(1::))

 #Sampling data
 (training_data,training_labels,testing_data,testing_labels) = sample_data(data,label,12)

 #training model
 model,prior_probability = Naive_Bayes_Model(training_data,training_labels)

 #predicting off model
 Naive_Bayes_Prediction(model,prior_probability,testing_data,testing_labels)

the data I used:

1 the food is lovely
1 this is a great restaurant
1 i really enjoyed my food
1 i enjoyed the experience at the restaurant
1 we had a lovely meal
1 my food tasted great
1 the service was great
1 what a lovely restaurant
1 the food the service and the restaurant was great
-1 the service is terrible
-1 the food tasted awful
-1 this is a bad restaurant  
-1 the food was really bad
-1 the service and the food was terrible
-1 we had a terrible experience
-1 avoid this restaurant
-1 avoid the food
-1 the meal was terrible

whenever I run this algorithm I get Nan values. this is the error I get:

/home/gideon/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:37: RuntimeWarning: 
invalid value encountered in double_scalars