Making an Auto Complete Program using N-Gram Models

Introduction

To build an auto-complete system, we first need a language model. A language model assigns a probability to a sequence of words, so the word that is most likely to come next in a sentence receives a higher probability than the alternatives.

Major Libraries 

  1. math
  2. random
  3. NumPy
  4. pandas
  5. nltk
import math
import random
import numpy as np
import pandas as pd
import nltk
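
Depending on your NLTK installation, nltk.word_tokenize may require the Punkt tokenizer models to be downloaded once beforehand:

# One-time download of the Punkt tokenizer models used by nltk.word_tokenize
nltk.download('punkt')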

Pre-Processing

I’ve used a text (.txt) file containing tweets in English (US), with one tweet per line (separated by newline characters).
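
Reading the file into a single string is straightforward; here is a minimal sketch, where en_US.twitter.txt is just a placeholder for whatever your data file is called:

# Read the raw corpus into one string (the filename is a placeholder)
with open("en_US.twitter.txt", "r", encoding="utf-8") as f:
  data = f.read()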

def obtain_sentences(data):
  sentences = data.split("\n")
  sentences = [s.strip() for s in sentences]
  sentences = [s for s in sentences if len(s) > 0]
  return sentences
def tokenize_the_corpus(sentences):
  tokenized_sentences = []
  for sentence in sentences:
    sentence = sentence.lower()
    tokenized_sentence = nltk.word_tokenize(sentence)
    tokenized_sentences.append(tokenized_sentence)
  return tokenized_sentences

sentences = obtain_sentences(data)
tokenized_sentences = tokenize_the_corpus(sentences)

After lowercasing the sentences and converting the words into tokens, we need to split the dataset. As this was a fairly small dataset, I went with a simple 80/20 train/test split.

train_size = int(len(tokenized_sentences) * 0.8)
train_data = tokenized_sentences[0:train_size]
test_data = tokenized_sentences[train_size:]

Also, in language models we don’t always keep every word: words that occur only once in the corpus don’t give us reliable probability estimates. And because we’ve split the dataset, there may be words in the test set that never appear in the training set. We replace all such out-of-vocabulary words with a special token (<unk>).

def count_the_words(sentences):

  word_counts = {}
  for sentence in sentences:
    for token in sentence:
      if token not in word_counts.keys():
        word_counts[token] = 1
      else:
        word_counts[token] += 1
  return word_counts

def handling_oov(tokenized_sentences, count_threshold):
  closed_vocabulary = []
  words_count = count_the_words(tokenized_sentences)
  for word, count in words_count.items():
    if count >= count_threshold :
      closed_vocabulary.append(word)

  return closed_vocabulary

def unk_tokenize(tokenized_sentences, vocabulary, unknown_token = "<unk>"):
  vocabulary = set(vocabulary)
  new_tokenized_sentences = []
  for sentence in tokenized_sentences:
    new_sentence = []
    for token in sentence:
      if token in vocabulary:
        new_sentence.append(token)
      else:
        new_sentence.append(unknown_token)
    new_tokenized_sentences.append(new_sentence)
  return new_tokenized_sentences

That’s all we need to do for pre-processing:

def preprocess(train_data, test_data, count_threshold):
  vocabulary = handling_oov(train_data, count_threshold)
  new_train_data = unk_tokenize(train_data, vocabulary)
  new_test_data = unk_tokenize(test_data, vocabulary)

  return new_train_data, new_test_data, vocabulary 

min_freq = 2
final_train, final_test, vocabulary = preprocess(train_data, test_data, min_freq)
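
A quick sanity check of the result might look like this:

# Inspect the closed vocabulary and one processed sentence
print("Vocabulary size:", len(vocabulary))
print("First processed training sentence:", final_train[0])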

Building the N-Gram Model

First, we write a function that counts how many times each n-gram (a sequence of n consecutive words) occurs in the corpus, for a given n. In this step, we’ll also pad each sentence with start and end tokens.

def count_n_grams(data, n, start_token = "<s>", end_token = "<e>"):
  n_grams = {}
  for sentence in data:
    sentence = [start_token]*n + sentence + [end_token]
    sentence = tuple(sentence)

    # after padding, each sentence contains len(sentence) - n + 1 n-grams
    for i in range(len(sentence) - n + 1):
      n_gram = sentence[i:i+n]
      if n_gram in n_grams.keys():
        n_grams[n_gram] += 1
      else:
        n_grams[n_gram] = 1
  return n_grams
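
For example, running the function on a tiny tokenized corpus shows what the resulting dictionary of counts looks like:

# Toy example: bigram counts for a two-sentence corpus
toy_corpus = [["i", "like", "a", "cat"], ["this", "dog", "is", "like", "a", "cat"]]
print(count_n_grams(toy_corpus, 2))
# {('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ...}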

Now, we’ll write a function to estimate the probability of a word given the previous n words, using add-k smoothing.
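
Concretely, the smoothed estimate used below is

P(w_t | w_{t-n}, …, w_{t-1}) = (C(w_{t-n}, …, w_{t-1}, w_t) + k) / (C(w_{t-n}, …, w_{t-1}) + k * |V|)

where C(·) is an n-gram count from the training data, |V| is the vocabulary size, and k is the smoothing constant (k = 1 gives Laplace smoothing).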

def prob_for_single_word(word, previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary_size, k = 1.0):
  previous_n_gram = tuple(previous_n_gram)
  previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0
  denom = previous_n_gram_count + k * vocabulary_size
  nplus1_gram = previous_n_gram + (word,)
  nplus1_gram_count = nplus1_gram_counts[nplus1_gram] if nplus1_gram in nplus1_gram_counts else 0
  num = nplus1_gram_count + k
  prob = num / denom
  return prob

Now, we just extend this function to all the words in the vocabulary:

def probs(previous_n_gram, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0):
  previous_n_gram = tuple(previous_n_gram)
  vocabulary = vocabulary + ["<e>", "<unk>"]
  vocabulary_size = len(vocabulary)
  probabilities = {}
  for word in vocabulary:
    probability = prob_for_single_word(word, previous_n_gram, 
                                           n_gram_counts, nplus1_gram_counts, 
                                           vocabulary_size, k=k)
    probabilities[word] = probability

  return probabilities

Now, we write the code to build the count matrix, whose rows are the preceding n-grams and whose columns are the words that can follow them:

def count_matrix(nplus1_gram_counts, vocabulary):
  vocabulary = vocabulary + ["<e>", "<unk>"]
  n_grams = []
  for n_plus1_gram in nplus1_gram_counts.keys():
    n_gram = n_plus1_gram[0:-1]
    n_grams.append(n_gram)
  n_grams = list(set(n_grams))

  row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
  col_index = {word:j for j, word in enumerate(vocabulary)}

  nrow = len(n_grams)
  ncol = len(vocabulary)
  count_matrix = np.zeros((nrow, ncol))
  for n_plus1_gram, count in nplus1_gram_counts.items():
    n_gram = n_plus1_gram[0:-1]
    word = n_plus1_gram[-1]
    if word not in vocabulary:
      continue
    i = row_index[n_gram]
    j = col_index[word]
    count_matrix[i, j] = count

  count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
  return count_matrix

And likewise the probability matrix:

def prob_matrix(nplus1_gram_counts, vocabulary, k):
  countmatrix = count_matrix(nplus1_gram_counts, vocabulary)
  countmatrix += k
  prob_matrix = countmatrix.div(countmatrix.sum(axis=1), axis=0)
  return prob_matrix
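
On the toy bigram counts from earlier, both matrices can be inspected like this (a small sketch; toy_vocabulary is just the set of words in the toy corpus):

# Count and probability matrices for the toy bigram counts (k = 1 smoothing)
toy_bigram_counts = count_n_grams(toy_corpus, 2)
toy_vocabulary = ["i", "like", "a", "cat", "this", "dog", "is"]
display(count_matrix(toy_bigram_counts, toy_vocabulary))
display(prob_matrix(toy_bigram_counts, toy_vocabulary, k=1))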

Evaluation 

To evaluate a language model, we usually use the “perplexity” metric: the lower the perplexity, the better the model fits the held-out data.
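
For a sentence of length N (after padding), perplexity is the inverse probability of the sentence, normalised by its length:

PP(W) = ( ∏_t 1 / P(w_t | w_{t-n}, …, w_{t-1}) )^(1/N)

where each conditional probability is the smoothed estimate defined above.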

def perplexity(sentence, n_gram_counts, nplus1_gram_counts, vocabulary_size, k=1.0):
    # infer n from the length of the keys in the n-gram count dictionary
    n = len(list(n_gram_counts.keys())[0])
    sentence = ["<s>"] * n + sentence + ["<e>"]
    sentence = tuple(sentence)

    N = len(sentence)
    product_pi = 1.0

    for t in range(n, N):
        n_gram = sentence[t-n:t]
        word = sentence[t]
        probability = prob_for_single_word(word, n_gram, n_gram_counts, nplus1_gram_counts, vocabulary_size, k=k)
        product_pi *= 1 / probability

    perplexity = product_pi**(1/float(N))
    return perplexity
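
For instance, we can score one held-out sentence under a bigram model built from the training data (a minimal sketch; the vocabulary size includes the <e> and <unk> tokens, matching the probs function above):

# Evaluate a test sentence with a bigram model
unigram_counts = count_n_grams(final_train, 1)
bigram_counts = count_n_grams(final_train, 2)
print(perplexity(final_test[0], unigram_counts, bigram_counts, len(vocabulary) + 2, k=1.0))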

Building the Auto-Complete System

Now, we’ll write the code for suggesting a single word, and then extend it to return one suggestion from each n-gram model:

def auto_complete(previous_tokens, n_gram_counts, nplus1_gram_counts, vocabulary, k=1.0, start_with=None):
    n = len(list(n_gram_counts.keys())[0]) 
    previous_n_gram = previous_tokens[-n:]
    probabilities = probs(previous_n_gram,n_gram_counts, nplus1_gram_counts,vocabulary, k=k)

    suggestion = None
    max_prob = 0

    for word, prob in probabilities.items():
        if start_with is not None:
            if not word.startswith(start_with):
                continue

        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob


def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        nplus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = auto_complete(previous_tokens, n_gram_counts,
                                    nplus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(final_train, n)
    n_gram_counts_list.append(n_model_counts)


previous_tokens = ["i", "am", "to"]
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)
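
The start_with argument can also be used to constrain the suggestions to a given prefix, for example:

# Only suggest words starting with "d"
tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with="d")
display(tmp_suggest5)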
