Example 5.3
Word-level simulation of English using Markov chains. The data used in this example can be downloaded from here.
import re
from collections import defaultdict, Counter
import numpy as np
from random import randint, choice
import matplotlib.pyplot as plt
import time
# Seeded NumPy random generator; used below to sample the next word
rng = np.random.default_rng(19)
def read_words(path, pattern=' +'):
    """Read a text file and split its contents into word tokens.

    Parameters
    ----------
    path : str
        Path of the text file to read (decoded as UTF-8).
    pattern : str, optional
        Regular expression used as the token separator. The default splits
        on runs of spaces only, so a line break stays inside the token it
        occurs in (e.g. ``'your\\ninvestigations'``). Pass ``r'\\s+'`` to
        split on any whitespace instead.

    Returns
    -------
    list of str
        The tokens in the order they appear in the file.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding='utf-8') as f:
        return re.split(pattern, f.read())


def build_vocabulary(words):
    """Assign an index to each unique word.

    The unique words are sorted before numbering. Iterating a ``set`` of
    strings directly gives an order that can differ between interpreter
    sessions, which would make the seeded simulation irreproducible.

    Returns
    -------
    (dict, dict)
        ``word_to_index`` and its inverse ``index_to_word``.
    """
    word_to_index = {word: idx for idx, word in enumerate(sorted(set(words)))}
    index_to_word = {idx: word for word, idx in word_to_index.items()}
    return word_to_index, index_to_word


def count_transitions(words, word_to_index):
    """Count how often each word is immediately followed by each other word.

    Returns a dense ``(num_words, num_words)`` integer array whose entry
    ``[i, j]`` is the number of times word ``i`` is followed by word ``j``.
    Memory use grows with the square of the vocabulary size.
    """
    num_words = len(word_to_index)
    counts = np.zeros((num_words, num_words), dtype=int)
    for w0, w1 in zip(words[:-1], words[1:]):
        counts[word_to_index[w0], word_to_index[w1]] += 1
    return counts


def normalise_rows(counts):
    """Convert transition counts to probabilities, row by row.

    Rows with no observed transitions are left as all zeros. The division
    is only carried out where the row total is positive, so no
    divide-by-zero warning is raised and no NaN values are produced.
    """
    row_totals = counts.sum(axis=1, keepdims=True)
    matrix = np.zeros(counts.shape, dtype=float)
    np.divide(counts, row_totals, out=matrix, where=row_totals > 0)
    return matrix


def simulate_words(start, steps, transition_matrix, word_to_index,
                   index_to_word, rng):
    """Generate successive words of the Markov chain, starting after ``start``.

    Yields at most ``steps`` words. The walk stops early when it reaches a
    word whose row in ``transition_matrix`` is all zeros (a word that was
    never followed by anything), because there is no distribution to
    sample from in that case.

    Raises
    ------
    KeyError
        If ``start`` is not a key of ``word_to_index``.
    """
    current = start
    for _ in range(steps):
        # get the transition probabilities out of the current word
        row = transition_matrix[word_to_index[current], :]
        if not row.any():
            return
        # sample the next index using the transition matrix
        idx_next = rng.choice(len(row), p=row)
        # convert the index back to the word
        current = index_to_word[idx_next]
        yield current


if __name__ == '__main__':
    # Reading the file
    words = read_words('holmes.txt')
    # Assign an index to each unique word
    word_to_index, index_to_word = build_vocabulary(words)
    # Create transition counts matrix
    num_words = len(word_to_index)
    transition_counts = count_transitions(words, word_to_index)
    # Convert counts to probabilities to get the transition matrix
    # (rows with no transitions stay all-zero)
    transition_matrix = normalise_rows(transition_counts)
    # visualise the transition matrix
    # plt.imshow((transition_matrix + 1e-8))
    # plt.colorbar()
    # plt.show()
    # simulate a sentence
    T = 100
    # w0 = choice(words[:-1])
    w0 = 'Sherlock'
    print('The first word is', w0, '\n')
    print(w0)
    for w1 in simulate_words(w0, T, transition_matrix, word_to_index,
                             index_to_word, rng):
        print(w1, end=' ')
        w0 = w1
The first word is Sherlock
Sherlock
Holmes would come. He noticed such a small slip into which she said, “you have had enquired in the sofa in letters a better leave the presence of their register, and raised his plan of the frightened horse on it less wide than you upon Tuesday, the irresistible
conclusion that a stick, it mean? Surely you paid unusual care how your
investigations go,” he laps
it up well?” he come hurtling down the door open.
Outside all united should think of a copyright in
the line of Deduction and some luggage was more thought gave the ash from his powers upon the farm-house. John, cooped
<ipython-input-1-9aaed35cf61a>:26: RuntimeWarning: invalid value encountered in divide
transition_matrix = transition_counts / transition_counts.sum(axis=1, keepdims=True)