# Example 5.3

Word level simulation of English using Markov chains. The data used in this example can be downloaded from [here](holmes.txt).

In [4]:
import re
from collections import defaultdict, Counter
import numpy as np
from random import randint, choice
import matplotlib.pyplot as plt
import time

rng = np.random.default_rng(19)

# Reading the file
with open('holmes.txt') as f:
    words = re.split(' +', f.read())

# Assign an index to each unique word
word_to_index = {word: idx for idx, word in enumerate(set(words))}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Create transition counts matrix
num_words = len(word_to_index)
transition_counts = np.zeros((num_words, num_words), dtype=int)

for w0, w1 in zip(words[:-1], words[1:]):
    transition_counts[word_to_index[w0], word_to_index[w1]] += 1

# Convert counts to probabilities to get the transition matrix
transition_matrix = transition_counts / transition_counts.sum(axis=1, keepdims=True)

# Handle potential NaNs (rows with no transitions)
transition_matrix[np.isnan(transition_matrix)] = 0

# visualise the transition matrix

# plt.imshow((transition_matrix + 1e-8))
# plt.colorbar()
# plt.show()

# simulate a sentence
T = 100
# w0 = choice(words[:-1])
w0 = 'Sherlock'
print('The first word is', w0, '\n')

print(w0)

for _ in range(T):
    # get the index of w0
    idx = word_to_index[w0]
    # sample the next index using the transition matrix
    idx_next = rng.choice(num_words, p=transition_matrix[idx, :])
    # convert the index back to the word
    w1 = index_to_word[idx_next]
    print(w1, end=' ')
    w0 = w1


The first word is Sherlock 

Sherlock
Holmes with my plans
arranged by the organization which sent to give us leave,” the other side.
I gave a first-class chemist; but, as well on. “Surely there was so
extraordinarily ample and went in her in
astonishment, and see that is that they were ushered
into a way by the mountains and going
over it, and my way into a
hearty laugh.

“I wouldn’t have it down.

“That’s better,” said I; “but I saw was in the
valley, and played with a lot of the roadway without leaving the United States without leaving the distance, Jefferson Hope promptly, remembering the
countersign which you do not said the time before me 

  transition_matrix = transition_counts / transition_counts.sum(axis=1, keepdims=True)
