import string
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from urllib.request import urlopen

# Read the whole Shakespeare corpus into memory as a single string.
with open('data/t8.shakespeare.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Prepare characters: only these are tracked; they define the matrix order.
# all_chars = list(string.ascii_letters + string.digits + string.punctuation + ' \n\t')
all_chars = list(string.ascii_lowercase)
# Map each tracked character to its row/column position once, so the fill
# loop below does dictionary lookups instead of repeatedly scanning the list.
char_to_index = {char: index for index, char in enumerate(all_chars)}

# Create bigrams (every pair of adjacent characters) and count occurrences.
# zip() stops at the shorter argument, so an empty or one-character text
# simply yields no bigrams.
bigrams = list(zip(text, text[1:]))
bigram_counts = Counter(bigrams)

# Prepare matrix of bigram occurrences: rows are indexed by the first
# character of a pair, columns by the second.
bigram_matrix = np.zeros((len(all_chars), len(all_chars)))

# Fill matrix with occurrences. Pairs containing any character outside
# all_chars are skipped.
# NOTE(review): text is not case-folded in the code shown, so pairs that
# involve uppercase letters are dropped here -- confirm this is intended.
for (char1, char2), count in bigram_counts.items():
    if char1 in char_to_index and char2 in char_to_index:
        i = char_to_index[char1]
        j = char_to_index[char2]
        bigram_matrix[i, j] = count

# Normalize matrix for relative occurrence (each cell becomes its share of all
# counted bigrams). If nothing was counted, keep an all-zero matrix rather
# than dividing by zero, which would fill the result with NaN.
total_count = bigram_matrix.sum()
if total_count > 0:
    relative_matrix = bigram_matrix / total_count
else:
    relative_matrix = np.zeros_like(bigram_matrix)

# Plot the normalized matrix as a heatmap. Rows follow the first character of
# a pair and columns the second, both in all_chars order.
tick_positions = np.arange(len(all_chars))
tick_font_size = 18

plt.figure(figsize=(10, 8))
plt.imshow(relative_matrix, cmap='hot', interpolation='nearest')
plt.colorbar(label='Relative Occurrence')

# Put one tick per character on each axis, labelled with the character itself.
plt.xticks(ticks=tick_positions, labels=all_chars, rotation=90, fontsize=tick_font_size)
plt.yticks(ticks=tick_positions, labels=all_chars, fontsize=tick_font_size)

# Title and axis labels are currently disabled:
# plt.title('Relative Occurrence of Bigrams in Shakespeare\'s Text')
# plt.xlabel('Next Character')
# plt.ylabel('Current Character')

# Save the figure to disk, then display it.
plt.savefig('bigram-model.png')
plt.show()
