Spell correction using Keras RNN in python

Spell correction using Keras RNN in python



I am doing spell correction using RNN, Below is the code I am using


from _future_ import print_function, division, unicode_literals

import os
import errno
from collections import Counter
from hashlib import sha256
import re
import json
import itertools
import logging
import requests
import numpy as np
import pandas as pd
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros # pylint:disable=no-name-in-module

from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout, recurrent
from keras.callbacks import Callback

# Set a logger for the module
LOGGER = logging.getLogger(__name__) # Every log will use the module name
LOGGER.addHandler(logging.StreamHandler())
LOGGER.setLevel(logging.DEBUG)

random_seed(123) # Reproducibility

class Configuration(object):
"""Dump stuff here"""
CONFIG = Configuration()
#pylint:disable=attribute-defined-outside-init
# Parameters for the model:
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.hidden_size = 500
CONFIG.initialization = "he_normal" # : Gaussian initialization scaled by fan-in (He et al., 2014)
CONFIG.number_of_chars = 100
CONFIG.max_input_len = 20
CONFIG.inverted = True

# parameters for the training:
CONFIG.batch_size = 100 # As the model changes in size, play with the batch size to best fit the process in memory
CONFIG.epochs = 500 # due to mini-epochs.
CONFIG.steps_per_epoch = 1000 # This is a mini-epoch. Using News 2013 an epoch would need to be ~60K.
CONFIG.validation_steps = 10
CONFIG.number_of_iterations = 10

dataset=pd.read_csv("input_spell.csv")
input_data=dataset['input'].tolist()
input_data1=str(input_data)
output_data=dataset['output'].tolist()
output_data1=str(output_data)


chars=list("abcdefghijklmnopqrstuvwxyz")

MIN_INPUT_LEN = 1
AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")

class CharacterTable(object):
"""
Given a set of characters:
+ Encode them to a one hot integer representation
+ Decode the one hot integer representation to their character output
+ Decode a vector of probabilities to their character output
"""
def __init__(self, chars):
self.chars = sorted(set(chars))
self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

@property
def size(self):
"""The number of chars"""
return len(self.chars)

def encode(self, C, maxlen):
"""Encode as one-hot"""
X = np.zeros((maxlen, len(self.chars)), dtype=np.bool) # pylint:disable=no-member
for i, c in enumerate(C):
X[i, self.char_indices[c]] = 1
return X

def decode(self, X, calc_argmax=True):
"""Decode from one-hot"""
if calc_argmax:
X = X.argmax(axis=-1)
return ''.join(self.indices_char[x] for x in X if x)



def _vectorize(questions, answers, ctable):
"""Vectorize the data as numpy arrays"""
len_of_questions = len(questions)
X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),dtype=int)
print("inputchars")
for i in range(len(questions)):
print(i)
sentence = questions.pop()
print(sentence)
for j, c in enumerate(sentence):
print(j)
print(c)
try:
X[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass # Padding
y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size), dtype=int)
print("outputchars")
for i in range(len(answers)):
print(i)
sentence = answers.pop()
print(sentence)
for j, c in enumerate(sentence):
try:
y[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass # Padding
return X, y



def vectorize(questions, answers, chars=None):
"""Vectorize the questions and expected answers"""
print('Vectorization...')
chars = chars or CHARS
ctable = CharacterTable(chars)
print("inputdata before _vec")
print(questions)
X, y = _vectorize(questions, answers, ctable)
# Explicitly set apart 10% for validation data that we never train over
#print("input after _vec")
#print(X)
#print("output after _vec")
#print(y)

print(X.shape)
print(y.shape)

return X, y, CONFIG.max_input_len, ctable

def generate_model(output_len, chars=None):
"""Generate the model"""
print('Build model...')
chars = chars or CHARS
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of hidden_size
# note: in a situation where your input sequences have a variable length,
# use input_shape=(None, nb_feature).
for layer_number in range(CONFIG.input_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization,
return_sequences=layer_number + 1 < CONFIG.input_layers))
model.add(Dropout(CONFIG.amount_of_dropout))
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(output_len))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in range(CONFIG.output_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True, kernel_initializer=CONFIG.initialization))
model.add(Dropout(CONFIG.amount_of_dropout))

# For each of step of the output sequence, decide which character should be chosen
model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model


def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
"""Iterative Training"""
# Train the model each generation and show predictions against the validation dataset
for iteration in range(1, CONFIG.number_of_iterations):
#print()
#print('-' * 50)
#print('Iteration', iteration)
model.fit(X_train, y_train, batch_size=CONFIG.batch_size, epochs=CONFIG.epochs,
validation_data=(X_val, y_val))
#print_random_predictions(model, ctable, X_val, y_val)


def print_random_predictions(model, ctable, X_val, y_val):
"""Select 10 samples from the validation set at random so we can visualize errors"""
print()
for _ in range(10):
#ind = random_randint(0, len(X_val))
#rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])] # pylint:disable=no-member
rowX, rowy = X_val, y_val
preds = model.predict_classes(rowX, verbose=0)
print("preds")
print(preds)
q = ctable.decode(rowX[0])
print("q-value")
print(q)
correct = ctable.decode(rowy[0])
print("correct")
print(correct)
guess = ctable.decode(preds[0], calc_argmax=False)
print("predicted")
print(guess)
return guess


X_train,y_train, y_maxlen, ctable = vectorize(input_data, output_data, chars)
print ("y_maxlen, chars", y_maxlen, "".join(chars))
model = generate_model(y_maxlen, chars)
iterate_training(model, X_train, y_train, X_train, y_train, ctable)
for inp in X_train:
inputarray = ctable.decode(inp)
print(inputarray)
prediction=model.predict_classes(X_train, verbose=0)
for p in prediction:
guess = ctable.decode(p, calc_argmax=False)
print(guess)



Below is the content of file input_spell.csv


input output
sol solid
kt kit
whl wheel
abr abrasive
unv universal
pp pipe
plt plate
accum accumulator



I have taken code from deepspell



Below is my prediction result for training set


ccumultorrrrrrrrrr
plteeeeeeeeeeeeeeee
pipeeellllllllllllll
universllllllllllll
brsiveeellllllllll
wheellllllllllllllll
kitteeeeeeeeeeeellll
solidddddddddddddddd



Input and output vector is of size



Input:(8, 20, 26)
output:(8, 20, 26)



so I am getting prediction result of length 20



I have a very basic understanding of RNN and LSTM



Updating
When i try to visualize model.summary() i got


model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_1 (LSTM) (None, None, 500) 1054000
_________________________________________________________________
dropout_1 (Dropout) (None, None, 500) 0
_________________________________________________________________
lstm_2 (LSTM) (None, 500) 2002000
_________________________________________________________________
dropout_2 (Dropout) (None, 500) 0
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 500) 0
_________________________________________________________________
lstm_3 (LSTM) (None, 20, 500) 2002000
_________________________________________________________________
dropout_3 (Dropout) (None, 20, 500) 0
_________________________________________________________________
lstm_4 (LSTM) (None, 20, 500) 2002000
_________________________________________________________________
dropout_4 (Dropout) (None, 20, 500) 0
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 26) 13026
_________________________________________________________________
activation_1 (Activation) (None, 20, 26) 0
=================================================================
Total params: 7,073,026
Trainable params: 7,073,026
Non-trainable params: 0
_________________________________________________________________



can anyone tell me where i have gone wrong?






Have you tried to visualize the model by printing model.summary() to try and spot the error?

– Mohamed Elzarei
Mar 19 '18 at 19:22


model.summary()






@MohamedElzare I have edited and included model.summary() in question

– Ranjana Girish
Mar 20 '18 at 9:24


model.summary()




1 Answer
1



I suggest double checking the max_input_len, output_len, and y_maxlen variables, Does your model train succesfully? What it's validation error?
If it seems to be training succesfully, then I suspect a problem with the shapes of the vectorized data of the generalization step.


max_input_len


output_len


y_maxlen



Thanks for contributing an answer to Stack Overflow!



But avoid



To learn more, see our tips on writing great answers.



Required, but never shown



Required, but never shown




By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.

Popular posts from this blog

𛂒𛀶,𛀽𛀑𛂀𛃧𛂓𛀙𛃆𛃑𛃷𛂟𛁡𛀢𛀟𛁤𛂽𛁕𛁪𛂟𛂯,𛁞𛂧𛀴𛁄𛁠𛁼𛂿𛀤 𛂘,𛁺𛂾𛃭𛃭𛃵𛀺,𛂣𛃍𛂖𛃶 𛀸𛃀𛂖𛁶𛁏𛁚 𛂢𛂞 𛁰𛂆𛀔,𛁸𛀽𛁓𛃋𛂇𛃧𛀧𛃣𛂐𛃇,𛂂𛃻𛃲𛁬𛃞𛀧𛃃𛀅 𛂭𛁠𛁡𛃇𛀷𛃓𛁥,𛁙𛁘𛁞𛃸𛁸𛃣𛁜,𛂛,𛃿,𛁯𛂘𛂌𛃛𛁱𛃌𛂈𛂇 𛁊𛃲,𛀕𛃴𛀜 𛀶𛂆𛀶𛃟𛂉𛀣,𛂐𛁞𛁾 𛁷𛂑𛁳𛂯𛀬𛃅,𛃶𛁼

ữḛḳṊẴ ẋ,Ẩṙ,ỹḛẪẠứụỿṞṦ,Ṉẍừ,ứ Ị,Ḵ,ṏ ṇỪḎḰṰọửḊ ṾḨḮữẑỶṑỗḮṣṉẃ Ữẩụ,ṓ,ḹẕḪḫỞṿḭ ỒṱṨẁṋṜ ḅẈ ṉ ứṀḱṑỒḵ,ḏ,ḊḖỹẊ Ẻḷổ,ṥ ẔḲẪụḣể Ṱ ḭỏựẶ Ồ Ṩ,ẂḿṡḾồ ỗṗṡịṞẤḵṽẃ ṸḒẄẘ,ủẞẵṦṟầṓế

⃀⃉⃄⃅⃍,⃂₼₡₰⃉₡₿₢⃉₣⃄₯⃊₮₼₹₱₦₷⃄₪₼₶₳₫⃍₽ ₫₪₦⃆₠₥⃁₸₴₷⃊₹⃅⃈₰⃁₫ ⃎⃍₩₣₷ ₻₮⃊⃀⃄⃉₯,⃏⃊,₦⃅₪,₼⃀₾₧₷₾ ₻ ₸₡ ₾,₭⃈₴⃋,€⃁,₩ ₺⃌⃍⃁₱⃋⃋₨⃊⃁⃃₼,⃎,₱⃍₲₶₡ ⃍⃅₶₨₭,⃉₭₾₡₻⃀ ₼₹⃅₹,₻₭ ⃌