Spell correction using Keras RNN in python
Spell correction using Keras RNN in python
I am doing spell correction using RNN, Below is the code I am using
from _future_ import print_function, division, unicode_literals
import os
import errno
from collections import Counter
from hashlib import sha256
import re
import json
import itertools
import logging
import requests
import numpy as np
import pandas as pd
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros # pylint:disable=no-name-in-module
from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout, recurrent
from keras.callbacks import Callback
# Set a logger for the module
LOGGER = logging.getLogger(__name__) # Every log will use the module name
LOGGER.addHandler(logging.StreamHandler())
LOGGER.setLevel(logging.DEBUG)
random_seed(123) # Reproducibility
class Configuration(object):
"""Dump stuff here"""
CONFIG = Configuration()
#pylint:disable=attribute-defined-outside-init
# Parameters for the model:
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.hidden_size = 500
CONFIG.initialization = "he_normal" # : Gaussian initialization scaled by fan-in (He et al., 2014)
CONFIG.number_of_chars = 100
CONFIG.max_input_len = 20
CONFIG.inverted = True
# parameters for the training:
CONFIG.batch_size = 100 # As the model changes in size, play with the batch size to best fit the process in memory
CONFIG.epochs = 500 # due to mini-epochs.
CONFIG.steps_per_epoch = 1000 # This is a mini-epoch. Using News 2013 an epoch would need to be ~60K.
CONFIG.validation_steps = 10
CONFIG.number_of_iterations = 10
dataset=pd.read_csv("input_spell.csv")
input_data=dataset['input'].tolist()
input_data1=str(input_data)
output_data=dataset['output'].tolist()
output_data1=str(output_data)
chars=list("abcdefghijklmnopqrstuvwxyz")
MIN_INPUT_LEN = 1
AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
class CharacterTable(object):
"""
Given a set of characters:
+ Encode them to a one hot integer representation
+ Decode the one hot integer representation to their character output
+ Decode a vector of probabilities to their character output
"""
def __init__(self, chars):
self.chars = sorted(set(chars))
self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
@property
def size(self):
"""The number of chars"""
return len(self.chars)
def encode(self, C, maxlen):
"""Encode as one-hot"""
X = np.zeros((maxlen, len(self.chars)), dtype=np.bool) # pylint:disable=no-member
for i, c in enumerate(C):
X[i, self.char_indices[c]] = 1
return X
def decode(self, X, calc_argmax=True):
"""Decode from one-hot"""
if calc_argmax:
X = X.argmax(axis=-1)
return ''.join(self.indices_char[x] for x in X if x)
def _vectorize(questions, answers, ctable):
"""Vectorize the data as numpy arrays"""
len_of_questions = len(questions)
X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),dtype=int)
print("inputchars")
for i in range(len(questions)):
print(i)
sentence = questions.pop()
print(sentence)
for j, c in enumerate(sentence):
print(j)
print(c)
try:
X[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass # Padding
y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size), dtype=int)
print("outputchars")
for i in range(len(answers)):
print(i)
sentence = answers.pop()
print(sentence)
for j, c in enumerate(sentence):
try:
y[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass # Padding
return X, y
def vectorize(questions, answers, chars=None):
"""Vectorize the questions and expected answers"""
print('Vectorization...')
chars = chars or CHARS
ctable = CharacterTable(chars)
print("inputdata before _vec")
print(questions)
X, y = _vectorize(questions, answers, ctable)
# Explicitly set apart 10% for validation data that we never train over
#print("input after _vec")
#print(X)
#print("output after _vec")
#print(y)
print(X.shape)
print(y.shape)
return X, y, CONFIG.max_input_len, ctable
def generate_model(output_len, chars=None):
"""Generate the model"""
print('Build model...')
chars = chars or CHARS
model = Sequential()
# "Encode" the input sequence using an RNN, producing an output of hidden_size
# note: in a situation where your input sequences have a variable length,
# use input_shape=(None, nb_feature).
for layer_number in range(CONFIG.input_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization,
return_sequences=layer_number + 1 < CONFIG.input_layers))
model.add(Dropout(CONFIG.amount_of_dropout))
# For the decoder's input, we repeat the encoded input for each time step
model.add(RepeatVector(output_len))
# The decoder RNN could be multiple layers stacked or a single layer
for _ in range(CONFIG.output_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True, kernel_initializer=CONFIG.initialization))
model.add(Dropout(CONFIG.amount_of_dropout))
# For each of step of the output sequence, decide which character should be chosen
model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
"""Iterative Training"""
# Train the model each generation and show predictions against the validation dataset
for iteration in range(1, CONFIG.number_of_iterations):
#print()
#print('-' * 50)
#print('Iteration', iteration)
model.fit(X_train, y_train, batch_size=CONFIG.batch_size, epochs=CONFIG.epochs,
validation_data=(X_val, y_val))
#print_random_predictions(model, ctable, X_val, y_val)
def print_random_predictions(model, ctable, X_val, y_val):
"""Select 10 samples from the validation set at random so we can visualize errors"""
print()
for _ in range(10):
#ind = random_randint(0, len(X_val))
#rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])] # pylint:disable=no-member
rowX, rowy = X_val, y_val
preds = model.predict_classes(rowX, verbose=0)
print("preds")
print(preds)
q = ctable.decode(rowX[0])
print("q-value")
print(q)
correct = ctable.decode(rowy[0])
print("correct")
print(correct)
guess = ctable.decode(preds[0], calc_argmax=False)
print("predicted")
print(guess)
return guess
X_train,y_train, y_maxlen, ctable = vectorize(input_data, output_data, chars)
print ("y_maxlen, chars", y_maxlen, "".join(chars))
model = generate_model(y_maxlen, chars)
iterate_training(model, X_train, y_train, X_train, y_train, ctable)
for inp in X_train:
inputarray = ctable.decode(inp)
print(inputarray)
prediction=model.predict_classes(X_train, verbose=0)
for p in prediction:
guess = ctable.decode(p, calc_argmax=False)
print(guess)
Below is the content of file input_spell.csv
input output
sol solid
kt kit
whl wheel
abr abrasive
unv universal
pp pipe
plt plate
accum accumulator
I have taken code from deepspell
Below is my prediction result for training set
ccumultorrrrrrrrrr
plteeeeeeeeeeeeeeee
pipeeellllllllllllll
universllllllllllll
brsiveeellllllllll
wheellllllllllllllll
kitteeeeeeeeeeeellll
solidddddddddddddddd
Input and output vector is of size
Input:(8, 20, 26)
output:(8, 20, 26)
so I am getting prediction result of length 20
I have a very basic understanding of RNN and LSTM
Updating
When i try to visualize model.summary() i got
model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_1 (LSTM) (None, None, 500) 1054000
_________________________________________________________________
dropout_1 (Dropout) (None, None, 500) 0
_________________________________________________________________
lstm_2 (LSTM) (None, 500) 2002000
_________________________________________________________________
dropout_2 (Dropout) (None, 500) 0
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 500) 0
_________________________________________________________________
lstm_3 (LSTM) (None, 20, 500) 2002000
_________________________________________________________________
dropout_3 (Dropout) (None, 20, 500) 0
_________________________________________________________________
lstm_4 (LSTM) (None, 20, 500) 2002000
_________________________________________________________________
dropout_4 (Dropout) (None, 20, 500) 0
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 26) 13026
_________________________________________________________________
activation_1 (Activation) (None, 20, 26) 0
=================================================================
Total params: 7,073,026
Trainable params: 7,073,026
Non-trainable params: 0
_________________________________________________________________
can anyone tell me where i have gone wrong?
model.summary()
@MohamedElzare I have edited and included
model.summary()
in question– Ranjana Girish
Mar 20 '18 at 9:24
model.summary()
1 Answer
1
I suggest double checking the max_input_len
, output_len
, and y_maxlen
variables, Does your model train succesfully? What it's validation error?
If it seems to be training succesfully, then I suspect a problem with the shapes of the vectorized data of the generalization step.
max_input_len
output_len
y_maxlen
Thanks for contributing an answer to Stack Overflow!
But avoid …
To learn more, see our tips on writing great answers.
Required, but never shown
Required, but never shown
By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.
Have you tried to visualize the model by printing
model.summary()
to try and spot the error?– Mohamed Elzarei
Mar 19 '18 at 19:22