XOR Deep Learning Example

The Problem

OpenAI recently released a set of open research questions. As a beginner in AI, I decided to tackle the beginner ‘Warmups’ they offered. You can view their blog post here:

⭐ Train an LSTM to solve the XOR problem: that is, given a sequence of bits, determine its parity. The LSTM should consume the sequence, one bit at a time, and then output the correct answer at the sequence’s end. Test the two approaches below

  • Generate a dataset of random 100,000 binary strings of length 50. Train the LSTM; what performance do you get?
  • Generate a dataset of random 100,000 binary strings, where the length of each string is independently and randomly chosen between 1 and 50. Train the LSTM. Does it succeed? What explains the difference?

The XOR Function

Input 1  Input 2  Output
0        0        0
0        1        1
1        0        1
1        1        0
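
Parity is just XOR folded across the whole string: the running value flips every time a 1 arrives, so it ends at 1 exactly when the string holds an odd number of ones. A quick sketch in plain Python (the example string is arbitrary):

# Parity as a running XOR over the bits
bits = [1, 0, 1, 1, 0]        # example string, chosen for illustration
parity = 0
for b in bits:
    parity ^= b               # flips whenever a 1 appears
print(parity)                 # 1: the string contains an odd number (three) of ones
# Equivalently, parity == sum(bits) % 2, which is how the labels are computed below.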

Problem 1: 100,000 Binary Strings Of Length 50

The code for solving this problem is adapted from christopher5106.

Setup


'''
Trains a 1D Grid LSTM network to learn XOR answers.
'''

from __future__ import print_function
import numpy as np
import theano
import theano.tensor as T
import lasagne

#Lasagne Seed for Reproducibility
lasagne.random.set_rng(np.random.RandomState(1))

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--iterations', type=int, default=100000, help='Number of iterations')
parser.add_argument('--bits', type=int, default=50, help='Number of bits in the input strings')
parser.add_argument('--hidden', type=int, default=2, help='Number of units in the two hidden (LSTM) layers')
parser.add_argument('--learning_rate', type=float, default=0.5, help='Optimization learning rate')
#parser.add_argument('--grad_clip', type=int, default=100, help='All gradients above this will be clipped')
parser.add_argument('--print_freq', type=int, default=500, help='How often should we check the output?')
parser.add_argument('--batch_size', type=int, default=1, help='Batch size')
parser.add_argument('--layers', type=int, default=2, help='Number of layers')
args = parser.parse_args()

print("Parameters:")
print(args)
# Number of batches between progress printouts (integer division keeps it a whole number)
args.print_batch_freq = args.print_freq // args.batch_size + 1

Random String Generator


def gen_data(bits=args.bits, batch_size=args.batch_size):
    # Each row is a random binary string; the target is its parity (the XOR of all its bits)
    x = np.random.randint(2, size=(batch_size, bits))
    y = x.sum(axis=1) % 2
    return x, y
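
With the default arguments this yields one string of 50 bits per call and its parity label; a quick illustrative check (the printed values will vary):

x, y = gen_data()
print(x.shape)   # (1, 50) with the defaults batch_size=1, bits=50
print(y)         # the parity label, e.g. array([1])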

Network


print("Building network ...")

# One bit of the sequence enters through l_in; a linear DenseLayer maps it to the
# hidden size and feeds the LSTM as its initial hidden state (hid_init).
l_in = lasagne.layers.InputLayer(shape=(None, 1))
l_in_zero = lasagne.layers.InputLayer(shape=(None, args.layers, 1))
l_lin = lasagne.layers.DenseLayer(l_in, num_units=args.hidden, nonlinearity=None)

# The LSTM itself consumes a zero sequence; the bit's information flows in through
# hid_init, and only the final hidden state is returned.
l_forward = lasagne.layers.LSTMLayer(
    l_in_zero, args.hidden,
    nonlinearity=lasagne.nonlinearities.tanh, hid_init=l_lin, only_return_final=True)

# Linear layer followed by a 2-way softmax over the parity classes {0, 1}
l_lin_out = lasagne.layers.DenseLayer(l_forward, num_units=2, nonlinearity=None)
l_out = lasagne.layers.DenseLayer(l_lin_out, num_units=2, nonlinearity=lasagne.nonlinearities.softmax)

target_values = T.ivector('target_output')

network_output = lasagne.layers.get_output(l_out)

cost = T.nnet.categorical_crossentropy(network_output,target_values).mean()
accuracy = lasagne.objectives.categorical_accuracy(network_output,target_values).mean()

# Trainable parameters get the adadelta update below; the remaining (non-trainable)
# parameters get a separate adagrad update
all_params = lasagne.layers.get_all_params(l_out, trainable=True)
all_params2 = lasagne.layers.get_all_params(l_out, trainable=False)

Theano Functions

print("Computing updates ...")
updates = lasagne.updates.adadelta(cost, all_params, args.learning_rate)
updates2 = lasagne.updates.adagrad(cost, all_params2, args.learning_rate)

print("Compiling functions ...")
train = theano.function([l_in.input_var, l_in_zero.input_var, target_values], cost, updates=updates, allow_input_downcast=True)
train2 = theano.function([l_in.input_var, l_in_zero.input_var, target_values], cost, updates=updates2, allow_input_downcast=True)
compute_cost = theano.function([l_in.input_var, l_in_zero.input_var, target_values], cost, allow_input_downcast=True)
compute_accuracy = theano.function([l_in.input_var, l_in_zero.input_var, target_values],
    accuracy, allow_input_downcast=True)

probs = theano.function([l_in.input_var, l_in_zero.input_var],network_output,allow_input_downcast=True)

Training

print("Training ...")
print("The average loss and accuracy will be printed every {} iterations".format(args.print_batch_freq*args.batch_size))
num_batch_print_iter = args.iterations / args.batch_size / args.print_batch_freq + 1
act_num_batches = int(num_batch_print_iter * args.print_batch_freq)
all_cost = np.zeros((act_num_batches))
all_accuracy = np.zeros((act_num_batches))

for it_out in range(int(num_batch_print_iter)):
    for it_in in range(int(args.print_batch_freq)):
        x, y = gen_data()
        # Feed every bit except the last one, one at a time, via train2
        stoplen = (len(x[0]) - 1)
        for i in range(0, stoplen):
            x_zero = np.zeros((args.batch_size, args.layers, 1), dtype='int32')
            batch_cost = train2(np.reshape(x[0][i], newshape=(1, 1)), x_zero, y)
        # The final bit goes through train, and the cost and accuracy are recorded
        x_zero = np.zeros((args.batch_size, args.layers, 1), dtype='int32')
        batch_iter = int(it_out * args.print_batch_freq + it_in + 1)
        batch_cost = train(np.reshape(x[0][len(x[0]) - 1], newshape=(1, 1)), x_zero, y)
        batch_accuracy = compute_accuracy(np.reshape(x[0][len(x[0]) - 1], newshape=(1, 1)), x_zero, y)

        all_cost[batch_iter - 1] = batch_cost
        all_accuracy[batch_iter - 1] = batch_accuracy
    start_index = int(it_out * args.print_batch_freq)
    end_index = int((it_out + 1) * args.print_batch_freq)
    av_cost = all_cost[start_index:end_index].mean()
    av_accuracy = all_accuracy[start_index:end_index].mean()
    np.savetxt('cost.txt', all_cost[:end_index], delimiter=',')          # per-batch loss so far
    np.savetxt('accuracy.txt', all_accuracy[:end_index], delimiter=',')  # per-batch accuracy so far

    print("Iteration {} average loss = {} average accuracy = {}".format(batch_iter*args.batch_size,
av_cost,av_accuracy))
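
The loop writes the running per-batch loss and accuracy to cost.txt and accuracy.txt. A minimal sketch for inspecting them afterwards (assuming matplotlib is installed; the file names match those used above):

import numpy as np
import matplotlib.pyplot as plt

cost = np.loadtxt('cost.txt', delimiter=',')
acc = np.loadtxt('accuracy.txt', delimiter=',')

plt.plot(cost, label='loss')        # per-batch loss over training
plt.plot(acc, label='accuracy')     # per-batch accuracy over training
plt.xlabel('batch')
plt.legend()
plt.show()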

Problem 2: 100,000 Binary Strings Of Random Length 1 To 50

Code Differences

parser.add_argument('--hidden', type=int, default=100, help='Number of units in the two hidden (LSTM) layers')
parser.add_argument('--layers', type=int, default=2, help='Number of layers')
args = parser.parse_args()

def gen_data(bits=args.bits, batch_size=args.batch_size):
    # Pick a length uniformly between 1 and bits (randint's upper bound is exclusive)
    bitsran = np.random.randint(1, bits + 1)
    x = np.random.randint(2, size=(batch_size, bitsran))
    y = x.sum(axis=1) % 2
    return x, y
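
With this change each call draws a fresh length, so consecutive batches have different shapes; a quick illustrative check (the lengths shown are just examples):

x1, y1 = gen_data()
x2, y2 = gen_data()
print(x1.shape, x2.shape)   # e.g. (1, 37) (1, 8): the length now varies from call to call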

Conclusion
