
"""
Implementation of the character-level Elman RNN model.
Written by Ngoc-Quan Pham, based on Andrej Karpathy's CS231n lecture code.
BSD License
"""
import numpy as np
from random import uniform
import sys

# Since numpy doesn't have a function for sigmoid
# we implement it manually here
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# The derivative of the sigmoid function
# Note: the argument y is the sigmoid output, i.e. y = sigmoid(x)
def dsigmoid(y):
    return y * (1 - y)

# The derivative of the tanh function
# Note: the argument x is expected to be the tanh output, i.e. x = tanh(a)
def dtanh(x):
    return 1 - x * x

# The numerically stable softmax implementation
def softmax(x):
    # assuming x shape is [feature_size, batch_size]
    e_x = np.exp(x - np.max(x, axis=0))
    return e_x / e_x.sum(axis=0)
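
# A quick sanity check (added for illustration; not part of the original script):
# subtracting the per-column maximum before exponentiating does not change the
# softmax output, it only prevents overflow in np.exp for large scores.
_softmax_demo = np.array([[1000.0], [1001.0]])
assert np.allclose(softmax(_softmax_demo), softmax(_softmax_demo - 500.0))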

# data I/O
# should be a simple plain text file. A sample from Shakespeare's "Hamlet" is provided in data/
data = open('data/input.txt', 'r').read()
chars = sorted(list(set(data))) # added sorted so that the character list is deterministic
print(chars)
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

# hyper-parameters deciding the network size
emb_size = 4  # character embedding size
seq_length = 32  # number of steps to unroll the RNN for the truncated back-propagation algorithm
hidden_size = 32
# learning rate for the Adagrad algorithm (not tuned, only chosen so that the model learns)
learning_rate = 0.02
std = 0.02  # the standard deviation for parameter initialization
batch_size = 4

# model parameters
# Here we initialize the parameters from a zero-mean normal distribution scaled by std (0.02)

# word embedding: each character in the vocabulary is mapped to a vector with $emb_size$ neurons
# Transform one-hot vectors to embedding X
Wex = np.random.randn(emb_size, vocab_size) * std

# weight to transform input X to hidden H
Wxh = np.random.randn(hidden_size, emb_size) * std

# weight to transform previous hidden states H_{t-1} to hidden H_t
Whh = np.random.randn(hidden_size, hidden_size) * std # hidden to hidden

# Output layer: transforming the hidden states H to output layer
Why = np.random.randn(vocab_size, hidden_size) * std # hidden to output

# The biases are typically initialized as zeros, but here they are also initialized with small random values.
bh = np.random.randn(hidden_size, 1) * std  # hidden bias
by = np.random.randn(vocab_size, 1) * std  # output bias

# These variables are the accumulated squared gradients ("memory") for the Adagrad algorithm
# Each parameter in the network needs one accumulator correspondingly
mWex, mWxh, mWhh, mWhy = np.zeros_like(Wex), np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)

# convert the character stream into a stream of integer indices (the whole corpus in memory)
data_stream = np.asarray([char_to_ix[char] for char in data])
print(data_stream.shape)

bound = (data_stream.shape[0] // (seq_length * batch_size)) * (seq_length * batch_size)
cut_stream = data_stream[:bound]
cut_stream = np.reshape(cut_stream, (batch_size, -1))
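
# Sanity check on the reshaped stream (added for illustration): each of the
# batch_size rows is one contiguous chunk of the corpus, and every row length
# is a multiple of seq_length.
assert cut_stream.shape == (batch_size, bound // batch_size)
assert cut_stream.shape[1] % seq_length == 0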

def forward(inputs, labels, memory, batch_size=1):
    """
    Here we use dictionaries to store the activations over time.
    Note on the back-propagation implementation:
    back-propagation uses dynamic programming to compute the gradients efficiently,
    so we need to store the activations over the course of the forward pass;
    in the backward pass we will use these activations to compute the gradients
    (otherwise we would need to recompute them).
    """
    prev_h = memory

    # those variables stand for:
    # xs: inputs to the RNN at each timestep (embeddings)
    # cs: one-hot character vectors at each timestep
    # hs: hidden states at each timestep
    # os: unnormalized output scores at each timestep
    # ps: probability distributions at each timestep
    # ys: one-hot label vectors at each timestep
    xs, cs, hs, os, ps, ys = {}, {}, {}, {}, {}, {}

    # the first memory (before training) is the previous (or initial) hidden state
    hs[-1] = np.copy(prev_h)

    # the loss will be accumulated over time
    loss = 0

    for t in range(inputs.shape[1]):

        # one-hot vector representation for the character input at time t
        cs[t] = np.zeros((vocab_size, batch_size))
        for b in range(batch_size):
            cs[t][inputs[b][t]][b] = 1

        # transform the one-hot vector to its embedding
        # x = Wex . c
        xs[t] = np.dot(Wex, cs[t])

        # computation for the hidden state of the network
        # H_t = tanh( Whh . H_{t-1} + Wxh . x_t + bh )
        h_pre_activation = np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh
        hs[t] = np.tanh(h_pre_activation)

        # output layer:
        # these are the unnormalized log probabilities for the next chars (across the vocabulary)
        os[t] = np.dot(Why, hs[t]) + by

        # softmax layer to get normalized probabilities:
        ps[t] = softmax(os[t])

        # the label is also a one-hot vector
        ys[t] = np.zeros((vocab_size, batch_size))
        for b in range(batch_size):
            ys[t][labels[b][t]][b] = 1

        # cross-entropy loss at time t:
        loss_t = np.sum(-np.log(ps[t]) * ys[t])
        loss += loss_t

    # packaging the activations to use in the backward pass
    activations = (xs, cs, hs, os, ps, ys)
    last_hidden = hs[inputs.shape[1] - 1]
    return loss, activations, last_hidden
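
# Note (added comment): the loss returned by forward() is summed over all seq_length
# timesteps and over the whole batch; the training loop divides by batch_size when
# updating smooth_loss, and backward() divides the output gradient by the batch size
# when scale=True, so the parameter updates behave like per-example averages.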

def backward(activations, clipping=True, scale=True):
    """
    During the backward pass we follow the track of the forward pass.
    The stored activations are needed so that we can avoid unnecessary re-computation.
    """

    # Gradient initialization
    # Each parameter has a corresponding gradient (of the loss with respect to that parameter)
    dWex, dWxh, dWhh, dWhy = np.zeros_like(Wex), np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)

    xs, cs, hs, os, ps, ys = activations

    # here we need the gradient w.r.t. the hidden layer at the final time step;
    # since this hidden layer is not connected to any future time step,
    # we can initialize it as a zero vector
    dh = np.zeros_like(hs[0])
    bsz = dh.shape[-1]
    # the backward pass starts from the final step of the chain in the forward pass;
    # the number of timesteps is recovered from the stored activations
    for t in reversed(range(len(ps))):

        # first, we need to compute the gradients of the variable closest to the loss function,
        # which is the softmax output p.
        # Here we skip directly to the gradient of the unnormalized scores o because
        # dL / do = p - y follows from the cross-entropy / softmax combination
        # (see the short derivation in the comment below)
        do = ps[t] - ys[t]
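        # Sketch of that derivation (added comment): with p = softmax(o) and
        # L = -sum_k y_k * log(p_k), the chain rule gives
        #   dL/do_k = sum_j (dL/dp_j) * (dp_j/do_k) = p_k * sum_j y_j - y_k = p_k - y_k,
        # using dp_j/do_k = p_j * (delta_jk - p_k) and sum_j y_j = 1 for one-hot labels.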

        if scale:
            do = do / bsz

        # the gradients w.r.t. the weights and the bias that were used to create o[t]
        dWhy += np.dot(do, hs[t].T)
        dby += np.sum(do, axis=-1, keepdims=True)

        # because h is connected to both o and the next h, we sum the gradients up
        dh = np.dot(Why.T, do) + dh

        # backprop through the activation function (tanh)
        dtanh_h = 1 - hs[t] * hs[t]
        dh_pre_activation = dtanh_h * dh  # because h = tanh(h_pre_activation)

        # next, since H = tanh( Whh . H + Wxh . x + bh )
        # we use dh to backprop to dWhh and dWxh

        # gradient of the bias and weight, similar to dby and dWhy
        # for the H term
        dbh += np.sum(dh_pre_activation, axis=-1, keepdims=True)
        dWhh += np.dot(dh_pre_activation, hs[t - 1].T)
        # we need this term for the recurrent connection (the previous bptt step needs it)
        dh = np.dot(Whh.T, dh_pre_activation)

        # similarly for the x term
        dWxh += np.dot(dh_pre_activation, xs[t].T)

        # backward through the embedding
        dx = np.dot(Wxh.T, dh_pre_activation)

        # finally backward to the embedding projection
        dWex += np.dot(dx, cs[t].T)

    if clipping:
        for dparam in [dWex, dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -1, 1, out=dparam)  # clip to mitigate exploding gradients

    gradients = (dWex, dWxh, dWhh, dWhy, dbh, dby)
    return gradients

def sample(h, seed_ix, n):
    """
    Sample a sequence of integers from the model.
    h is the memory state, seed_ix is the seed letter for the first time step.
    """
    c = np.zeros((vocab_size, 1))
    c[seed_ix] = 1
    generated_chars = []
    for t in range(n):
        x = np.dot(Wex, c)
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        o = np.dot(Why, h) + by
        p = softmax(o)

        # given the distribution, we randomly draw a sample from it:
        ix = np.random.multinomial(1, p.ravel())

        # the sampled one-hot vector has a single 1; find its index
        index = int(np.argmax(ix))
        c = np.zeros((vocab_size, 1))
        c[index] = 1
        generated_chars.append(index)

    return generated_chars
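
# Example usage (illustration only; the training loop below does the same thing):
#   h0 = np.zeros((hidden_size, 1))
#   ix_seq = sample(h0, char_to_ix[chars[0]], 200)
#   print(''.join(ix_to_char[ix] for ix in ix_seq))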

option = sys.argv[1] # train or gradcheck

if option == 'train':

    n, p = 0, 0
    data_length = cut_stream.shape[1]

    # At iteration 0 the untrained model predicts roughly uniformly over the vocabulary,
    # so the expected loss per character is -log(1 / vocab_size); summed over seq_length
    # positions this gives the initial value of the smoothed loss.
    smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0

    while True:
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p + seq_length + 1 >= data_length or n == 0:
            hprev = np.zeros((hidden_size, batch_size))  # reset RNN memory
            p = 0  # go back to the start of the data

        inputs = cut_stream[:, p:p + seq_length]
        targets = cut_stream[:, p + 1:p + 1 + seq_length]

        # sample from the model now and then
        if n % 1000 == 0:
            h_zero = np.zeros((hidden_size, 1))
            sample_ix = sample(h_zero, inputs[0][0], 1500)
            txt = ''.join(ix_to_char[ix] for ix in sample_ix)
            print('----\n %s \n----' % (txt,))

        # forward seq_length characters through the net and fetch the gradients
        loss, activations, hprev = forward(inputs, targets, hprev, batch_size=batch_size)
        gradients = backward(activations)
        dWex, dWxh, dWhh, dWhy, dbh, dby = gradients
        smooth_loss = smooth_loss * 0.999 + loss / batch_size * 0.001
        if n % 20 == 0:
            print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

        # perform parameter update with Adagrad
        for param, dparam, mem in zip([Wex, Wxh, Whh, Why, bh, by],
                                      [dWex, dWxh, dWhh, dWhy, dbh, dby],
                                      [mWex, mWxh, mWhh, mWhy, mbh, mby]):
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
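        # Note (added comment): mem accumulates the squared gradients of each parameter,
        # so the effective per-parameter step size learning_rate / sqrt(mem + 1e-8)
        # shrinks over time; the 1e-8 only guards against division by zero.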

        p += seq_length  # move data pointer
        n += 1  # iteration counter

elif option == 'gradcheck':

    p = 0
    inputs = cut_stream[:, p:p + seq_length]
    targets = cut_stream[:, p + 1:p + 1 + seq_length]

    delta = 0.0001

    hprev = np.zeros((hidden_size, batch_size))
    memory = hprev

    loss, activations, memory = forward(inputs, targets, hprev, batch_size=batch_size)
    # for gradient-checking we don't clip or rescale the gradients
    gradients = backward(activations, clipping=False, scale=False)
    dWex, dWxh, dWhh, dWhy, dbh, dby = gradients

    for weight, grad, name in zip([Wex, Wxh, Whh, Why, bh, by],
                                  [dWex, dWxh, dWhh, dWhy, dbh, dby],
                                  ['dWex', 'dWxh', 'dWhh', 'dWhy', 'dbh', 'dby']):

        str_ = ("Dimensions don't match between weight and gradient: %s and %s." % (weight.shape, grad.shape))
        assert (weight.shape == grad.shape), str_

        count_idx = 0
        grad_num_sum = 0
        grad_ana_sum = 0
        rel_error_sum = 0
        error_idx = []

        print(name)
        for i in range(weight.size):

            # evaluate the cost at [x + delta] and [x - delta]
            w = weight.flat[i]
            weight.flat[i] = w + delta
            loss_positive, _, _ = forward(inputs, targets, hprev, batch_size=batch_size)
            weight.flat[i] = w - delta
            loss_negative, _, _ = forward(inputs, targets, hprev, batch_size=batch_size)
            weight.flat[i] = w  # reset the old value for this parameter

            # fetch both the numerical and the analytic gradient
            grad_analytic = grad.flat[i]
            grad_numerical = (loss_positive - loss_negative) / (2 * delta)
            grad_num_sum += grad_numerical
            grad_ana_sum += grad_analytic

            # relative error, guarding against a zero denominator
            denom = abs(grad_numerical) + abs(grad_analytic)
            rel_error = abs(grad_analytic - grad_numerical) / denom if denom > 0 else 0.
            rel_error_sum += rel_error

            if rel_error > 0.0001:
                print('WARNING %f, %f => %e ' % (grad_numerical, grad_analytic, rel_error))
                count_idx += 1
                error_idx.append(i)

        print('For %s found %i bad gradients; with %i total parameters in the vector/matrix!' % (
            name, count_idx, weight.size))
        print(' Average numerical grad: %0.9f \n Average analytical grad: %0.9f \n Average relative grad: %0.9f' % (
            grad_num_sum / float(weight.size), grad_ana_sum / float(weight.size), rel_error_sum / float(weight.size)))
        print(' Indices at which the analytical gradient does not match the numerical one:', error_idx)