# optimizers.py
from abc import ABC, abstractmethod
from collections import Counter
import numpy as np
from typing import List
class Optimizer(ABC):
    """
    Optimizer that aims to *maximize* a given function.
    """
    def score(self, feats: List[int]):
        """
        :param feats: List[int] feature vector indices (i.e., sparse representation of a feature vector)
        :return: floating-point score
        """
        score = 0.0
        for feat in feats:
            score += self.access(feat)
        return score
    @abstractmethod
    def apply_gradient_update(self, gradient: Counter, batch_size: int):
        raise NotImplementedError

    @abstractmethod
    def access(self, i: int):
        raise NotImplementedError

    @abstractmethod
    def get_final_weights(self):
        raise NotImplementedError
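
# Note on the sparse representation used throughout this file: a feature vector is a list of active feature
# indices, with repeats allowed for counts. For example (illustrative values only), feats = [3, 7, 7]
# stands for a vector with 1.0 at index 3 and 2.0 at index 7, so score(feats) = w[3] + 2 * w[7].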
class SGDOptimizer(Optimizer):
    """
    SGD optimizer implementation, designed to have the same interface as the Adagrad optimizers.

    Attributes:
        weights: numpy array containing initial settings of the weights. Usually initialize to the 0 vector unless
            you have a very good reason not to.
        alpha: step size
    """
def __init__(self, init_weights: np.ndarray, alpha):
self.weights = init_weights
self.alpha = alpha
def apply_gradient_update(self, gradient: Counter, batch_size: int):
        """
        Take a sparse representation of the gradient and make an update, normalizing by the batch size to keep
        hyperparameters constant as the batch size is varied
        :param gradient: Counter containing the gradient values (i.e., sparse representation of the gradient)
        :param batch_size: how many examples the gradient was computed on
        :return: nothing, modifies weights in-place
        """
for i in gradient.keys():
self.weights[i] = self.weights[i] + self.alpha * gradient[i]
def access(self, i: int):
        """
        :param i: index of the weight to access
        :return: value of that weight
        """
return self.weights[i]
def get_final_weights(self):
return self.weights
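
# Minimal usage sketch (not part of the original interface): maximizing with SGDOptimizer on a toy
# 3-feature problem. The feature indices and gradient values are made up for illustration.
#
#   sgd = SGDOptimizer(np.zeros(3), alpha=0.1)
#   sgd.apply_gradient_update(Counter({0: 1.0, 2: -0.5}), batch_size=1)
#   sgd.score([0, 2])   # 0.1 * 1.0 + 0.1 * (-0.5) = 0.05
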
class L1RegularizedAdagradTrainer(Optimizer):
    """
    Wraps a weight vector and applies the Adagrad update using second moments of features to make custom step sizes.
    This version incorporates L1 regularization: while this regularization should be applied to squash the feature vector
    on every gradient update, we instead evaluate the regularizer lazily only when the particular feature is touched
    (either by gradient update or by access). approximate lets you turn this off for faster access, but regularization is
    then applied somewhat inconsistently.

    See section 5.1 of http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf for more details
    """
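
    # Sketch of the update applied below (notation loosely follows Duchi et al. 2011, Sec. 5.1):
    #   G_t,ii  = running sum of squared gradients for feature i
    #   H_t,ii  = 1 + sqrt(G_t,ii)
    #   v       = x_t,i - (eta / H_t,ii) * g_t,i                  (unregularized Adagrad step)
    #   x_t+1,i = sign(v) * max(0, |v| - lambda * eta / H_t,ii)   (soft-thresholding for L1)
    # If a feature has not been touched for k iterations, the shrinkage is applied for each missed
    # iteration at once ("lazy" regularization).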
def __init__(self, init_weights, lamb=1e-8, eta=1.0, use_regularization=False, approximate=True):
        """
        :param init_weights: a numpy array of the correct dimension, usually initialized to 0
        :param lamb: float lambda constant for the regularizer. Values above 0.01 will often cause all features to be zeroed out.
        :param eta: float step size. Values from 0.01 to 10 often work well.
        :param use_regularization:
        :param approximate: turns off gradient updates on access, only uses them when weights are written to.
            So regularization is applied inconsistently, but it makes things faster.
        """
self.weights = init_weights
self.lamb = lamb
self.eta = eta
self.use_regularization = use_regularization
self.approximate = approximate
        self.curr_iter = 0
        # last_iter_touched[i] records the iteration at which weight i last had the regularizer applied,
        # so the lazy update knows how many shrinkage steps it still owes that feature
        self.last_iter_touched = [0 for i in range(0, self.weights.shape[0])]
        # Running sum of squared gradients (the diagonal of G_t in the Adagrad formulas)
        self.diag_Gt = np.zeros_like(self.weights, dtype=float)
def apply_gradient_update(self, gradient: Counter, batch_size: int):
        """
        Take a sparse representation of the gradient and make an update, normalizing by the batch size to keep
        hyperparameters constant as the batch size is varied
        :param gradient: Counter containing the gradient values (i.e., sparse representation of the gradient)
        :param batch_size: how many examples the gradient was computed on
        :return: nothing, modifies weights in-place
        """
batch_size_multiplier = 1.0 / batch_size
self.curr_iter += 1
for i in gradient.keys():
xti = self.weights[i]
# N.B. We negate the gradient here because the Adagrad formulas are all for minimizing
# and we're trying to maximize, so think of it as minimizing the negative of the objective
# which has the opposite gradient
# See section 5.1 in http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf for more details
# eta is the step size, lambda is the regularization
gti = -gradient[i] * batch_size_multiplier
old_eta_over_Htii = self.eta / (1 + np.sqrt(self.diag_Gt[i]))
self.diag_Gt[i] += gti * gti
Htii = 1 + np.sqrt(self.diag_Gt[i])
eta_over_Htii = self.eta / Htii
new_xti = xti - eta_over_Htii * gti
# Apply the regularizer for every iteration since touched
iters_since_touched = self.curr_iter - self.last_iter_touched[i]
self.last_iter_touched[i] = self.curr_iter
self.weights[i] = np.sign(new_xti) * max(0, np.abs(new_xti) - self.lamb * eta_over_Htii - (iters_since_touched - 1) * self.lamb * old_eta_over_Htii)
def access(self, i: int):
        """
        :param i: index of the weight to access
        :return: value of that weight
        """
if not self.approximate and self.last_iter_touched[i] != self.curr_iter:
xti = self.weights[i]
Htii = 1 + np.sqrt(self.diag_Gt[i])
eta_over_Htii = self.eta / Htii
iters_since_touched = self.curr_iter - self.last_iter_touched[i]
self.last_iter_touched[i] = self.curr_iter
            # Apply the same per-iteration shrinkage (lambda * eta / H_t,ii) as in apply_gradient_update
            self.weights[i] = np.sign(xti) * max(0, np.abs(xti) - iters_since_touched * self.lamb * eta_over_Htii)
return self.weights[i]
def get_final_weights(self):
        """
        :return: a numpy array containing the final weight vector values -- manually calls access to force each weight to
            have an up-to-date value.
        """
for i in range(0, self.weights.shape[0]):
self.access(i)
return self.weights
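
# Minimal usage sketch for the L1-regularized Adagrad trainer (feature indices and values are
# illustrative only):
#
#   trainer = L1RegularizedAdagradTrainer(np.zeros(5), lamb=1e-4, eta=1.0, approximate=False)
#   for gradient in [Counter({1: 2.0, 3: -1.0}), Counter({1: 1.5})]:
#       trainer.apply_gradient_update(gradient, batch_size=1)
#   weights = trainer.get_final_weights()   # with approximate=False, access() applies any pending regularization
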
class UnregularizedAdagradTrainer(Optimizer):
    """
    Applies the Adagrad update with no regularization. Will be substantially faster than the L1 regularized version
    due to less computation required to update each feature. Same interface as the regularized version.
    """
def __init__(self, init_weights, eta=1.0):
self.weights = init_weights
self.eta = eta
self.diag_Gt = np.zeros_like(self.weights, dtype=float)
def apply_gradient_update(self, gradient: Counter, batch_size: int):
batch_size_multiplier = 1.0 / batch_size
for i in gradient.keys():
xti = self.weights[i]
gti = -gradient[i] * batch_size_multiplier
self.diag_Gt[i] += gti * gti
Htii = 1 + np.sqrt(self.diag_Gt[i])
eta_over_Htii = self.eta / Htii
self.weights[i] = xti - eta_over_Htii * gti
def access(self, i: int):
return self.weights[i]
def get_final_weights(self):
return self.weights
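
# Small self-check (a sketch, not part of the original assignment code): run a few ascent steps with the
# unregularized Adagrad trainer on a toy objective whose gradient always points toward larger weights.
if __name__ == "__main__":
    trainer = UnregularizedAdagradTrainer(np.zeros(3), eta=1.0)
    for _ in range(5):
        # Toy sparse gradient: feature 0 gets a constant positive gradient, feature 2 a smaller one
        trainer.apply_gradient_update(Counter({0: 1.0, 2: 0.25}), batch_size=1)
    print(trainer.score([0, 2]))          # should increase with more updates, since we maximize
    print(trainer.get_final_weights())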