# optimizers.py

# optimizers.py

from abc import ABC, abstractmethod
from collections import Counter


import numpy as np
from typing import List

class Optimizer(ABC):
    """Optimizer that aims to *maximize* a given function."""

    def score(self, feats: List[int]):
        """Score a sparse feature vector under the current weights.

        :param feats: List[int] feature vector indices (i.e., sparse representation of a feature vector)
        :return: floating-point score
        """
        score = 0.0
        # BUG FIX: the original looped `while i < len(feats)` with `i` never
        # initialized or incremented (NameError / infinite loop). Sum the weight
        # of each feature index directly; repeated indices count multiple times.
        for i in feats:
            score += self.access(i)
        return score

    @abstractmethod
    def apply_gradient_update(self, gradient: Counter, batch_size: int):
        """Apply a sparse gradient update in-place, scaled per batch_size."""

    @abstractmethod
    def access(self, i: int):
        """Return the current value of weight i."""

    @abstractmethod
    def get_final_weights(self):
        """Return the final weight vector as a numpy array."""


class SGDOptimizer(Optimizer):
    """SGD optimizer implementation, designed to have the same interface as the Adagrad optimizers.

    Attributes:
        weights: numpy array containing initial settings of the weights. Usually
            initialize to the 0 vector unless you have a very good reason not to.
        alpha: step size
    """

    def __init__(self, init_weights: np.ndarray, alpha):
        self.weights = init_weights
        self.alpha = alpha

    def apply_gradient_update(self, gradient: Counter, batch_size: int):
        """Take a sparse representation of the gradient and make an update.

        NOTE(review): unlike the Adagrad trainers below, this update does not
        actually divide by batch_size; original behavior is preserved as-is.

        :param gradient: Counter containing the gradient values (i.e., sparse representation of the gradient)
        :param batch_size: how many examples the gradient was computed on
        :return: nothing, modifies weights in-place
        """
        # Gradient *ascent*: we are maximizing, so step in the gradient direction.
        for i in gradient.keys():
            self.weights[i] = self.weights[i] + self.alpha * gradient[i]

    def access(self, i: int):
        """
        :param i: index of the weight to access
        :return: value of that weight
        """
        return self.weights[i]

    def get_final_weights(self):
        return self.weights
class L1RegularizedAdagradTrainer(Optimizer):
    """Adagrad trainer with lazily-applied L1 regularization.

    Wraps a weight vector and applies the Adagrad update using second moments of
    features to make custom step sizes. This version incorporates L1 regularization:
    while this regularization should be applied to squash the feature vector on every
    gradient update, we instead evaluate the regularizer lazily only when the
    particular feature is touched (either by gradient update or by access).
    approximate lets you turn this off for faster access, but regularization is now
    applied somewhat inconsistently.

    See section 5.1 of http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf for more details
    """

    def __init__(self, init_weights, lamb=1e-8, eta=1.0, use_regularization=False, approximate=True):
        """
        :param init_weights: a numpy array of the correct dimension, usually initialized to 0
        :param lamb: float lambda constant for the regularizer. Values above 0.01 will
            often cause all features to be zeroed out.
        :param eta: float step size. Values from 0.01 to 10 often work well.
        :param use_regularization:
        :param approximate: turns off gradient updates on access, only uses them when
            weights are written to. So regularization is applied inconsistently, but it
            makes things faster.
        """
        self.weights = init_weights
        self.lamb = lamb
        self.eta = eta
        self.use_regularization = use_regularization
        self.approximate = approximate
        self.curr_iter = 0
        # Per-feature timestamp of the last update, used to apply the L1 shrinkage lazily.
        self.last_iter_touched = [0 for i in range(0, self.weights.shape[0])]
        # Running sum of squared gradients: the diagonal of G_t in Duchi et al.
        self.diag_Gt = np.zeros_like(self.weights, dtype=float)

    def apply_gradient_update(self, gradient: Counter, batch_size: int):
        """Take a sparse representation of the gradient and make an update, normalizing by
        the batch size to keep hyperparameters constant as the batch size is varied.

        :param gradient: Counter containing the gradient values (i.e., sparse representation of the gradient)
        :param batch_size: how many examples the gradient was computed on
        :return: nothing, modifies weights in-place
        """
        batch_size_multiplier = 1.0 / batch_size
        self.curr_iter += 1
        for i in gradient.keys():
            xti = self.weights[i]
            # N.B. We negate the gradient here because the Adagrad formulas are all for
            # minimizing and we're trying to maximize, so think of it as minimizing the
            # negative of the objective, which has the opposite gradient.
            # See section 5.1 in http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
            # for more details. eta is the step size, lambda is the regularization.
            gti = -gradient[i] * batch_size_multiplier
            old_eta_over_Htii = self.eta / (1 + np.sqrt(self.diag_Gt[i]))
            self.diag_Gt[i] += gti * gti
            Htii = 1 + np.sqrt(self.diag_Gt[i])
            eta_over_Htii = self.eta / Htii
            new_xti = xti - eta_over_Htii * gti
            # Apply the regularizer for every iteration since touched
            iters_since_touched = self.curr_iter - self.last_iter_touched[i]
            self.last_iter_touched[i] = self.curr_iter
            self.weights[i] = np.sign(new_xti) * max(0, np.abs(new_xti)
                                                     - self.lamb * eta_over_Htii
                                                     - (iters_since_touched - 1) * self.lamb * old_eta_over_Htii)

    def access(self, i: int):
        """
        :param i: index of the weight to access
        :return: value of that weight, first applying any deferred L1 shrinkage
            (only when approximate is False)
        """
        if not self.approximate and self.last_iter_touched[i] != self.curr_iter:
            xti = self.weights[i]
            Htii = 1 + np.sqrt(self.diag_Gt[i])
            eta_over_Htii = self.eta / Htii
            iters_since_touched = self.curr_iter - self.last_iter_touched[i]
            self.last_iter_touched[i] = self.curr_iter
            self.weights[i] = np.sign(xti) * max(0, np.abs(xti)
                                                 - iters_since_touched * self.lamb * self.eta * eta_over_Htii)
        return self.weights[i]

    def get_final_weights(self):
        """
        :return: a numpy array containing the final weight vector values -- manually
            calls access to force each weight to have an up-to-date value.
        """
        for i in range(0, self.weights.shape[0]):
            self.access(i)
        return self.weights
class UnregularizedAdagradTrainer(Optimizer):
    """Applies the Adagrad update with no regularization. Will be substantially faster
    than the L1 regularized version due to less computation required to update each
    feature. Same interface as the regularized version.
    """

    def __init__(self, init_weights, eta=1.0):
        """
        :param init_weights: a numpy array of the correct dimension, usually initialized to 0
        :param eta: float step size
        """
        self.weights = init_weights
        self.eta = eta
        # Running sum of squared gradients (diagonal of G_t in the Adagrad paper).
        self.diag_Gt = np.zeros_like(self.weights, dtype=float)

    def apply_gradient_update(self, gradient: Counter, batch_size: int):
        """Adagrad update normalized by batch size; modifies weights in-place.

        :param gradient: Counter containing the gradient values (sparse representation)
        :param batch_size: how many examples the gradient was computed on
        """
        batch_size_multiplier = 1.0 / batch_size
        for i in gradient.keys():
            xti = self.weights[i]
            # Negated because the Adagrad formulas minimize while we maximize.
            gti = -gradient[i] * batch_size_multiplier
            self.diag_Gt[i] += gti * gti
            Htii = 1 + np.sqrt(self.diag_Gt[i])
            eta_over_Htii = self.eta / Htii
            self.weights[i] = xti - eta_over_Htii * gti

    def access(self, i: int):
        """
        :param i: index of the weight to access
        :return: value of that weight
        """
        return self.weights[i]

    def get_final_weights(self):
        return self.weights