CS代考程序代写 “””

“””
CSCC11 – Introduction to Machine Learning, Winter 2021, Assignment 3
B. Chan, Z. Zhang, D. Fleet
“””

import numpy as np

from utils import softmax

class LogisticRegression:
def __init__(self,
num_features,
num_classes,
rng=np.random):
“”” This class represents a multinomial logistic regression model.
NOTE: We assume lables are 0 to K – 1, where K is number of classes.

self.parameters contains the model weights.
NOTE: Bias term is the first term

TODO: You will need to implement the methods of this class:
– _compute_loss_and_gradient: ndarray, ndarray -> float, ndarray

Implementation description will be provided under each method.

For the following:
– N: Number of samples.
– D: Dimension of input features.
– K: Number of classes.

Args:
– num_features (int): The number of features in the input data.
– num_classes (int): The number of classes in the task.
– rng (RandomState): The random number generator to initialize weights.
“””
self.num_features = num_features
self.num_classes = num_classes
self.rng = rng

# Initialize parameters
self.parameters = np.zeros(shape=(num_classes, self.num_features + 1))

def init_weights(self, factor=1, bias=0):
“”” This randomly initialize the model weights.

Args:
– factor (float): A constant factor of the randomly initialized weights.
– bias (float): The bias value
“””
self.parameters[:, 1:] = factor * self.rng.rand(self.num_classes, self.num_features)
self.parameters[:, 0] = bias

def _compute_loss_and_gradient(self, X, y, alpha_inverse=0, beta_inverse=0):
“”” This computes the negative log likelihood (NLL) or negative log posterior and its gradient.

NOTE: When we have alpha_inverse != 0 or beta_inverse != 0, we have negative log posterior (NLP) instead.
NOTE: For the L2 term, drop all the log constant terms and cosntant factor.
For the NLL term, divide by the number of data points (i.e. we are taking the mean).
The new loss should take the form:
E_new(w) = (NLL_term / N) + L2_term
NOTE: Compute the gradient based on the modified loss E_new(w)

Args:
– X (ndarray (shape: (N, D))): A NxD matrix consisting N D-dimensional inputs.
– y (ndarray (shape: (N, 1))): A N-column vector consisting N scalar outputs (labels).
– alpha_inverse (float): 1 / variance for an optional isotropic Gaussian prior (for the weights) on NLP.
NOTE: 0 <= alpha_inverse. Setting alpha_inverse to 0 means no prior on weights. - beta_inverse (float): 1 / variance for an optional Gaussian prior (for the bias term) on NLP. NOTE: 0 <= beta_inverse. Setting beta_inverse to 0 means no prior on the bias term. Output: - nll (float): The NLL (or NLP) of the given inputs and outputs. - grad (ndarray (shape: (K, D + 1))): A Kx(D + 1) weight matrix (including bias) consisting the gradient of NLL (or NLP) (i.e. partial derivatives of NLL (or NLP) w.r.t. self.parameters). """ (N, D) = X.shape # ==================================================== # TODO: Implement your solution within the box # ==================================================== return nll, grad def learn(self, train_X, train_y, num_epochs=1000, step_size=1e-3, check_grad=False, verbose=False, alpha_inverse=0, beta_inverse=0, eps=np.finfo(np.float).eps): """ This performs gradient descent to learn the parameters given the training data. NOTE: This method mutates self.parameters Args: - train_X (ndarray (shape: (N, D))): A NxD matrix consisting N D-dimensional training inputs. - train_y (ndarray (shape: (N, 1))): A N-column vector consisting N scalar training outputs (labels). - num_epochs (int): Number of gradient descent steps NOTE: 1 <= num_epochs - step_size (float): Gradient descent step size - check_grad (bool): Whether or not to check gradient using finite difference. - verbose (bool): Whether or not to print gradient information for every step. - alpha_inverse (float): 1 / variance for an optional isotropic Gaussian prior (for the weights) on NLL. NOTE: 0 <= alpha_inverse. Setting alpha_inverse to 0 means no prior on weights. - beta_inverse (float): 1 / variance for an optional Gaussian prior (for the bias term) on NLL. NOTE: 0 <= beta_inverse. Setting beta_inverse to 0 means no prior on the bias term. - eps (float): Machine epsilon ASIDE: The design for applying gradient descent to find local minimum is usually different from this. You should think about a better way to do this! Scipy is a good reference for such design. """ assert len(train_X.shape) == len(train_y.shape) == 2, f"Input/output pairs must be 2D-arrays. train_X: {train_X.shape}, train_y: {train_y.shape}" (N, D) = train_X.shape assert N == train_y.shape[0], f"Number of samples must match for input/output pairs. train_X: {N}, train_y: {train_y.shape[0]}" assert D == self.num_features, f"Expected {self.num_features} features. Got: {D}" assert train_y.shape[1] == 1, f"train_Y must be a column vector. Got: {train_y.shape}" assert 1 <= num_epochs, f"Must take at least 1 gradient step. Got: {num_epochs}" nll, grad = self._compute_loss_and_gradient(train_X, train_y, alpha_inverse, beta_inverse) # Check gradient using finite difference if check_grad: original_parameters = np.copy(self.parameters) grad_approx = np.zeros(shape=(self.num_classes, self.num_features + 1)) h = 1e-8 # Compute finite difference w.r.t. each weight vector component for ii in range(self.num_classes): for jj in range(self.num_features + 1): self.parameters = np.copy(original_parameters) self.parameters[ii][jj] += h grad_approx[ii][jj] = (self._compute_loss_and_gradient(train_X, train_y, alpha_inverse, beta_inverse)[0] - nll) / h # Reset parameters back to original self.parameters = np.copy(original_parameters) print(f"Negative Log Probability: {nll}") print(f"Analytic Gradient: {grad.T}") print(f"Numerical Gradient: {grad_approx.T}") print("The gradients should be nearly identical.") # Perform gradient descent for epoch_i in range(num_epochs): original_parameters = np.copy(self.parameters) # Check gradient flow if np.linalg.norm(grad) < eps: print(f"Gradient is close to 0: {eps}. Terminating gradient descent.") break # Determine the suitable step size. step_size *= 2 self.parameters = original_parameters - step_size * grad E_new, grad_new = self._compute_loss_and_gradient(train_X, train_y, alpha_inverse, beta_inverse) assert np.isfinite(E_new), f"Error is NaN/Inf" while E_new >= nll and step_size > 0:
step_size /= 2
self.parameters = original_parameters – step_size * grad
E_new, grad_new = self._compute_loss_and_gradient(train_X, train_y, alpha_inverse, beta_inverse)
assert np.isfinite(E_new), f”Error is NaN/Inf”

if step_size <= eps: print(f"Infinitesimal step: {step_size}. Terminating gradient descent.") break if verbose: print(f"Epoch: {epoch_i}, Step size: {step_size}, Gradient Norm: {np.linalg.norm(grad)}, NLL: {nll}") # Update next loss and next gradient grad = grad_new nll = E_new def predict(self, X): """ This computes the probability of labels given X. Args: - X (ndarray (shape: (N, D))): A NxD matrix consisting N D-dimensional inputs. Output: - probs (ndarray (shape: (N, K))): A NxK matrix consisting N K-probabilities for each input. """ (N, D) = X.shape assert D == self.num_features, f"Expected {self.num_features} features. Got: {D}" # Pad 1's for bias term X = np.hstack((np.ones(shape=(N, 1), dtype=np.float), X)) # This receives the probabilities of class 1 given inputs probs = softmax(X @ self.parameters.T) return probs