\documentclass[10pt]{article}
%\pagestyle{empty}
\usepackage[left = 0.9in, right = 0.9in, top = 0.9in, bottom = 0.9in]{geometry}
\usepackage{amsmath,amssymb,comment, enumerate,hyperref,xcolor,bm,graphicx}
\def\A{\mathbf{A}}
\def\B{\mathbf{B}}
\def\C{\mathbf{C}}
\def\P{\mathbf{P}}
\def\D{\mathbf{D}}
\def\T{T}
\def\V{\mathbf{V}}
\def\U{\mathbf{U}}
\def\X{\mathbf{X}}
\def\Z{\mathbf{Z}}
\def\b{\mathbf{b}}
\def\c{\mathbf{c}}
\def\e{\mathbf{e}}
\def\x{\mathbf{x}}
\def\z{\mathbf{z}}
\def\v{\mathbf{v}}
\def\y{\mathbf{y}}
\def\w{\mathbf{w}}
\def\real{\mathcal{R}}
\begin{document}
\noindent
{\large {\bf Math 123} \hspace{4em} {\bf Problem Set 6} \hspace{4em} {\bf Due: Tuesday 12/14 at 11:59 p.m.\,EST}}\bigskip
\noindent
\textbf{Instructions}: Read the homework policy. You should submit a PDF copy of the homework and any associated code on Canvas. Your PDF must be a single file, not multiple images.
\bigskip\noindent
{\bf 1.} In this problem, we consider the least squares problem, in which we are given $\A \in \real^{m\times n}$ and $\b \in \real^{m}$ and seek $\x \in \real^{n}$ minimizing $||\A\x-\b||_2^2$.
\begin{enumerate}[(a)]
\item Prove that the least squares loss $f(\x) = ||\A\x-\b||_2^2$ is a convex function.
\item Sketch the gradient descent algorithm to solve the least squares problem.
\item When is it preferable to use gradient descent instead of the normal equations? Briefly explain your answer.
\end{enumerate}
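\medskip\noindent [\textbf{Optional}: The following is a minimal NumPy sketch of gradient descent on synthetic data that you may use to check your answers numerically; it is not a substitute for the written solutions. The data sizes, step size, and iteration count are illustrative choices, not part of the problem.]
\begin{verbatim}
import numpy as np

# Gradient descent for f(x) = ||Ax - b||_2^2 on synthetic data.
rng = np.random.default_rng(0)
A = rng.standard_normal((50, 5))           # synthetic A (illustrative)
b = rng.standard_normal(50)                # synthetic b (illustrative)

x = np.zeros(5)                            # initial iterate
eta = 1.0 / (2 * np.linalg.norm(A, 2)**2)  # step 1/L with L = 2*sigma_max(A)^2
for _ in range(2000):
    grad = 2 * A.T @ (A @ x - b)           # gradient of the least squares loss
    x -= eta * grad

# Compare with the solution of the normal equations A^T A x = A^T b.
x_star = np.linalg.lstsq(A, b, rcond=None)[0]
print(np.linalg.norm(x - x_star))          # should be close to zero
\end{verbatim}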
\bigskip\noindent
{\bf 2.} Consider the following minimization problem
\[
\underset{\x \in \real^{n}}{\min}\,\, \x^T\mathbf{C}\x + \mathbf{b}^T\x + d
\]
where $\C$ is an $n\times n$ symmetric matrix, $\mathbf{b}\in \real^{n}$ is a fixed vector, and $d \in \real$ is a constant.
\begin{enumerate}[(a)]
\item Derive the gradient of the objective function.
\item Under what conditions on $\C$ is the objective convex?
\item Consider the case where $\C$ is symmetric and positive-definite. What is the solution to the minimization problem?
\end{enumerate}
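\medskip\noindent [\textbf{Optional}: A finite-difference check, on randomly generated $\C$, $\b$, and $d$, that you may use to verify your hand-derived gradient from (a); the sizes and random data below are illustrative only.]
\begin{verbatim}
import numpy as np

# Finite-difference check for the gradient of f(x) = x^T C x + b^T x + d.
rng = np.random.default_rng(1)
n = 4
M = rng.standard_normal((n, n))
C = (M + M.T) / 2                          # symmetric C, as in the problem
b = rng.standard_normal(n)
d = 1.3

f = lambda x: x @ C @ x + b @ x + d

def numerical_grad(x, eps=1e-6):
    g = np.zeros_like(x)
    for i in range(n):
        e = np.zeros(n); e[i] = eps
        g[i] = (f(x + e) - f(x - e)) / (2 * eps)   # central difference
    return g

x0 = rng.standard_normal(n)
print(numerical_grad(x0))   # compare with your derived gradient evaluated at x0
\end{verbatim}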
\bigskip\noindent
{\bf 3.} Consider the optimization problem corresponding to the soft-margin SVM,
\[
\underset{\mathbf{w}\in {\mathcal{R}^{d}},\,b,\,\xi_{i}\ge 0}{\min}\,\, ||\mathbf{w}||^{2} + C\sum_{i=1}^{n} \xi_i\quad \text{subject to} \quad y_i(\mathbf{w}^T\x_i+b)\ge 1-\xi_i \,\,\text{ for } i = 1,\dots,n
\]
We consider the effect of the regularization parameter $C > 0$.
\begin{enumerate}[(a)]
\item What kind of problem do we obtain as $C \rightarrow \infty$? Justify your answer.
\item For very small $C$, what could you say about the margin? Briefly explain your answer.
\item For very large $C$, what could you say about the margin? Briefly explain your answer.
\end{enumerate}
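\medskip\noindent [\textbf{Optional}: One way to build intuition for (b) and (c), assuming scikit-learn is available: train a linear soft-margin SVM (scikit-learn's \texttt{SVC}, which uses the usual $\frac{1}{2}||\w||^2$ scaling, so its $C$ differs from the one above by a constant factor) for several values of $C$ on synthetic data and report the margin $2/||\w||$. The data below are illustrative only.]
\begin{verbatim}
import numpy as np
from sklearn.svm import SVC

# Margin 2/||w|| of a linear soft-margin SVM for several values of C.
rng = np.random.default_rng(2)
X = np.vstack([rng.standard_normal((30, 2)) + 2,
               rng.standard_normal((30, 2)) - 2])
y = np.hstack([np.ones(30), -np.ones(30)])

for C in [0.01, 1.0, 100.0]:
    clf = SVC(kernel="linear", C=C).fit(X, y)
    w = clf.coef_.ravel()
    print(f"C = {C:>6}: margin = {2 / np.linalg.norm(w):.3f}")
\end{verbatim}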
\bigskip\noindent
{\bf 4.} [\textbf{Extra Credit: 10 points}] Given $n$ points and their labels $(\x_i,y_i)$ for $1\le i\le n$ with each $\x_i \in \mathcal{R}^{d}$, consider the dual optimization program corresponding to the hard margin support vector machines classification problem.
\[
\underset{\alpha_i\ge 0,\, i = 1,\dots,n}{\max}\quad \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{j,k=1}^{n} \alpha_j\alpha_k y_jy_k\x_j^T\x_k\,\,\text{ subject to } \sum_{i=1}^{n} \alpha_i y_i = 0
\]
Let $\{\alpha_{i}^{*}\}_{i=1}^{n}$ denote the optimal solution to the above program. Assume that there exists a point $\x_s$ such that $y_s({\mathbf{w}^{*}}^{T}\x_s+b^{*})=1$ where $y_s$ is the label of
$\x_s$ and $\mathbf{w}^{*}$ and $b^{*}$ denote the optimal solutions to the primal optimization program.
\begin{enumerate}[(a)]
\item What is the form of the classifier function in terms of $\{\alpha_{i}^{*}\}_{i=1}^{n}$ and $\{(\x_i,y_i), 1\le i\le n\}$? \\*
\noindent [\textbf{Remark}: The classifier function should only depend on the dual variables and the data points and their labels. It should not be given in terms of $b^{*}$ and $\mathbf{w}^{*}$.]
\item Suppose that we apply the kernel trick to the support vector machines classification problem, using the Gaussian kernel $K(\x,\hat{\x}) = \exp\left(-\frac{||\x-\hat{\x}||^{2}}{2\sigma^2}\right)$.
What is the form of the classifier function in terms of $\{\alpha_{i}^{*}\}_{i=1}^{n}$ and $\{(\x_i,y_i), 1\le i\le n\}$ and the Gaussian kernel? \\*
\noindent [\textbf{Remark}: The classifier function should only depend on the
dual variables and the data points and their labels. It should not be given in terms of $b^{*}$ and $\mathbf{w}^{*}$.]
\item Let $n= 1000$. Assume that $\alpha_{3}^{*} = 0$. Suppose we remove the third training data point $(\x_3,y_3)$. How does the classifier function change? Justify your answer.
\item Characterize the support vectors in terms of $\{\alpha_{i}^{*}\}_{i=1}^{n}$.
\item What is the computational complexity of evaluating the classifier function in (b) given a test data point $\x_{\text{test}}$? The complexity should be in terms of $n$ and $d$.
\end{enumerate}
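\medskip\noindent [\textbf{Optional}: To build intuition for (e), the sketch below evaluates a Gaussian-kernel decision function of the general form asked about in (b) at a single test point, using \emph{placeholder} dual variables and offset (they are random here, not the optimal $\alpha_i^{*}$ or $b^{*}$); the point is only to see how the work scales with $n$ and $d$.]
\begin{verbatim}
import numpy as np

# Evaluate sum_i alpha_i y_i K(x_i, x_test) + b at one test point.
def decision_value(x_test, X, y, alpha, b, sigma=1.0):
    sq_dists = np.sum((X - x_test) ** 2, axis=1)   # n squared distances, O(n d)
    k = np.exp(-sq_dists / (2 * sigma ** 2))       # Gaussian kernel values, O(n)
    return np.sum(alpha * y * k) + b               # weighted sum, O(n)

n, d = 1000, 5
rng = np.random.default_rng(3)
X = rng.standard_normal((n, d))
y = rng.choice([-1.0, 1.0], size=n)
alpha = np.abs(rng.standard_normal(n))             # placeholder, not the true alpha*
x_test = rng.standard_normal(d)
print(np.sign(decision_value(x_test, X, y, alpha, b=0.0)))
\end{verbatim}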
\bigskip\noindent
{\bf 5.} [\textbf{Extra Credit: 20 points}] Given $n$ points and their labels $(\x_i,y_i)$ for $1\le i\le n$ with each $\x_i \in \mathcal{R}^{d}$, consider the optimization program corresponding to the hard margin support vector machines classification problem.
\begin{equation}\label{eq:hard-svm}
\underset{b,\,\mathbf{w}\in \mathcal{R}^{d}}{\min}\,\, \frac{1}{2}\mathbf{w}^{T}\mathbf{w} \,\, \text{ subject to }\,\, y_j(\mathbf{w}^{T}\x_j+b)\ge 1\text{ for } j = 1,\dots,n
\end{equation}
Let $d = 2$. Let $\x_1= (0,0)$, $\x_2 = (2,2)$, $\x_3 = (2,0)$ and $\x_4 = (3,0)$ be the training points with labels $y_1 = -1$, $y_2 = -1$, $y_3 = 1$ and $y_4 = 1$ respectively.
\begin{enumerate}[(a)]
\item Derive the optimal solution to \eqref{eq:hard-svm}, i.e., find the equation of the optimal hyperplane.
\item Based on your solution in (a), find an explicit form for the classifier function.
\item Using the classifier in (b), find the labels of the test points $(6,2)$ and $(1,1)$.
\item Compute the margin.
\item What are the support vectors?
\item Suppose we remove the data point $\x_4 = (3,0)$. How does the solution of the optimization problem change? In particular, what is the form of the optimal hyperplane?
Justify your answer.
\end{enumerate}
\noindent [\textbf{Remark}: Show detailed work. Answers that simply guess the equation of the optimal hyperplane receive no credit for (a). To get full credit, you need to solve the problem by considering the optimization objective and the constraints in \eqref{eq:hard-svm}.]
\noindent [\textbf{Remark}: For parts (a) and (b), you can directly use the relations/results that follow from the equivalence of the primal and dual hard SVM problems.]
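\medskip\noindent [\textbf{Optional}: After completing your derivation, you may cross-check it numerically, assuming scikit-learn is available, by training a linear SVM with a very large $C$ (which approximates the hard-margin problem) on the four points. This check earns no credit by itself.]
\begin{verbatim}
import numpy as np
from sklearn.svm import SVC

# Approximate hard-margin SVM on the four training points via a huge C.
X = np.array([[0, 0], [2, 2], [2, 0], [3, 0]], dtype=float)
y = np.array([-1, -1, 1, 1])

clf = SVC(kernel="linear", C=1e6).fit(X, y)
w, b = clf.coef_.ravel(), clf.intercept_[0]
print("w =", w, " b =", b)
print("margin 2/||w|| =", 2 / np.linalg.norm(w))
print("support vectors:\n", clf.support_vectors_)
print("labels of (6,2) and (1,1):", clf.predict([[6, 2], [1, 1]]))
\end{verbatim}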
\bigskip\noindent
{\bf 6.} [\textbf{Extra Credit: 30 points}]
Given $n$ points and their labels $(\x_i,y_i)$ for $1\le i\le n$ with each $\x_i \in \mathcal{R}^{2}$, consider the following optimization program corresponding to the soft margin support vector machines classification problem.
\[
\underset{\w,b,\xi_i\ge 0}{\min}\quad \frac{1}{2}\w^T\w + C\sum_{i=1}^{n} \xi_i \,\,\text{ such that } y_i(\w^T\x_i+b)\ge 1-\xi_i \,\,\,\forall (\x_i,y_i)
\]
Here $C > 0$ is a regularization constant that controls the trade-off between maximizing the margin and penalizing violations of the margin constraints.
\begin{enumerate}[(a)]
\item What is the Lagrangian $L$ corresponding to the above optimization problem? $L$ should be a function of $\w$, $b$, and $\bm{\alpha}$, where $\bm{\alpha}= \{\alpha_i\}_{i=1}^{n}$ are the Lagrange multipliers (dual variables) associated with the margin constraints.
\item What is the primal problem?
\item Recall that the dual problem is given by $\underset{\bm{\alpha}\ge 0}{\max}\quad \underset{\w,b}{\min}\,\, L(\w,b,\bm{\alpha})$. We study the inner minimization problem,
$\underset{\w,b}{\min}\,\, L(\w,b,\bm{\alpha})$.
\begin{enumerate}
\item Compute $\frac{\partial L}{\partial \w}$.
\item Compute $\frac{\partial L}{\partial b}$.
\end{enumerate}
\item What is the dual problem? [\textbf{Hint}: Set the partial derivatives evaluated in (c) to zero].
\end{enumerate}
\end{document}