\documentclass[11pt]{article}
\pdfoutput=1
\usepackage[T1]{fontenc}
\usepackage{lmodern}
\usepackage[protrusion=true,expansion=true]{microtype}
\usepackage{amsmath,amssymb,amsfonts,amsthm}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{fullpage}
\usepackage{setspace}
\usepackage[backref=page]{hyperref}
\usepackage{color}
\usepackage{wrapfig}
\usepackage{tikz}
\usetikzlibrary{decorations.pathreplacing}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage[framemethod=tikz]{mdframed}
\usepackage{xspace}
\usepackage{pgfplots}
\usepackage{framed}
\newcommand{\Ex}[1]{\ensuremath{\mathbb{E}\left[#1\right]}}
\newcommand{\Var}[1]{\ensuremath{\text{Var}\left[#1\right]}}
\newcommand{\PPr}[1]{\ensuremath{\mathbf{Pr}\left[#1\right]}}
\newcommand{\eps}{\varepsilon}
\newcommand{\tail}{\textsc{Tail}}
\newcommand{\CountSketch}{\textsc{CountSketch}}
\allowdisplaybreaks
\begin{document}
\begin{center}
{\Large\textsc{CSCE 658: Randomized Algorithms -- Spring 2024 \\
Problem Set 3}}
\vskip 0.1in
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%% Toggle the next two lines %%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%YOUR NAME(S) HERE
Due: Thursday, March 7, 2024, 5:00 pm CT
\end{center}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%% Problem 1 Begins Here %%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\noindent
\textbf{Problem 1.} (30 points total)
$\CountSketch$ tail bounds.
\vskip 0.1in\noindent
For any vector $x\in\mathbb{R}^n$ and any integer $k\ge 1$, we define $\tail_k(x)$ to be the vector $x$, but with the $k$ entries of largest magnitude to be set to $0$, breaking ties arbitrarily.
For example if $x=(-100, 40, 40, 1)$, then $\tail_2(x)$ can be either $(0, 0, 40, 1)$ or $(0, 40, 0, 1)$.
\begin{enumerate}
\item (5 points)
Show that for any parameter $\alpha\ge 1$ and $k\le n-1$, there exists $x\in\mathbb{R}^n$ such that
\[\alpha\cdot\|\tail_k(x)\|_2<\|x\|_2.\]
That is, the length of a tail vector of $x$ can be arbitrarily smaller than the length of the vector $x$.
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{1}
\item (20 points)
Show that $\CountSketch$ actually provides an $L_2$ tail guarantee.
More specifically, for $\eps\in(0,1)$, suppose we use $\CountSketch$ with $\mathcal{O}\left(\frac{1}{\eps^2}\cdot\log n\right)$ buckets to extract estimates $\widehat{x_i}$ for the value of each coordinate $x_i$.
Show that with probability $1-\frac{1}{n^2}$, we simultaneously have that for all $i\in[n]$,
\[|\widehat{x_i}-x_i|\le\eps\cdot\|\tail_k(x)\|_2,\]
where $k=\frac{1}{\eps^2}$.
\vskip 0.1in\noindent
HINT: The analysis in class demonstrated an error of $\eps\cdot\|x\|_2$. For each $i\in[n]$, what event needs to occur for the top $k$ coordinates to not affect the estimate $\widehat{x_i}$ of $x_i$?
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{2}
\item
Conclude that at the end of an insertion-deletion stream, $\CountSketch$ with $\mathcal{O}(k\log n)$ buckets can with high probability, recover the exact coordinates of a vector that is $k$-sparse, even if at intermediate times in the stream, the underlying frequency is not $k$-sparse.
\end{enumerate}
\noindent\textbf{Solution:}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%% Problem 2 Begins Here %%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage\noindent
$F_p$ moment estimation.
\vskip 0.1in\noindent
Let $p\ge 1$ be a fixed constant.
Suppose $f\in\mathbb{R}^n$ is defined by an insertion-only stream of length $m$, where each update increments a coordinate of $f$.
Suppose we sample an update $t\in[m]$ in the stream, uniformly at random, and set a counter $c$ to be the number of times the item appears in the stream after time $t$ (including time $t$).
After the stream ends, we set $Z=c^p-(c-1)^p$.
\vskip 0.1in\noindent
For example, suppose the stream consists of the updates $1,2,2,1,4,1,2,1$, which induces the frequency vector $f=(4,3,0,1)$ and suppose we sample the fourth update of the stream, corresponding to a $1$.
Then we see a total of three instances of $1$, after that time (inclusive), so that $c=3$ and $Z=3^p-2^p$.
For $p=3$ then, we would have $Z=27-8=19$.
\begin{enumerate}
\item (5 points)
Show that $\Ex{Z}=f_j^{p-1}$, \emph{conditioned} on sampling $j\in[n]$.
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{1}
\item (5 points)
Let $F=m\cdot Z$.
Show that $\Ex{F}=\|f\|_p^p$.
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{2}
\item (10 points)
Show that $\Var{F}\le p\cdot\|f\|_1\cdot\|f\|_{2p-1}^{2p-1}$.
\vskip 0.1in\noindent
HINT: You may use the fact that for all $x\ge 1$ and $p\ge 1$, we have $x^p-(x-1)^p\le px^{p-1}$.
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{3}
\item (10 points)
Give an algorithm that uses $\mathcal{O}\left(\frac{1}{\eps^2}n^{1-1/p}\right)\cdot\log(nm)$ bits of space and with probability at least $\frac{2}{3}$, outputs an estimate $\widehat{F}$ such that
\[(1-\eps)\|f\|_p^p\le\widehat{F}\le(1+\eps)\|f\|_p^p.\]
Justify both its correctness-of-approximation and space complexity.
\vskip 0.1in\noindent
HINT: You may use the fact that for all $\|f\|_1\cdot\|f\|_{2p-1}^{2p-1}\le n^{1-1/p}\|f\|_p^{2p}$.
\end{enumerate}
\noindent\textbf{Solution:}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%% Problem 3 Begins Here %%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage\noindent
\textbf{Problem 3.} (30 points total)
Easy as 123 (approximate counting).
\begin{enumerate}
\item (3 points)
Suppose we want to count the number of updates, i.e., the length of a data stream.
Describe a na\"{i}ve streaming algorithm that uses $\mathcal{O}(\log m)$ bits of space if the stream has length $m$, where $m$ is not known in advance.
\end{enumerate}
\noindent\textbf{Solution:}
\vskip 0.1in\noindent
Consider the following algorithm:
\begin{algorithm}[!htb]
\caption{Approximate counting}
\begin{algorithmic}[1]
\State{$C\gets 0$}
\For{each stream update}
\State{Flip a coin that is HEADS with probability $\frac{1}{2^C}$}
\If{the coin is HEADS}
\State{$C\gets C+1$}
\EndIf
\EndFor
\State{\Return $Z=2^C-1$}
\end{algorithmic}
\end{algorithm}
\begin{enumerate}
\setcounter{enumi}{1}
\item (9 points)
Compute, with proof, $\Ex{Z}$.
\vskip 0.1in\noindent
HINT: Use induction on the length $m$ of the stream.
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{2}
\item (9 points)
Compute $\Var{Z}$ by showing, with proof, that $\Ex{2^{2C}}=\frac{3}{2}m^2+\frac{3}{2}m+1$.
\vskip 0.1in\noindent
HINT: Use induction on the length $m$ of the stream.
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{3}
\item (9 points)
Give an algorithm that with probability at least $\frac{2}{3}$, uses $\mathcal{O}(\log\log m)$ bits of space and outputs an estimate $\widehat{M}$ such that
\[\frac{m}{2}\le\widehat{M}\le 2m,\]
where $m$ is the length of the stream, but is not known in advance.
Justify both its correctness-of-approximation and space complexity.
\end{enumerate}
\noindent\textbf{Solution:}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%% Problem 4 Begins Here %%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage\noindent
\textbf{Problem 4.} (30 points total)
Communication complexity.
\vskip 0.1in\noindent
In the index problem, Alice has a vector $x\in\{0,1\}^n$ and Bob has a position $i\in[n]$ and their goal is for Bob to determine whether $x_i=0$ or $x_i=1$ after receiving a message from Alice.
It is known that any protocol for indexing that succeeds with probability at least $\frac{2}{3}$ requires $\Omega(n)$ communication from Alice and Bob.
\begin{enumerate}
\item (10 points)
Suppose a frequency vector $x\in\mathbb{R}^n$ is implicitly defined through a insertion-only data stream requires $\Omega(n)$ space.
Let $\mathcal{A}$ be a streaming algorithm that processes $x$, receives a query $i\in[n]$ \emph{after the data stream}, and outputs $x_i$ with probability at least $\frac{2}{3}$.
Show by a reduction from indexing that $\mathcal{A}$ must use $\Omega(n)$ bits of space.
\end{enumerate}
\noindent\textbf{Solution:}
\vskip 0.1in\noindent
In the set-disjointness communication, Alice has a vector $x\in\{0,1\}^n$ and Bob has a vector $y\in\{0,1\}^n$ and their goal is to determine whether there exists an index $i\in[n]$ such that $x_i=y_i=1$.
It is known that any protocol for set-disjointness that succeeds with probability at least $\frac{2}{3}$ requires $\Omega(n)$ communication between Alice and Bob.
\vskip 0.1in\noindent
\begin{enumerate}
\setcounter{enumi}{1}
\item (10 points)
Show that any streaming algorithm that with probability at least $\frac{2}{3}$, outputs the largest coordinate $i\in[n]$ of a frequency vector $x\in\mathbb{R}^n$ that is implicitly defined through a insertion-only data stream requires $\Omega(n)$ space.
\end{enumerate}
\noindent\textbf{Solution:}
\begin{enumerate}
\setcounter{enumi}{2}
\item (10 points)
Consider an insertion-only data stream consisting of edges of a graph $G$ with $n$ vertices.
Show that any streaming algorithm that with probability at least $\frac{2}{3}$, detects whether a graph contains a triangle requires $\Omega(n^2)$ space.
\end{enumerate}
\noindent\textbf{Solution:}
\end{document}