handouts/Advanced/De Bruijn/parts/2 bruijn.tex

\section{De Bruijn Words}

Before we continue, we'll need to review some basic
graph theory.

\definition{}
A \textit{directed graph} consists of nodes and directed edges. \par
An example is shown below. It consists of three vertices (labeled $a, b, c$), \par
and five edges (labeled $0, \dots, 4$).

\begin{center}
	\begin{tikzpicture}
		\begin{scope}[layer = nodes]
			\node[main] (a) at (0, 0) {$a$};
			\node[main] (b) at (2, 0) {$b$};
			\node[main] (c) at (4, 0) {$c$};
		\end{scope}

		\draw[->]
			(a) edge node[label] {$0$} (b)
			(a) edge[loop above] node[label] {$1$} (a)
			(b) edge[bend left] node[label] {$2$} (c)
			(b) edge[loop above] node[label] {$3$} (b)
			(c) edge[bend left] node[label] {$4$} (b)
		;
	\end{tikzpicture}
\end{center}

\definition{}
A \textit{path} in a graph is a sequence of adjacent edges. \par
In a directed graph, edges $a$ and $b$ are adjacent if $a$ ends at the node which $b$ starts at. \par
\vspace{2mm}
For example, consider the graph above. \par
The edges $1$ and $0$ are adjacent, since you can take edge $0$ after taking edge $1$. \par
$0$ starts where $1$ ends. \par
$0$ and $1$, however, are not: $1$ does not start at the node at which $0$ ends.


\definition{}
An \textit{Eulerian path} is a path that visits each edge of a graph exactly once. \par
An \textit{Eulerian cycle} is an Eulerian path that starts and ends on the same node.

\problem{}
Find the unique Eulerian cycle in the graph below.
\begin{center}
	\begin{tikzpicture}
		\begin{scope}[layer = nodes]
			\node[main] (a) at (0, 0) {$a$};
			\node[main] (b) at (2, 0) {$b$};
			\node[main] (c) at (4, 0) {$c$};
		\end{scope}

		\draw[->]
			(a) edge[bend left] node[label] {$0$} (b)
			(b) edge[bend left] node[label] {$1$} (a)
			(b) edge[bend left] node[label] {$2$} (c)
			(c) edge[bend left] node[label] {$3$} (b)
			(c) edge[loop right] node[label] {$4$} (c)
		;
	\end{tikzpicture}
\end{center}

\begin{solution}
	$24310$ is one way to write this cycle. \par
	There are other options, but they're all the same.
\end{solution}

\vfill

\theorem{}<eulerexists>
A directed graph contains an Eulerian cycle iff...
\begin{itemize}
	\item There is a path between every pair of nodes, and
	\item every node has as many \say{in} edges as it has \say{out} edges.
\end{itemize}

If a graph contains an Eulerian cycle, it must contain an Eulerian path. \note{(why?)} \par
Some graphs contain an Eulerian path, but not a cycle. In this case, both conditions above must
still hold, but the following exceptions are allowed:
\begin{itemize}
	\item There may be at most one node where $(\text{number in} - \text{number out}) = 1$
	\item There may be at most one node where $(\text{number in} - \text{number out}) = -1$
\end{itemize}
\note[Note]{Either both exceptions occur, or neither occurs. Bonus problem: why?}
We won't provide a proof of this theorem today. However, you should convince yourself that it is true:
if any of these conditions are violated, why do we know that an Eulerian cycle (or path) cannot exist?

\pagebreak


\definition{}
Now, consider the $n$-subword problem over $\{\texttt{0}, \texttt{1}\}$. \par
We'll call the optimal solution to this problem a \textit{De Bruijn\footnotemark{} word} of order $n$. \par

\footnotetext{Dutch. Rhymes with \say{De Grown.}}


\problem{}<dbbounds>
Let $w$ be an order-$n$ De Bruijn word, and denote its length with $|w|$. \par
Show that the following bounds always hold:
\begin{itemize}
	\item $|w| \leq n2^n$
	\item $|w| \geq 2^n + n - 1$
\end{itemize}

\begin{solution}
	\begin{itemize}
		\item There are $2^n$ binary words with length $n$. \par
		Concatenate these to get a word with length $n2^n$.
		\item A word must have at least $2^n + n - 1$ letters to have $2^n$ subwords with length $n$.
	\end{itemize}
\end{solution}


\remark{}
Now, we'd like to show that the length of a De Bruijn word is always $2^n + n - 1$. \par
That is, that the optimal solution to the subword problem always has $2^n + n - 1$ letters. \par
We'll do this by construction: for a given $n$, we want to build a word with length $2^n + n - 1$
that solves the binary $n$-subword problem.


\definition{}
Consider an $n$-length word $w$. \par
The \textit{prefix} of $w$ is the word formed by the first $n-1$ letters of $w$. \par
The \textit{suffix} of $w$ is the word formed by the last $n-1$ letters of $w$. \par
For example, the prefix of the word \texttt{1101} is \texttt{110}, and its suffix is \texttt{101}.
The prefix and suffix of any one-letter word are both $\varnothing$.

\definition{}
A \textit{De Bruijn graph} of order $n$, denoted $G_n$, is constructed as follows:
\begin{itemize}
	\item Nodes are created for each word of length $n - 1$.
	\item A directed edge is drawn from $a$ to $b$ if the suffix of
	$a$ matches the prefix of $b$. \par
	Note that a node may have an edge to itself.
	\item We label each edge with the last letter of $b$.
\end{itemize}
$G_2$ and $G_3$ are shown below.

\null\hfill
\begin{minipage}{0.48\textwidth}
	\begin{center}
		$G_2$

		\begin{tikzpicture}
			\begin{scope}[layer = nodes]
				\node[main] (0) at (0, 0) {\texttt{0}};
				\node[main] (1) at (2, 0) {\texttt{1}};
			\end{scope}

			\draw[->]
				(0) edge[loop left] node[label] {$0$} (0)
				(1) edge[loop right] node[label] {$1$} (1)
				(1) edge[bend left] node[label] {$0$} (0)
				(0) edge[bend left] node[label] {$1$} (1)
			;
		\end{tikzpicture}
	\end{center}
\end{minipage}
\hfill
\begin{minipage}{0.48\textwidth}
	\begin{center}
		$G_3$

		\begin{tikzpicture}[scale = 0.9]
			\begin{scope}[layer = nodes]
				\node[main] (00) at (0, 0) {\texttt{00}};
				\node[main] (01) at (2, 1) {\texttt{01}};
				\node[main] (10) at (2, -1) {\texttt{10}};
				\node[main] (11) at (4, 0) {\texttt{11}};
			\end{scope}

			\draw[->]
				(00) edge[loop left] node[label] {$0$} (00)
				(11) edge[loop right] node[label] {$1$} (11)
				(00) edge[bend left] node[label] {$1$} (01)
				(01) edge[bend left] node[label] {$0$} (10)
				(10) edge[bend left] node[label] {$1$} (01)
				(10) edge[bend left] node[label] {$0$} (00)
				(01) edge[bend left] node[label] {$1$} (11)
				(11) edge[bend left] node[label] {$0$} (10)
			;
		\end{tikzpicture}
	\end{center}
\end{minipage}
\hfill\null

\vfill
\pagebreak


\problem{}
Draw $G_4$.

\begin{solution}
	\begin{center}
		\begin{tikzpicture}
			\begin{scope}[layer = nodes]
				\node[main] (7) at (0, 0) {\texttt{111}};
				\node[main] (3) at (0, -2) {\texttt{011}};
				\node[main] (6) at (2, -2) {\texttt{110}};
				\node[main] (4) at (4, -2) {\texttt{100}};
				\node[main] (1) at (-4, -4) {\texttt{001}};
				\node[main] (5) at (0, -4) {\texttt{101}};
				\node[main] (2) at (-2, -4) {\texttt{010}};
				\node[main] (0) at (-2, -6) {\texttt{000}};
			\end{scope}

			\draw[->]
				(0) edge[loop left, looseness = 7] node[label] {\texttt{0}} (0)
				(7) edge[loop above, looseness = 7] node[label] {\texttt{1}} (7)

				(0) edge[out=90,in=-90] node[label] {\texttt{1}} (1)
				(1) edge node[label] {\texttt{0}} (2)
				(1) edge[out=45,in=-135] node[label] {\texttt{1}} (3)
				(2) edge[bend left] node[label] {\texttt{1}} (5)
				(3) edge node[label] {\texttt{0}} (6)
				(3) edge node[label] {\texttt{1}} (7)
				(5) edge[bend left] node[label] {\texttt{0}} (2)
				(5) edge node[label] {\texttt{1}} (3)
				(6) edge[bend left] node[label] {\texttt{0}} (4)
				(6) edge[out=-90,in=0] node[label] {\texttt{1}} (5)
				(7) edge[out=0,in=90] node[label] {\texttt{0}} (6)
			;

			\draw[->, rounded corners = 10mm]
				(4) to (4, 2) to node[label] {\texttt{1}} (-4, 2) to (1)
			;

			\draw[->, rounded corners = 10mm]
				(4) to (4, -6) to node[label] {\texttt{0}} (0)
			;

			\draw[->, rounded corners = 5mm]
				(2) to (-2, -5) to node[label] {\texttt{0}} (3, -5) to (3, -2) to (4)
			;
		\end{tikzpicture}
	\end{center}

	\begin{instructornote}
		This graph also appears as a solution to a different
		problem in the DFA handout.
	\end{instructornote}
\end{solution}

\vfill
\pagebreak

\problem{}
\begin{itemize}
	\item Show that $G_n$ has $2^{n-1}$ nodes and $2^n$ edges;
	\item that each node has two outgoing edges;
	\item and that there are as many edges labeled $0$ as are labeled $1$.
\end{itemize}

\begin{solution}
	\begin{itemize}
		\item There are $2^{n-1}$ binary words of length $n-1$.
		\item The suffix of a given word is the prefix of two other words, \par
		so there are two edges leaving each node.
		\item One of those words will end with one, and the other will end with zero.
		\item Our $2^{n-1}$ nodes each have $2$ outgoing edges---we thus have $2^n$ edges in total.
	\end{itemize}
\end{solution}

\vfill

\problem{}<dbpath>
Show that $G_n$ always contains an Eulerian path. \par
\hint{\ref{eulerexists}}

\vfill

\theorem{}<dbeuler>
We can now easily construct De Bruijn words for a given $n$: \par
\begin{itemize}
	\item Construct $G_n$,
	\item find an Eulerian cycle in $G_n$,
	\item then, construct a De Bruijn word by writing the label of our starting vertex,
	then appending the label of every edge we travel.
\end{itemize}

\problem{}
Find De Bruijn words of orders $2$, $3$, and $4$.

\begin{solution}
	\begin{itemize}
		\item
		One Eulerian cycle in $G_2$ starts at node \texttt{0}, and takes the edges labeled $[1, 1, 0, 0]$. \par
		We thus have the word \texttt{01100}.

		\item
		In $G_3$, we have an Eulerian cycle that visits nodes in the following order: \par
		$
		\texttt{00}
		\rightarrow \texttt{01}
		\rightarrow \texttt{11}
		\rightarrow \texttt{11}
		\rightarrow \texttt{10}
		\rightarrow \texttt{01}
		\rightarrow \texttt{10}
		\rightarrow \texttt{00}
		\rightarrow \texttt{00}
		$\par
		This gives us the word \texttt{0011101000}.

		\item Similarly, $G_4$ gives us the word \texttt{0001 0011 0101 1110 000}. \par
		\note{Spaces have been added for convenience.}
	\end{itemize}
\end{solution}

\vfill
\pagebreak


Let's quickly show that the process described in \ref{dbeuler}
indeed produces a valid De Bruijn word.

\problem{}<dblength>
How long will a word generated by the above process be?

\begin{solution}
	A De Bruijn graph has $2^n$ edges, each of which is traversed exactly once.
	The starting node consists of $n - 1$ letters.

	\vspace{2mm}

	Thus, the resulting word contains $2^n + n - 1$ symbols.
\end{solution}

\vfill

\problem{}<dbsubset>
Show that a word generated by the process in \ref{dbeuler}
contains every possible length-$n$ subword. \par
In other words, show that $\mathcal{S}_n(w) = 2^n$ for a generated word $w$.

\begin{solution}
	Any length-$n$ subword of $w$ is the concatenation of a vertex label and an edge label.
	By construction, the next length-$n$ subword is the concatenation of the next vertex and edge
	in the Eulerian cycle.

	\vspace{2mm}

	This cycle traverses each edge exactly once, so each length-$n$ subword is distinct. \par
	Since $w$ has length $2^n + n - 1$, there are $2^n$ total subwords. \par
	These are all different, so $\mathcal{S}_n(w) \geq 2^n$. \par
	However, $\mathcal{S}_n(w) \leq 2^n$ by \ref{sbounds}, so $\mathcal{S}_n(w) = 2^n$.

\end{solution}

\vfill

\remark{}
\begin{itemize}
	\item We found that \ref{dbeuler} generates a word with length $2^n + n - 1$ in \ref{dblength}, \par
	\item and we showed that this word always solves the $n$-subword problem in \ref{dbsubset}.

	\item From \ref{dbbounds}, we know that any solution to the binary $n$-subword problem \par
	must have at least $2^n + n - 1$ letters.

	\item Finally, \ref{dbpath} guarantees that it is possible to generate such a word in any $G_n$.
\end{itemize}

Thus, we have shown that the process in \ref{dbeuler} generates ideal solutions
to the $n$-subword problem, and that such solutions always exist.
We can now conclude that for any $n$, the binary $n$-subword problem may be solved with a word of length $2^n + n - 1$.

\pagebreak