207 lines
6.9 KiB
TeX
Raw Normal View History

2024-03-20 19:38:35 -07:00
\section{Words}
\definition{}
An \textit{alphabet} is a set of symbols. \par
For example, $\{\texttt{0}, \texttt{1}\}$ is an alphabet of two symbols, \par
and $\{\texttt{a}, \texttt{b}, \texttt{c}\}$ is an alphabet of three.
\definition{}
A \textit{word} over an alphabet $A$ is a sequence of symbols in that alphabet. \par
For example, $\texttt{00110}$ is a word over the alphabet $\{\texttt{0}, \texttt{1}\}$. \par
We'll let $\varnothing$ denote the empty word, which exists over every alphabet.
\definition{}
Let $v$ and $w$ be words over the same alphabet. \par
We say $v$ is a \textit{subword} of $w$ if $v$ is contained in $w$. \par
For example, \texttt{11} is a subword of \texttt{011}, but \texttt{00} is not.
\definition{}
Recall \ref{lockproblem}. From now on, we'll call this the \textit{$n$-subword problem}: \par
Given an alphabet $A$ and a positive integer $n$,
we want a word over $A$ that contains all possible length-$n$ subwords. \par
The shortest word that solves a given $n$-subword problem is called the \textit{optimal solution}.
\problem{}
List all subwords of \texttt{110}. \par
\hint{There are six.}
\begin{solution}
They are $\varnothing$, \texttt{0}, \texttt{1}, \texttt{10}, \texttt{11}, and \texttt{110}.
\end{solution}
\vfill
\definition{}
Let $\mathcal{S}_n(w)$ be the number of distinct subwords of length $n$ in a word $w$.
\problem{}
Find the following:
\begin{itemize}
\item $\mathcal{S}_n(\texttt{101001})$ for $n \in \{0, 1, ..., 6\}$
\item $\mathcal{S}_n(\texttt{abccac})$ for $n \in \{0, 1, ..., 6\}$
\end{itemize}
\begin{solution}
In order from $\mathcal{S}_0$ to $\mathcal{S}_6$:
\begin{itemize}
\item 1, 2, 3, 4, 3, 2, 1
\item 1, 3, 5, 4, 3, 2, 1
\end{itemize}
\end{solution}
\vfill
\pagebreak
2024-03-22 16:55:08 -07:00
\problem{}<sbounds>
2024-03-20 19:38:35 -07:00
Let $w$ be a word over an alphabet of size $k$. \par
Prove the following:
\begin{itemize}
\item $\mathcal{S}_n(w) \leq k^n$
\item $\mathcal{S}_n(w) \geq \mathcal{S}_{n-1}(w) - 1$
\item $\mathcal{S}_n(w) \leq k \times \mathcal{S}_{n-1}(w)$
\end{itemize}
\begin{solution}
\begin{itemize}
\item There are $k$ choices for each of $n$ letters in the subword.
So, there are $k^n$ possible words of length $n$, and $\mathcal{S}_n(w) \leq k^n$.
\item For almost every distinct subword counted by $\mathcal{S}_{n-1}$,
concatenating the next letter creates a distinct length $n$ subword.
The only exception is the last subword with length $n-1$, so
$\mathcal{S}_n(w) \geq \mathcal{S}_{n-1}(w) - 1$
\item For each subword counted by $\mathcal{S}_{n-1}$, there are $k$ possibilities
for the letter that follows in $w$. Each element in the count $\mathcal{S}_n$ comes from
one of $k$ different length $n$ words starting with an element counted by $\mathcal{S}_{n-1}$.
Thus, $\mathcal{S}_n(w) \leq k \times \mathcal{S}_{n-1}(w)$
\end{itemize}
\end{solution}
\vfill
\pagebreak
\definition{}
Let $v$ and $w$ be words over the same alphabet. \par
The word $vw$ is the word formed by writing $w$ after $v$. \par
For example, if $v = \texttt{1001}$ and $w = \texttt{10}$, $vw$ is $\texttt{100110}$.
\problem{}
Let $F_k$ denote the word over the alphabet $\{\texttt{0}, \texttt{1}\}$ obtained from the following relation:
\begin{equation*}
F_0 = \texttt{0}; ~~ F_1 = \texttt{1}; ~~ F_k = F_{k-1}F_{k-2}
\end{equation*}
We'll call this the \textit{Fibonacci word} of order $k$.
\begin{itemize}
\item What are $F_3$, $F_4$, and $F_5$?
\item Compute $\mathcal{S}_0$ through $\mathcal{S}_5$ for $F_5$.
\item Show that the length of $F_k$ is the $(k + 2)^\text{th}$ Fibonacci number. \par
\hint{Induction.}
\end{itemize}
\begin{solution}
\begin{itemize}
\item $F_3 = \texttt{101}$
\item $F_4 = \texttt{10110}$
\item $F_5 = \texttt{10110101}$
\end{itemize}
\linehack{}
\begin{itemize}
\item $\mathcal{S}_0 = 1$
\item $\mathcal{S}_1 = 2$
\item $\mathcal{S}_2 = 3$
\item $\mathcal{S}_3 = 4$
\item $\mathcal{S}_4 = 5$
\item $\mathcal{S}_5 = 4$
\end{itemize}
\linehack
As stated, use induction.
Let $N_k$ represent the Fibonacci numbers, with $N_0 = 0$, $N_1 = 1$, and $N_{k} = N_{k-1} + N_{k-2}$. \par
Counting $N_0$ as the first Fibonacci number, the $(k + 2)^\text{th}$ Fibonacci number is $N_{k+1}$. \par
The base cases hold, since $|F_0| = 1 = N_1$ and $|F_1| = 1 = N_2$.
\vspace{2mm}
Now fix $k \geq 2$ and assume that $F_j$ has length $N_{j+1}$ for all $j < k$.
We want to show that $F_{k}$ has length $N_{k+1}$. \par
Since $F_{k} = F_{k-1}F_{k-2}$, it has the length $|F_{k-1}| + |F_{k-2}|$. \par
By our assumption, $|F_{k-1}| = N_{k}$ and $|F_{k-2}| = N_{k-1}$. \par
So, $|F_{k}| = |F_{k-1}| + |F_{k-2}| = N_{k} + N_{k-1} = N_{k + 1}$.
\end{solution}
\vfill
\pagebreak
% C_k is called the "Champernowne word" of order k.
\problem{}<cword>
Let $C_k$ denote the word over the alphabet $\{\texttt{0}, \texttt{1}\}$ obtained by \par
concatenating the binary representations of the integers $0,~...,~2^k -1$. \par
For example, $C_1 = \texttt{01}$, $C_2 = \texttt{011011}$, and $C_3 = \texttt{011011100101110111}$.
\begin{itemize}
\item How many symbols does the word $C_k$ contain?
\item Compute $\mathcal{S}_0$, $\mathcal{S}_1$, $\mathcal{S}_2$, and $\mathcal{S}_3$ for $C_3$.
\item Show that $\mathcal{S}_k(C_k) = 2^k - 1$.
\item Show that $\mathcal{S}_n(C_k) = 2^n$ for $n < k$.
\end{itemize}
\hint{
If $v$ is a subword of $w$ and $w$ is a subword of $u$, $v$ must be a subword of $u$. \par
In other words, the \say{subword} relation is transitive.
}
\begin{solution}
$\mathcal{S}_0 = 1$, $\mathcal{S}_1 = 2$, $\mathcal{S}_2 = 4$, and $\mathcal{S}_3 = 7$.
\linehack{}
First, we show that $\mathcal{S}_k(C_k) = 2^k - 1$. \par
Consider an arbitrary word $w$ of length $k$. We'll consider three cases:
\begin{itemize}
\item If $w$ consists only of zeros, $w$ does not appear in $C_k$.
\item If $w$ starts with a \texttt{1}, $w$ must appear in $C_k$ by construction.
\item If $w$ starts with a \texttt{0} and contains a \texttt{1}, $w$ has the form
$\texttt{0}^x\texttt{1}\overline{\texttt{y}}$ \par
\note{
That is, $x$ copies of \texttt{0} followed by a \texttt{1}, followed by \par
an arbitrary sequence $\overline{\texttt{y}}$ with length $(k-x-1)$.
} \par
Now consider the word $\texttt{1}\overline{\texttt{y}}\texttt{0}^x\texttt{1}\overline{\texttt{y}}\texttt{0}^{(x-1)}\texttt{1}$. \par
This is the concatenation of two consecutive binary numbers with $k$ digits, and thus appears in $C_k$.
$w$ is a subword of this word, and therefore also appears in $C_k$.
\end{itemize}
\linehack{}
We can use the above result to conclude that $\mathcal{S}_n(C_k) = 2^n$ for $n < k$: \par
If we take any word of length $n < k$ and repeatedly append \texttt{1} to create a word of length $k$, \par
we end up with a subword of $C_k$ by the reasoning above. \par
Thus, any word of length $n$ is a subword of $C_k$. There are $2^n$ such words, so $\mathcal{S}_n(C_k) = 2^n$.
\end{solution}
\vfill
\problem{}
Convince yourself that $C_{n+1}$ provides a solution to the $n$-subword problem over $\{\texttt{0}, \texttt{1}\}$. \par
\note[Note]{$C_{n+1}$ may or may not be an \textit{optimal} solution---but it is a \textit{valid} solution} \par
Which part of \ref{cword} shows that this is true?
\pagebreak