diff --git a/Advanced/Compression/main.tex b/Advanced/Compression/main.tex new file mode 100755 index 0000000..33012d2 --- /dev/null +++ b/Advanced/Compression/main.tex @@ -0,0 +1,28 @@ +% use [nosolutions] flag to hide solutions. +% use [solutions] flag to show solutions. +\documentclass[ + solutions, + singlenumbering, + unfinished +]{../../resources/ormc_handout} +\usepackage{../../resources/macros} + +\input{tikzset.tex} + + +\uptitlel{Advanced 2} +\uptitler{\smallurl{}} +\title{Compression} +\subtitle{Prepared by Mark on \today{}} + + +\begin{document} + + \maketitle + + \input{parts/0 intro.tex} + \input{parts/1 runlength.tex} + \input{parts/2 lzss.tex} + \input{parts/3 huffman.tex} + +\end{document} \ No newline at end of file diff --git a/Advanced/Compression/media/box.png b/Advanced/Compression/media/box.png new file mode 100644 index 0000000..42cb086 Binary files /dev/null and b/Advanced/Compression/media/box.png differ diff --git a/Advanced/Compression/media/noise.png b/Advanced/Compression/media/noise.png new file mode 100644 index 0000000..8b6dee6 Binary files /dev/null and b/Advanced/Compression/media/noise.png differ diff --git a/Advanced/Compression/parts/0 intro.tex b/Advanced/Compression/parts/0 intro.tex new file mode 100644 index 0000000..eda06ca --- /dev/null +++ b/Advanced/Compression/parts/0 intro.tex @@ -0,0 +1,37 @@ +\section{Introduction} + +\definition{} +An \textit{alphabet} is a set of symbols. Two examples are +$\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$ and $\{\texttt{0}, \texttt{1}\}$. + +\definition{} +A \textit{string} is a sequence of symbols from an alphabet. \par +For example, \texttt{CBCAADDD} is a string over the alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$. + +\problem{} +Say we want to store a length-$n$ string over the alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$ as a binary blob. \par +How many bits will we need? 
\par +\hint{ + Our alphabet has four symbols, so we can encode each symbol using two bits, \par + mapping $\texttt{A} \rightarrow \texttt{00}$, + $\texttt{B} \rightarrow \texttt{01}$, + $\texttt{C} \rightarrow \texttt{10}$, and + $\texttt{D} \rightarrow \texttt{11}$. +} + +\begin{solution} + $2n$ bits. +\end{solution} + +\vfill + + +\problem{} +Similarly, we can use a na\"ive coding scheme to encode an $n$-symbol string over an alphabet of size $k$ \par +using $n \times \lceil \log_2k \rceil$ bits. Convince yourself that this is true. + + +\vfill +Of course, this isn't ideal---we can do much better than $n \times \lceil \log_2k \rceil$. +We will spend the rest of this handout exploring more efficient ways of encoding such sequences of symbols. +\pagebreak diff --git a/Advanced/Compression/parts/1 runlength.tex b/Advanced/Compression/parts/1 runlength.tex new file mode 100644 index 0000000..6b4d5e6 --- /dev/null +++ b/Advanced/Compression/parts/1 runlength.tex @@ -0,0 +1,145 @@ +% TODO: +% Basic run-length +% LZ77 + +\section{Run-length Coding} + + +\definition{} +\textit{Entropy} is a measure of information in a certain sequence. \par +A sequence with high entropy contains a lot of information, and a sequence with low entropy contains relatively little. +For example, consider the following two ten-symbol ASCII\footnotemark{} strings: +\begin{itemize} + \item \texttt{AAAAAAAAAA} + \item \texttt{pDa3:7?j;F} +\end{itemize} +The first string clearly contains less information than the second. +It's much harder to describe \texttt{pDa3:7?j;F} than it is \texttt{AAAAAAAAAA}. +Thus, we say that the first has low entropy, and the second has fairly high entropy. + +\vspace{2mm} + +The definition above is intentionally hand-wavy. \par +Formal definitions of entropy exist, but we won't need them today---we just need +an intuitive understanding of the \say{density} of information in a given string. 
+ + +\footnotetext{ + American Standard Code for Information Interchange, an early character encoding for computers. \par + It contains 128 symbols, including numbers, letters, and + \texttt{!"\#\$\%\&`()*+,-./:;<=>?@[\textbackslash]\textasciicircum\_\{|\}\textasciitilde} +} + + +\vspace{5mm} + + +\problem{} +Using a na\"ive coding scheme, encode \texttt{AAAA$\cdot$AAAA$\cdot$BCD$\cdot$AAAA$\cdot$AAAA} as a binary blob. \par +\note[Note]{ + We're still using the four-symbol alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$. \par + Dots ($\cdot$) in the string are drawn for readability. Ignore them. +} + +\begin{solution} + There are eight \texttt{A}s on each end of that string. Mapping symbols as before, \par + we get \texttt{[00 00 00 00 00 00 00 00 01 10 11 00 00 00 00 00 00 00 00]} +\end{solution} + + +\vfill +In \ref{runlenone}---and often, in the real world---the strings we want to encode have fairly low entropy. +We can leverage this fact to develop efficient encoding schemes. + +\example{} +The simplest such coding scheme is \textit{run-length encoding}. Instead of simply listing letters of a string +in their binary form, we'll add a \textit{count} to each letter, compressing repeated sequences of the same symbol. 
+ +\vspace{2mm} + +We'll encode our string into a sequence of 6-bit blocks, interpreted as follows: + +\begin{center} + \begin{tikzpicture} + \node[anchor=west,color=gray] at (-2.3, 0) {Bits}; + \node[anchor=west,color=gray] at (-2.3, -0.5) {Meaning}; + \draw[color=gray] (-2.3, -0.25) -- (5.5, -0.25); + \draw[color=gray] (-2.3, 0.15) -- (-2.3, -0.65); + + \node at (0, 0) {\texttt{0}}; + \node at (1, 0) {\texttt{0}}; + \node at (2, 0) {\texttt{1}}; + \node at (3, 0) {\texttt{1}}; + \node at (4, 0) {\texttt{0}}; + \node at (5, 0) {\texttt{1}}; + + \draw (-0.5, 0.25) -- (5.5, 0.25); + \draw (-0.5, -0.25) -- (5.5, -0.25); + \draw (-0.5, -0.75) -- (5.5, -0.75); + + \draw (-0.5, 0.25) -- (-0.5, -0.75); + \draw (3.5, 0.25) -- (3.5, -0.75); + \draw (5.5, 0.25) -- (5.5, -0.75); + + \node at (1.5, -0.5) {number of copies}; + \node at (4.5, -0.5) {symbol}; + \end{tikzpicture} +\end{center} +So, the sequence \texttt{BBB} will be encoded as \texttt{[0011-01]}. \par +\note[Notation]{Just like spaces, dashes in a binary blob are added for readability.} + +\problem{} +Encode \texttt{AAAA$\cdot$AAAA$\cdot$BCD$\cdot$AAAA$\cdot$AAAA} using this scheme. \par +Is this more or less efficient than \ref{runlenone}? + +\begin{solution} + \texttt{[1000-00 0001-01 0001-10 0001-11 1000-00]} \par + This requires 30 bits, as compared to 38 in \ref{runlenone}. +\end{solution} + +\vfill +\pagebreak + +\problem{} +Is run-length coding always efficient? When does it work well, and when does it fail? + +\vfill + + +\problem{} +Our coding scheme wastes a lot of space when our string has few runs of the same symbol. \par +Fix this problem: modify the scheme so that single occurrences of symbols do not waste space. \par +\hint{We don't need a run length for every symbol. 
We only need one for \textit{repeated} symbols.} + +\begin{solution} + One idea is as follows: \par + \begin{itemize} + \item Encode single symbols na\"ively: \texttt{ABCD} becomes \texttt{[00 01 10 11]} + \item Signal runs using two copies of the same symbol: \texttt{AAAAAA} becomes \texttt{[00 00 0110]}. \par + When our decoder sees two copies of the same symbol, it will interpret the next four bits as + a run length. + \end{itemize} + \texttt{BDC$\cdot$DDDDD$\cdot$AADBDC} will be encoded as \texttt{[01 11 10 11-11-0101 00-00-0010 11 01 11 10]}. +\end{solution} + +\vfill + +\problem{} +Consider the following string: \texttt{ABCD$\cdot$ABCD$\cdot$BABABA$\cdot$ABCD$\cdot$ABCD}. \par +\begin{itemize} + \item How many bits do we need to encode this na\"ively? \par + \item How about with the (unmodified) run-length scheme described above? +\end{itemize} +\hint{You don't need to encode this string---just find the length of its encoded form.} + +\begin{solution} + Na\"ively: \tab $2 \times 22 = 44$ bits \par + Run-length: \tab $6 \times 21 = 126$ bits. Watch out for the two repeated \texttt{A}s! +\end{solution} + + +\vfill + +Neither solution to \ref{firstlz} is ideal. Run-length is very wasteful due to the lack of runs, and na\"ive coding +does not take advantage of repetition in the string. We'll need a better coding scheme. +\pagebreak diff --git a/Advanced/Compression/parts/2 lzss.tex b/Advanced/Compression/parts/2 lzss.tex new file mode 100644 index 0000000..c710993 --- /dev/null +++ b/Advanced/Compression/parts/2 lzss.tex @@ -0,0 +1,155 @@ +\section{LZ Codes} + +The LZ-family\footnotemark{} of codes (LZ77, LZ78, LZSS, LZMA, and others) take advantage of repeated sequences of symbols +in a string. They are the basis of most modern compression algorithms, including DEFLATE, which is used in the ZIP, PNG, +and GZIP formats. + +\footnotetext{ + Named after Abraham Lempel and Jacob Ziv, the original inventors. 
\par + LZ77 is the algorithm described in their first paper on the topic, which was published in 1977. \par + LZ78, LZSS, and LZMA are minor variations on the same general idea. +} + +\vspace{2mm} + +The idea behind LZ is to represent repeated substrings as \textit{pointers} to previous parts of the string. \par +Pointers take the form \texttt{<pos, len>}, where \texttt{pos} is the position of the string to repeat and +\texttt{len} is the number of symbols to copy. + +\vspace{2mm} + +For example, we can encode the string \texttt{ABRACADABRA} as \texttt{[ABRACAD<7, 4>]}. \par +The pointer \texttt{<7, 4>} tells us to look back 7 positions (to the first \texttt{A}), and copy the next 4 symbols. \par +Note that pointers refer to the partially decoded output---\textit{not} to the encoded string. \par +This allows pointers to reference other pointers, and ensures codes like \texttt{A<1,9>} are valid. + +\problem{} +Encode \texttt{ABCD$\cdot$ABCD$\cdot$BABABA$\cdot$ABCD$\cdot$ABCD} using LZ. +Then, decode the following: +\begin{itemize} + \item \texttt{[ABCD<4,4>]} + \item \texttt{[A<1,9>]} + \item \texttt{[DAC<3,5>]} +\end{itemize} + +\begin{solution} + + \texttt{ABCD$\cdot$ABCD$\cdot$BABABA$\cdot$ABCD$\cdot$ABCD} becomes \texttt{[ABCD<4, 4> BA<2,4> ABCD<4,4>]}. + + \linehack{} + + In parts two and three, remember that we're reading the \textit{output string.} \par + The nine \texttt{A}s in part two are produced one by one, \par + with the decoder's \say{read head} following its \say{write head.} + + \begin{itemize} + \item \texttt{ABCD$\cdot$ABCD} + \item \texttt{AAAAA$\cdot$AAAAA} + \item \texttt{DACDACDA} + \end{itemize} +\end{solution} + +\vfill + +\problem{} +Convince yourself that LZ is a generalization of the run-length code we discussed in the previous section. 
+\hint{\texttt{[A<1,9>]} and \texttt{[00-1001]} are the same thing!} + +\remark{} +Note that we left a few things out of this section: we didn't discuss the algorithm that converts a string to an LZ-encoded blob, +nor did we discuss how we should represent strings encoded with LZ in binary. We skipped these details because they are +problems of implementation---they're the engineer's headache, not the mathematician's. If you're interested, a brief explanation is below. +Ask an instructor to explain. + +\begin{center} + \begin{tikzpicture} + \node[anchor=west,color=gray] at (-2.3, 0) {Bits}; + \node[anchor=west,color=gray] at (-2.3, -0.5) {Meaning}; + \draw[color=gray] (-2.3, -0.25) -- (5.5, -0.25); + \draw[color=gray] (-2.3, 0.15) -- (-2.3, -0.65); + + \node at (0, 0) {\texttt{0}}; + \node at (1, 0) {\texttt{0}}; + \node at (2, 0) {\texttt{1}}; + \node at (3, 0) {\texttt{0}}; + \node at (4, 0) {\texttt{1}}; + \node at (5, 0) {\texttt{1}}; + \node at (6, 0) {\texttt{0}}; + \node at (7, 0) {\texttt{0}}; + \node at (8, 0) {\texttt{1}}; + + \draw (-0.5, 0.25) -- (8.5, 0.25); + \draw (-0.5, -0.25) -- (8.5, -0.25); + \draw (-0.5, -0.75) -- (8.5, -0.75); + + \draw (-0.5, 0.25) -- (-0.5, -0.75); + \draw (0.5, 0.25) -- (0.5, -0.75); + \draw (8.5, 0.25) -- (8.5, -0.75); + + \node at (0, -0.5) {flag}; + \node at (4.5, -0.5) {if flag \texttt{<pos, len>}, else eight-bit symbol}; + \end{tikzpicture} +\end{center} + + +\begin{center} + \begin{tikzpicture} + % Text tape + \node[color=gray] at (-0.75, 0) {\texttt{...}}; + \node[color=gray] at (0.0, 0) {\texttt{D}}; + \node at (0.5, 0) {\texttt{A}}; + \node at (1.0, 0) {\texttt{B}}; + \node at (1.5, 0) {\texttt{C}}; + \node at (2.0, 0) {\texttt{D}}; + \node at (2.5, 0) {\texttt{A}}; + \node at (3.0, 0) {\texttt{B}}; + \node at (3.5, 0) {\texttt{C}}; + \node at (4.0, 0) {\texttt{D}}; + \node[color=gray] at (4.5, 0) {\texttt{B}}; + \node[color=gray] at (5.0, 0) {\texttt{D}}; + \node[color=gray] at (5.5, 0) {\texttt{A}}; + \node[color=gray] at 
(6.0, 0) {\texttt{C}}; + \node[color=gray] at (6.75, 0) {\texttt{...}}; + + \draw (-1.75, 0.25) -- (7.25, 0.25); + \draw (-1.75, -0.25) -- (7.25, -0.25); + + + \draw[line width = 0.7mm, color=oblue, dotted] (2.25, 0.5) -- (2.25, -0.5); + \draw[line width = 0.7mm, color=oblue] + (-1.25, 0.5) + -- (4.25, 0.5) + -- (4.25, -0.5) + -- (-1.25, -0.5) + -- cycle + ; + + \draw + (4.2, -0.625) + -- (4.2, -0.75) + to node[anchor=north, midway] {lookahead} (2.3, -0.75) + -- (2.3, -0.625) + ; + + \draw + (2.2, -0.625) + -- (2.2, -0.75) + to node[anchor=north, midway] {search buffer} (-1.1, -0.75) + -- (-1.1, -0.625) + ; + + \draw[color=gray] + (2.2, 0.625) + -- (2.2, 0.75) + to node[anchor=south, midway] {match!} (0.3, 0.75) + -- (0.3, 0.625) + ; + + %\draw[->, color=gray] (2.5, 0.3) -- (2.5, 0.8) to[out=90,in=90] (0.5, 0.8); + \node at (7.0, -0.75) {Result: \texttt{[$\cdot\cdot\cdot$DABCD<4,4>$\cdot\cdot\cdot$]}}; + \end{tikzpicture} +\end{center} + + +\vfill +\pagebreak \ No newline at end of file diff --git a/Advanced/Compression/parts/3 huffman.tex b/Advanced/Compression/parts/3 huffman.tex new file mode 100644 index 0000000..5824fc6 --- /dev/null +++ b/Advanced/Compression/parts/3 huffman.tex @@ -0,0 +1,27 @@ +\section{Huffman Codes} + + +\remark{} +As a first example, consider the alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}, \texttt{E}\}$. \par +With a na\"ive coding scheme, we can encode a length-$n$ string with $3n$ bits, by mapping... +\begin{itemize} + \item $\texttt{A}$ to $\texttt{000}$ + \item $\texttt{B}$ to $\texttt{001}$ + \item $\texttt{C}$ to $\texttt{010}$ + \item $\texttt{D}$ to $\texttt{011}$ + \item $\texttt{E}$ to $\texttt{100}$ +\end{itemize} +With this scheme, the string \texttt{ADEBCE} becomes \texttt{[000 011 100 001 010 100]}. \par +This matches what we computed in \ref{naivelen}: ~ $6 \times \lceil \log_2(5) \rceil = 6 \times 3 = 18$. 
\par +\note[Notation]{ + The spaces in \texttt{[000 011 100 001 010 100]} are provided for convenience. \par + This is equivalent to \texttt{[000011100001010100]}, but is easier to read. \par + In this handout, encoded binary blobs will always be written in square brackets. +} + +\vspace{2mm} + +You could argue that this coding scheme is wasteful: we're not using three of the eight possible three-bit sequences! + +\vfill +\pagebreak \ No newline at end of file diff --git a/Advanced/Compression/tikzset.tex b/Advanced/Compression/tikzset.tex new file mode 100644 index 0000000..d83fa32 --- /dev/null +++ b/Advanced/Compression/tikzset.tex @@ -0,0 +1,65 @@ +\usetikzlibrary{arrows.meta} +\usetikzlibrary{shapes.geometric} +\usetikzlibrary{patterns} + +% We put nodes in a separate layer, so we can +% slightly overlap with paths for a perfect fit +\pgfdeclarelayer{nodes} +\pgfdeclarelayer{path} +\pgfsetlayers{main,nodes} + +% Layer settings +\tikzset{ + % Layer hack, lets us write + % layer = * in scopes. + layer/.style = { + execute at begin scope={\pgfonlayer{#1}}, + execute at end scope={\endpgfonlayer} + }, + % + % Arrowhead tweak + >={Latex[ width=2mm, length=2mm ]}, + % + % Labels inside edges + label/.style = { + rectangle, + % For automatic red background in solutions + fill = \ORMCbgcolor, + draw = none, + rounded corners = 0mm + }, + % + % Nodes + main/.style = { + draw, + circle, + fill = white, + line width = 0.35mm + }, + % + % Loop tweaks + loop above/.style = { + min distance = 2mm, + looseness = 8, + out = 45, + in = 135 + }, + loop below/.style = { + min distance = 5mm, + looseness = 10, + out = 315, + in = 225 + }, + loop right/.style = { + min distance = 5mm, + looseness = 10, + out = 45, + in = 315 + }, + loop left/.style = { + min distance = 5mm, + looseness = 10, + out = 135, + in = 215 + } +} \ No newline at end of file