From 173705112ff307c38f2f7b4688bea2397095dab9 Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 12 Apr 2024 13:11:24 -0700 Subject: [PATCH] Added compression parts --- Advanced/Compression/main.tex | 28 ++++ Advanced/Compression/media/box.png | Bin 0 -> 4543 bytes Advanced/Compression/media/noise.png | Bin 0 -> 5630 bytes Advanced/Compression/parts/0 intro.tex | 37 +++++ Advanced/Compression/parts/1 runlength.tex | 145 +++++++++++++++++++ Advanced/Compression/parts/2 lzss.tex | 155 +++++++++++++++++++++ Advanced/Compression/parts/3 huffman.tex | 27 ++++ Advanced/Compression/tikzset.tex | 65 +++++++++ 8 files changed, 457 insertions(+) create mode 100755 Advanced/Compression/main.tex create mode 100644 Advanced/Compression/media/box.png create mode 100644 Advanced/Compression/media/noise.png create mode 100644 Advanced/Compression/parts/0 intro.tex create mode 100644 Advanced/Compression/parts/1 runlength.tex create mode 100644 Advanced/Compression/parts/2 lzss.tex create mode 100644 Advanced/Compression/parts/3 huffman.tex create mode 100644 Advanced/Compression/tikzset.tex diff --git a/Advanced/Compression/main.tex b/Advanced/Compression/main.tex new file mode 100755 index 0000000..33012d2 --- /dev/null +++ b/Advanced/Compression/main.tex @@ -0,0 +1,28 @@ +% use [nosolutions] flag to hide solutions. +% use [solutions] flag to show solutions. 
+\documentclass[ + solutions, + singlenumbering, + unfinished +]{../../resources/ormc_handout} +\usepackage{../../resources/macros} + +\input{tikzset.tex} + + +\uptitlel{Advanced 2} +\uptitler{\smallurl{}} +\title{Compression} +\subtitle{Prepared by Mark on \today{}} + + +\begin{document} + + \maketitle + + \input{parts/0 intro.tex} + \input{parts/1 runlength.tex} + \input{parts/2 lzss.tex} + \input{parts/3 huffman.tex} + +\end{document} \ No newline at end of file diff --git a/Advanced/Compression/media/box.png b/Advanced/Compression/media/box.png new file mode 100644 index 0000000000000000000000000000000000000000..42cb086cb521d5aad496cf878bc53b4b142bc8db GIT binary patch literal 4543 zcmeHLdu$X%7~iASVzm`(D+DpFOArzF_IbB=Ymc_}N_)aVEd`2TWFNEb(%$WLZ`)pp z4@AWWK6nTLtOg^3AweZTB!VDOM8pR%;EM#CA`xR0jmDx7@teJCdyOWhiTu~So158h ze)F4ee)G+DxA#QtjOv2?vH1i+6hvymbx@b%GV*fx+-kO-gz6}2h!~N`t;9&U0i-9F z@L&uSPgZ+`Cx^%Z+cGFc(8d^C#(FMu>!+Z7eWv{!v~S5^@EFcBf+&Ex3knbIVyL%5 zF&|6MwGZ0(8tJ~I5rm8AtE(f?T9Ws2tXJ^Ec$ODs7Qac3lQ~i51eo8malj+Y^!>4F z3H*D+tgNh!R92D++cL}+L=dU2`&Wi)-Yp-q?3>Lcuhi%MNcOCFN&c#;YFu;4leIS; z%ikuiTkvOIbnL3)(bWfbb$5TN9zSq!-rT&o6{9YATH6zK&dQ?w)ADI`@2TC7_3rCE z{nPE`O>@>8j}|T2e8RYSO>y#%+1qgPh9&E8|x|PSsy*oy?5p~gyv0{GVw+B`=#$ayI^kV1E&_)^SamHB|N`= z;xk3dU-8a)Yu%dY)2R)C&5PvIR|cP*{mJ~UU5j4Wb@uSRV4HZO^@e zDn52sMTx&>=MS42J~-O>*`czYKPT9ezTLWHrRMQ9t+=P`+>IB0+BUmu!*8po$j0vu zwoZ9}$;o>*9#3?g)ef025Cu|1IlA!o)xS-xUVeio=fpb)+p1^%zU0Ua3(p;1 z5&de-(w&c}*QCl`wm!_Me6u|D@elLQkK1x}`P#*6_IU~mANr*Ty?%c0p4azS7eC*5 zzBi|2hy9SZXZJ*cD4cKBH#iN^>9S_UyozqA$eW5K@P-kkWvPUsH6w>qQKK0TQD^t; zr$|!|QFDD!HkznJO=itv8_ilgqh4FwtOa$dY)XD7-U$U;dvSmbh0z2vd_O*A{yKsJOdU#EZMKH0+ELsYpG~1_{o@mD zc8j|lU1LxSiUBkUvvPx`#3!zQ1eT!Dj3wMrAod`nV;Tcu4e|~5xXa0Q1cvwX4np^> z-30?G8kNJA)`s1SghLdbU)C+n)MdA+2m!?Lz5p%yg&-|*euWNdz91bGRU}EGz>2z( zMHPuB9VMI-8E;Dsbkv(MG|N%2nw-S(yiXK;VnFaq0bu|%2iZx8M9k^t zPPYediwuhaVHF&wK;YV8Epnxe6vwjbEvqF&^=IAw?kF55U2&AK;vgXH&%^b7d6;G; znN7Qo>7tfy8lC@z#)pS21w3qdjhTe`J6%)yM9o62>AUo8i|HOEl5`J+tZ3;aB$alg 
zyM6*!x=U+P;*AKhM6Of}jQ^!_a)RARwL=C4-|Cl@HKtP!A%(;91284rWhU zhSQc1g#)E-ES!J7I&j24Muk~A-oG97|r507L4`Fyp$1)x%5la4c)K$Rf*PA zq|jmj&WeHrp7sXCF1X9SgsornI)DoI4$;VYAN%NZBGNT@+e4?wn2V| zbkFqX`;2BdvtGh06K zio*}R4tO5)`bX}7XUWUeY1LukB7SXrd&_FDT#=}mmn4WQuf@efZ0H&T#ylqyt;#!- zn_E;^c&wouma{Z>r$~$^fWJ7huZ8nR$G^Pn!J=h#A7XQ)YDW0AI~Fee E8@#~gxBvhE literal 0 HcmV?d00001 diff --git a/Advanced/Compression/media/noise.png b/Advanced/Compression/media/noise.png new file mode 100644 index 0000000000000000000000000000000000000000..8b6dee648b56ad23ba29f74fbe5de02165ee1534 GIT binary patch literal 5630 zcmeHKdpMNa8lOoviM^ddu~TzKD4P3yGNp{o#7IVkRKA({hA-yAj9aCobkU9IMpQ!R zauW7d#8#4&3~VD%=YoQ48y>3^*y>AH zb2|e|&HTA`;_=-vWITz8r_g~vkxXS1)gKIr#3nM>6c#|EkA0bunyJvZ{S#o>f;c$1 z^Bf#7iV(R334&3mh$B%69Oogh?-4^l-wGwuHU?eXNr385rs**ZhSO z>>H$%#6@uz26OKY#2H3y*2Q2cHx4CcP}leLqeVBE4isGVj?%Re01J}b}bnJ{yw_r1pLCwGW) zvjnB_{a&3VSG`X)dk-8L+}ajXdj4jp&k1MfA1?8jg$vA$Js}IXjj2WxP46%M@XpD% zjEj%;H7174Irf9W`5v8(n;+#D#t9Nvc`zruDPi8aHuZM#!~(ND{ZY@EXC`Cb>HBt@ zZ#*;M=7xQ(T>*I(ettg}74-#FPgh*f(w}d7es-}(`~mkl<6ii`agFrrGU-~~|FpUF zTDMkwrgx@1db5LH3HM&lyg&PT(pt(mz3)Gw(1jt9tXe}`5Q7njr?ACYmeFN({s7(~p$E~oK{e1!uXfH0i2B=gDHh~Zx9*z&E;N>Cy1QLtIA`r<0G8qRDIAx?v z1x4UwN((i`Fb5Y_ib4>D3X#h&YEDQf4^?rnSkRC8JU^*|&mX0iDMwTQdJrNY1%ZSo z5~NbXSP!MjF$|E5Oz1B?l%A0anBV~`<)I-W*f9*2sVv4)h()9RiqMcC&2q#d0vrTO z0aOV_C4Dud`oxWTs3q`6qza7}5c?~n3Xyyf>#Mn`do;@#I}zYM%KH`i^V&6Ffa3Gn zT)8MzJv|!BA8AUQb=?LeGJ9o5ClXe z6f|~LYAP|HA`vAtCQD4j(HSfjjw%sRa4cBJz`+y=%oGx7Bv>NRP>DtCU*sWD2y7=J zh5TWHLgugOPz%mp;LhV<$#~+I7WW`XB>@f`tScf5jrh{xiAZ6A3R0^{VvxvW8kI;V z(phvmh4clq3=UC(NK|uba;K(8y)A687$7X9j#EIOX$NazJA}ZHN*>}Vmj`jMqgi*f zn-7kY7*au8NCgAZ(LC(=IS=DNHYN^FUrooCixElW|3a$|55^AQ!InECN-%z;rfK*@ z3E<%2r{TvSM01od7|o$zL!#j&D4{S|teGd^8txJWKr(+AWRH<>{oIcHMJSL!z(7JW z11F+0L^vv0NC%-nrQ%?SD3XAUq|=z9ui2Gyi7Fflffx7#9f4LLJT<>im|2>Qn*B98 zJOEbj0uT&GRDVVUqY_61BaAGVpw1beS8PZ4FHP(;fH6r1_zky#7k}djfcpC&-=yz%xxUNwO$vMy_ z^u0rC_7UwOo$WEJ{iAl>;DBjmpO=m=T!lY&(>3a>Yv{pq1Jh-?h0nGmzrFl+Yk!+vqB$G6wrRTTka-Du 
zQKzH}-$l=cB$5A$G+}MpiE{J1Zr#*&_d$fcF>9Jh4lQRQ#-{jXOfsx<7rhOccE|pA zPpcl^9lf~Nrk?9sle$>>s^q|qqI~-)S*Ctj!2|0jTlPj*v~8lWdNyNS$s5CYNlJkU zDXmPNxg;r|eo-Pf^EAJ6627N`lSNYuK8RZKeyNsZPQj9+TE~0m+t^u^EI}nFy{sM| zth`mCaNygy9j7C9mLB-#;!wHqZ!3SP6qYPvCoMZt*;qa)fPUEK;_rpcKKz*W__=7m zxuv{KKbL1D!>c>?>#`mp6kFp19uppHd1z&-^t$@?ZMD{?{S^W)kEeGlBK<9|*JpQf zOcd!KE1wJAtr=WyPWE1XqNI#F>#Eea-ca8=LK(`Oyw>KTQ4#hf*=>PiY)dR_`P9tD zhF25MRw23kUmrE&6i36c7QS_Ls02!x$<4;;=A`sV*)wdp`T;Yu-R;oco9ktLgm(!F zF@1}+%R28@NvAg>uae$W{M#tJDt+RWtQ|Vbv!Cg@_?zk(uEg!_juf@8slD!zMOiUr z-_*LUj9xwKYg!j|Gu~BM7(7Is!-2vbJ8w+!l4UO8lI&Ft6Etyt|1N-mG z?hKVJfAVnO{-X7EHBI8Ze2ZTn>S&i6)q5h{9vuSug|@+!lqQ;Uqi^C>SL-Ctz4`B_ zMczlPcBrxJTbv4Cl->9#YG(A7*s0@7jV5>$7w5UUL?z7ZIZ1hGpz9+V%(Jg5B|@1ugeKmZsY~ z%%!Cs+u9PYU$AE5-0A1xeQ@>sD$5)$VvSB+Gxx%h6B$-Xlr+DKeCBOIX_MzcenQMY zNw7+vw%uwMw?1ZIs$HdG514{z$s# zp=3wR%5iXTpB1l#EK;&NT`i>tvBkS?!5gX>=Kja#?CBh5Lp~n*G1_kpwa742*YH~P zxngI0p)}>UJ?obqt(iRCeWlR7=5|Hh(Tu$Ar_NU(a%RmBu_?*FODl}$?U2Z2F|?e5 z%R7rnIlsDvXZT2v-AHNl4%W^|Pdc0T$Pkf57WwP> z*s>z6%A3#zyAM{@7`3cCWCtlpT$?@72^M{O&UIjYt!$8B%S(wmmdTA?=-E5_yIU0= zX@OY}qz?6=5pL`rat!Tye}UXvlSm?g5C=Ok6-KBL`(AB>d&mBqEb$u?i}*5O~JXG zNZyNovA8K+7~S{*=iTzgbm-Hzw?m!fTeeby&OW5?XqvLMDyH8pE^E%S8@8N{s@K`8 z^kUtRHP6M8nv6=bna|MI1#Pp+!hZ50+nnijF8eT!{~HbM(P2~bvHhvg;XGa^I%q~i z(`&sb^Q#XHtyZtzr6&-(v{)QEn^zZaa%M06PIt<@m{UVbU0Ul@`At@~V$bbCcqdlw kt~{sS{SS?rU#%ZPuU5(mnptUAK;}mA9NoBQ7y8Bg2VU1%(*OVf literal 0 HcmV?d00001 diff --git a/Advanced/Compression/parts/0 intro.tex b/Advanced/Compression/parts/0 intro.tex new file mode 100644 index 0000000..eda06ca --- /dev/null +++ b/Advanced/Compression/parts/0 intro.tex @@ -0,0 +1,37 @@ +\section{Introduction} + +\definition{} +An \textit{alphabet} is a set of symbols. Two examples are +$\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$ and $\{\texttt{0}, \texttt{1}\}$. + +\definition{} +A \textit{string} is a sequence of symbols from an alphabet. 
\par +For example, \texttt{CBCAADDD} is a string over the alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$. + +\problem{} +Say we want to store a length-$n$ string over the alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$ as a binary blob. \par +How many bits will we need? \par +\hint{ + Our alphabet has four symbols, so we can encode each symbol using two bits, \par + mapping $\texttt{A} \rightarrow \texttt{00}$, + $\texttt{B} \rightarrow \texttt{01}$, + $\texttt{C} \rightarrow \texttt{10}$, and + $\texttt{D} \rightarrow \texttt{11}$. +} + +\begin{solution} + $2n$ bits. +\end{solution} + +\vfill + + +\problem{}<naivelen> +Similarly, we can use a na\"ive coding scheme to encode an $n$-symbol string over an alphabet of size $k$ \par +using $n \times \lceil \log_2k \rceil$ bits. Convince yourself that this is true. + + +\vfill +Of course, this isn't ideal---we can do much better than $n \times \lceil \log_2k \rceil$. +We will spend the rest of this handout exploring more efficient ways of encoding such sequences of symbols. +\pagebreak diff --git a/Advanced/Compression/parts/1 runlength.tex b/Advanced/Compression/parts/1 runlength.tex new file mode 100644 index 0000000..6b4d5e6 --- /dev/null +++ b/Advanced/Compression/parts/1 runlength.tex @@ -0,0 +1,145 @@ +% TODO: +% Basic run-length +% LZ77 + +\section{Run-length Coding} + + +\definition{} +\textit{Entropy} is a measure of information in a certain sequence. \par +A sequence with high entropy contains a lot of information, and a sequence with low entropy contains relatively little. +For example, consider the following two ten-symbol ASCII\footnotemark{} strings: +\begin{itemize} + \item \texttt{AAAAAAAAAA} + \item \texttt{pDa3:7?j;F} +\end{itemize} +The first string clearly contains less information than the second. +It's much harder to describe \texttt{pDa3:7?j;F} than it is \texttt{AAAAAAAAAA}. +Thus, we say that the first has low entropy, and the second has fairly high entropy. 
+ +\vspace{2mm} + +The definition above is intentionally hand-wavy. \par +Formal definitions of entropy exist, but we won't need them today---we just need +an intuitive understanding of the \say{density} of information in a given string. + + +\footnotetext{ + American Standard Code for Information Interchange, an early character encoding for computers. \par + It contains 128 symbols, including numbers, letters, and + \texttt{!"\#\$\%\&`()*+,-./:;<=>?@[\textbackslash]\textasciicircum\_\{|\}\textasciitilde} +} + + +\vspace{5mm} + + +\problem{}<runlenone> +Using a na\"ive coding scheme, encode \texttt{AAAA$\cdot$AAAA$\cdot$BCD$\cdot$AAAA$\cdot$AAAA} as a binary blob. \par +\note[Note]{ + We're still using the four-symbol alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}\}$. \par + Dots ($\cdot$) in the string are drawn for readability. Ignore them. +} + +\begin{solution} + There are eight \texttt{A}s on each end of that string. Mapping symbols as before, \par + we get \texttt{[00 00 00 00 00 00 00 00 01 10 11 00 00 00 00 00 00 00 00]} +\end{solution} + + +\vfill +In \ref{runlenone}---and often, in the real world---the strings we want to encode have fairly low entropy. +We can leverage this fact to develop efficient encoding schemes. + +\example{} +The simplest such coding scheme is \textit{run-length encoding}. Instead of simply listing letters of a string +in their binary form, we'll add a \textit{count} to each letter, compressing repeated sequences of the same symbol. 
+ +\vspace{2mm} + +We'll encode our string into a sequence of 6-bit blocks, interpreted as follows: + +\begin{center} + \begin{tikzpicture} + \node[anchor=west,color=gray] at (-2.3, 0) {Bits}; + \node[anchor=west,color=gray] at (-2.3, -0.5) {Meaning}; + \draw[color=gray] (-2.3, -0.25) -- (5.5, -0.25); + \draw[color=gray] (-2.3, 0.15) -- (-2.3, -0.65); + + \node at (0, 0) {\texttt{0}}; + \node at (1, 0) {\texttt{0}}; + \node at (2, 0) {\texttt{1}}; + \node at (3, 0) {\texttt{1}}; + \node at (4, 0) {\texttt{0}}; + \node at (5, 0) {\texttt{1}}; + + \draw (-0.5, 0.25) -- (5.5, 0.25); + \draw (-0.5, -0.25) -- (5.5, -0.25); + \draw (-0.5, -0.75) -- (5.5, -0.75); + + \draw (-0.5, 0.25) -- (-0.5, -0.75); + \draw (3.5, 0.25) -- (3.5, -0.75); + \draw (5.5, 0.25) -- (5.5, -0.75); + + \node at (1.5, -0.5) {number of copies}; + \node at (4.5, -0.5) {symbol}; + \end{tikzpicture} +\end{center} +So, the sequence \texttt{BBB} will be encoded as \texttt{[0011-01]}. \par +\note[Notation]{Just like spaces, dashes in a binary blob are added for readability.} + +\problem{} +Encode \texttt{AAAA$\cdot$AAAA$\cdot$BCD$\cdot$AAAA$\cdot$AAAA} using this scheme. \par +Is this more or less efficient than \ref{runlenone}? + +\begin{solution} + \texttt{[1000-00 0001-01 0001-10 0001-11 1000-00]} \par + This requires 30 bits, as compared to 38 in \ref{runlenone}. +\end{solution} + +\vfill +\pagebreak + +\problem{} +Is run-length coding always efficient? When does it work well, and when does it fail? + +\vfill + + +\problem{} +Our coding scheme wastes a lot of space when our string has few runs of the same symbol. \par +Fix this problem: modify the scheme so that single occurrences of symbols do not waste space. \par +\hint{We don't need a run length for every symbol. 
We only need one for \textit{repeated} symbols.} + +\begin{solution} + One idea is as follows: \par + \begin{itemize} + \item Encode single symbols na\"ively: \texttt{ABCD} becomes \texttt{[00 01 10 11]} + \item Signal runs using two copies of the same symbol: \texttt{AAAAAA} becomes \texttt{[00 00 0110]}. \par + When our decoder sees two copies of the same symbol, it will interpret the next four bits as + a run length. + \end{itemize} + \texttt{BDC$\cdot$DDDDD$\cdot$AADBDC} will be encoded as \texttt{[01 11 10 11-11-0101 00-00-0010 11 01 11 10]}. +\end{solution} + +\vfill + +\problem{}<firstlz> +Consider the following string: \texttt{ABCD$\cdot$ABCD$\cdot$BABABA$\cdot$ABCD$\cdot$ABCD}. \par +\begin{itemize} + \item How many bits do we need to encode this na\"ively? \par + \item How about with the (unmodified) run-length scheme described above? +\end{itemize} +\hint{You don't need to encode this string---just find the length of its encoded form.} + +\begin{solution} + Na\"ively: \tab $2 \times 22 = 44$ bits \par + Run-length: \tab $6 \times 21 = 126$ bits. Watch out for the two repeated \texttt{A}s! +\end{solution} + + +\vfill + +Neither solution to \ref{firstlz} is ideal. Run-length is very wasteful due to the lack of runs, and na\"ive coding +does not take advantage of repetition in the string. We'll need a better coding scheme. +\pagebreak diff --git a/Advanced/Compression/parts/2 lzss.tex b/Advanced/Compression/parts/2 lzss.tex new file mode 100644 index 0000000..c710993 --- /dev/null +++ b/Advanced/Compression/parts/2 lzss.tex @@ -0,0 +1,155 @@ +\section{LZ Codes} + +The LZ-family\footnotemark{} of codes (LZ77, LZ78, LZSS, LZMA, and others) take advantage of repeated sequences of symbols +in a string. They are the basis of most modern compression algorithms, including DEFLATE, which is used in the ZIP, PNG, +and GZIP formats. + +\footnotetext{ + Named after Abraham Lempel and Jacob Ziv, the original inventors. 
\par + LZ77 is the algorithm described in their first paper on the topic, which was published in 1977. \par + LZ78, LZSS, and LZMA are minor variations on the same general idea. +} + +\vspace{2mm} + +The idea behind LZ is to represent repeated substrings as \textit{pointers} to previous parts of the string. \par +Pointers take the form \texttt{<pos, len>}, where \texttt{pos} is the position of the string to repeat and +\texttt{len} is the number of symbols to copy. + +\vspace{2mm} + +For example, we can encode the string \texttt{ABRACADABRA} as \texttt{[ABRACAD<7, 4>]}. \par +The pointer \texttt{<7, 4>} tells us to look back 7 positions (to the first \texttt{A}), and copy the next 4 symbols. \par +Note that pointers refer to the partially decoded output---\textit{not} to the encoded string. \par +This allows pointers to reference other pointers, and ensures codes like \texttt{A<1,9>} are valid. + +\problem{} +Encode \texttt{ABCD$\cdot$ABCD$\cdot$BABABA$\cdot$ABCD$\cdot$ABCD} using LZ. +Then, decode the following: +\begin{itemize} + \item \texttt{[ABCD<4,4>]} + \item \texttt{[A<1,9>]} + \item \texttt{[DAC<3,5>]} +\end{itemize} + +\begin{solution} + + \texttt{ABCD$\cdot$ABCD$\cdot$BABABA$\cdot$ABCD$\cdot$ABCD} becomes \texttt{[ABCD<4, 4> BA<2,4> ABCD<4,4>]}. + + \linehack{} + + In parts two and three, remember that we're reading the \textit{output string.} \par + The nine \texttt{A}s in part two are produced one by one, \par + with the decoder's \say{read head} following its \say{write head.} + + \begin{itemize} + \item \texttt{ABCD$\cdot$ABCD} + \item \texttt{AAAAA$\cdot$AAAAA} + \item \texttt{DACDACDA} + \end{itemize} +\end{solution} + +\vfill + +\problem{} +Convince yourself that LZ is a generalization of the run-length code we discussed in the previous section. 
+\hint{\texttt{[A<1,9>]} and \texttt{[00-1001]} are the same thing!} + +\remark{} +Note that we left a few things out of this section: we didn't discuss the algorithm that converts a string to an LZ-encoded blob, +nor did we discuss how we should represent strings encoded with LZ in binary. We skipped these details because they are +problems of implementation---they're the engineer's headache, not the mathematician's. If you're interested, a brief explanation is below. +Ask an instructor to explain. + +\begin{center} + \begin{tikzpicture} + \node[anchor=west,color=gray] at (-2.3, 0) {Bits}; + \node[anchor=west,color=gray] at (-2.3, -0.5) {Meaning}; + \draw[color=gray] (-2.3, -0.25) -- (5.5, -0.25); + \draw[color=gray] (-2.3, 0.15) -- (-2.3, -0.65); + + \node at (0, 0) {\texttt{0}}; + \node at (1, 0) {\texttt{0}}; + \node at (2, 0) {\texttt{1}}; + \node at (3, 0) {\texttt{0}}; + \node at (4, 0) {\texttt{1}}; + \node at (5, 0) {\texttt{1}}; + \node at (6, 0) {\texttt{0}}; + \node at (7, 0) {\texttt{0}}; + \node at (8, 0) {\texttt{1}}; + + \draw (-0.5, 0.25) -- (8.5, 0.25); + \draw (-0.5, -0.25) -- (8.5, -0.25); + \draw (-0.5, -0.75) -- (8.5, -0.75); + + \draw (-0.5, 0.25) -- (-0.5, -0.75); + \draw (0.5, 0.25) -- (0.5, -0.75); + \draw (8.5, 0.25) -- (8.5, -0.75); + + \node at (0, -0.5) {flag}; + \node at (4.5, -0.5) {if flag \texttt{<pos, len>}, else eight-bit symbol}; + \end{tikzpicture} +\end{center} + + +\begin{center} + \begin{tikzpicture} + % Text tape + \node[color=gray] at (-0.75, 0) {\texttt{...}}; + \node[color=gray] at (0.0, 0) {\texttt{D}}; + \node at (0.5, 0) {\texttt{A}}; + \node at (1.0, 0) {\texttt{B}}; + \node at (1.5, 0) {\texttt{C}}; + \node at (2.0, 0) {\texttt{D}}; + \node at (2.5, 0) {\texttt{A}}; + \node at (3.0, 0) {\texttt{B}}; + \node at (3.5, 0) {\texttt{C}}; + \node at (4.0, 0) {\texttt{D}}; + \node[color=gray] at (4.5, 0) {\texttt{B}}; + \node[color=gray] at (5.0, 0) {\texttt{D}}; + \node[color=gray] at (5.5, 0) {\texttt{A}}; + \node[color=gray] at 
(6.0, 0) {\texttt{C}}; + \node[color=gray] at (6.75, 0) {\texttt{...}}; + + \draw (-1.75, 0.25) -- (7.25, 0.25); + \draw (-1.75, -0.25) -- (7.25, -0.25); + + + \draw[line width = 0.7mm, color=oblue, dotted] (2.25, 0.5) -- (2.25, -0.5); + \draw[line width = 0.7mm, color=oblue] + (-1.25, 0.5) + -- (4.25, 0.5) + -- (4.25, -0.5) + -- (-1.25, -0.5) + -- cycle + ; + + \draw + (4.2, -0.625) + -- (4.2, -0.75) + to node[anchor=north, midway] {lookahead} (2.3, -0.75) + -- (2.3, -0.625) + ; + + \draw + (2.2, -0.625) + -- (2.2, -0.75) + to node[anchor=north, midway] {search buffer} (-1.1, -0.75) + -- (-1.1, -0.625) + ; + + \draw[color=gray] + (2.2, 0.625) + -- (2.2, 0.75) + to node[anchor=south, midway] {match!} (0.3, 0.75) + -- (0.3, 0.625) + ; + + %\draw[->, color=gray] (2.5, 0.3) -- (2.5, 0.8) to[out=90,in=90] (0.5, 0.8); + \node at (7.0, -0.75) {Result: \texttt{[$\cdot\cdot\cdot$DABCD<4,4>$\cdot\cdot\cdot$]}}; + \end{tikzpicture} +\end{center} + + +\vfill +\pagebreak \ No newline at end of file diff --git a/Advanced/Compression/parts/3 huffman.tex b/Advanced/Compression/parts/3 huffman.tex new file mode 100644 index 0000000..5824fc6 --- /dev/null +++ b/Advanced/Compression/parts/3 huffman.tex @@ -0,0 +1,27 @@ +\section{Huffman Codes} + + +\remark{} +As a first example, consider the alphabet $\{\texttt{A}, \texttt{B}, \texttt{C}, \texttt{D}, \texttt{E}\}$. \par +With a na\"ive coding scheme, we can encode a length-$n$ string with $3n$ bits, by mapping... +\begin{itemize} + \item $\texttt{A}$ to $\texttt{000}$ + \item $\texttt{B}$ to $\texttt{001}$ + \item $\texttt{C}$ to $\texttt{010}$ + \item $\texttt{D}$ to $\texttt{011}$ + \item $\texttt{E}$ to $\texttt{100}$ +\end{itemize} +With this scheme, the string \texttt{ADEBCE} becomes \texttt{[000 011 100 001 010 100]}. \par +This matches what we computed in \ref{naivelen}: ~ $6 \times \lceil \log_2(5) \rceil = 6 \times 3 = 18$. 
\par +\note[Notation]{ + The spaces in \texttt{[000 011 100 001 010 100]} are provided for convenience. \par + This is equivalent to \texttt{[000011100001010100]}, but is easier to read. \par + In this handout, encoded binary blobs will always be written in square brackets. +} + +\vspace{2mm} + +You could argue that this coding scheme is wasteful: we're not using three of the eight possible three-bit sequences! + +\vfill +\pagebreak \ No newline at end of file diff --git a/Advanced/Compression/tikzset.tex b/Advanced/Compression/tikzset.tex new file mode 100644 index 0000000..d83fa32 --- /dev/null +++ b/Advanced/Compression/tikzset.tex @@ -0,0 +1,65 @@ +\usetikzlibrary{arrows.meta} +\usetikzlibrary{shapes.geometric} +\usetikzlibrary{patterns} + +% We put nodes in a separate layer, so we can +% slightly overlap with paths for a perfect fit +\pgfdeclarelayer{nodes} +\pgfdeclarelayer{path} +\pgfsetlayers{main,nodes} + +% Layer settings +\tikzset{ + % Layer hack, lets us write + % layer = * in scopes. + layer/.style = { + execute at begin scope={\pgfonlayer{#1}}, + execute at end scope={\endpgfonlayer} + }, + % + % Arrowhead tweak + >={Latex[ width=2mm, length=2mm ]}, + % + % Labels inside edges + label/.style = { + rectangle, + % For automatic red background in solutions + fill = \ORMCbgcolor, + draw = none, + rounded corners = 0mm + }, + % + % Nodes + main/.style = { + draw, + circle, + fill = white, + line width = 0.35mm + }, + % + % Loop tweaks + loop above/.style = { + min distance = 2mm, + looseness = 8, + out = 45, + in = 135 + }, + loop below/.style = { + min distance = 5mm, + looseness = 10, + out = 315, + in = 225 + }, + loop right/.style = { + min distance = 5mm, + looseness = 10, + out = 45, + in = 315 + }, + loop left/.style = { + min distance = 5mm, + looseness = 10, + out = 135, + in = 215 + } +} \ No newline at end of file