handouts/Fast Inverse Root/parts/4 float.tex


\section{\texttt{int}s and \texttt{float}s}

\definition{}
A \textit{signed 32-bit integer} (equivalently, a \texttt{long int}) consists of thirty-two binary digits, \par
and is used to represent a subset of the integers.

\vspace{2mm}

The first bit of a \texttt{long} tells us its sign:
\begin{itemize}
	\item if the first bit of a \texttt{long} is \texttt{1}, it represents a negative number;
	\item if the first bit is \texttt{0}, it represents a positive number.
\end{itemize}
We do not need negative numbers today, so we will assume that the first bit is always zero. \par
\note{If you'd like to know how negative integers are written, look up \say{two's complement} after class.}

\vspace{2mm}

We'll denote binary strings with the prefix \texttt{0b}. \par
Underscores are added between every eight digits for readability, and have no meaning.

\vspace{2mm}


The value of a positive signed \texttt{long} is simply the value of its binary digits. \par
For example:
\begin{itemize}
	\item $\texttt{0b00000000\_00000000\_00000000\_00000000} = 0$
	\item $\texttt{0b00000000\_00000000\_00000000\_00000011} = 3$
	\item $\texttt{0b00000000\_00000000\_00000000\_00100000} = 32$
	\item $\texttt{0b00000000\_00000000\_00000000\_10000010} = 130$
\end{itemize}

Remember---we only need positive integers today. Assume the \say{sign} bit is always \texttt{0}.

\problem{}
What is the largest number that can be represented with a \texttt{long}?

\begin{solution}
	$\texttt{0b01111111\_11111111\_11111111\_11111111} = 2^{31} - 1$
\end{solution}

\vfill


\problem{}
What is the smallest possible number that can be represented with a \texttt{long}? \par
\hint{
	You do not need to know \textit{how} negative numbers are represented. \par
	Assume that we do not skip any integers, and don't forget about zero.
}

\begin{solution}
	There are $2^{32}$ possible 32-bit patterns,
	of which 1 represents zero and $2^{31} - 1$ represent positive numbers.

	\vspace{2mm}

	We therefore have access to $2^{32} - 1 - (2^{31} - 1) = 2^{31}$ negative numbers,
	giving us a minimum representable value of $-2^{31}$.
\end{solution}

\problem{}
What are the values of the following longs?
\begin{itemize}
	\item \texttt{0b00000000\_00000000\_00000101\_00111001}
	\item \texttt{0b00000000\_00000000\_00000001\_00101100}
	\item \texttt{0b00000000\_00000000\_00000100\_10110000}
\end{itemize}
\hint{The third conversion is easy---look carefully at the second.}

\begin{solution}
	\begin{itemize}
		\item $\texttt{0b00000000\_00000000\_00000101\_00111001} = 1337$
		\item $\texttt{0b00000000\_00000000\_00000001\_00101100} = 300$
		\item $\texttt{0b00000000\_00000000\_00000100\_10110000} = 1200$
	\end{itemize}

	Notice that the third long is the second shifted left twice (i.e., multiplied by 4).
\end{solution}

\vfill
\pagebreak


\definition{}
A \textit{signed 32-bit floating-point decimal} (equivalently, a \textit{float})
consists of 32 binary digits, and is used to represent a subset of the real numbers.
These 32 bits are interpreted as follows:

\begin{center}
	\begin{tikzpicture}


		\node[anchor=south west] at (0, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (0.25, 0) {\texttt{\texttt{b}}};
		\node[anchor=south west] at (0.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (0.75, 0) {\texttt{\texttt{\_}}};

		\node[anchor=south west] at (1.00, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (1.25, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (1.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (1.75, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (2.00, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (2.25, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (2.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (2.75, 0) {\texttt{\texttt{0}}};

		\node[anchor=south west] at (3.00, 0) {\texttt{\texttt{\_}}};
		\node[anchor=south west] at (3.25, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (3.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (3.75, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (4.00, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (4.25, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (4.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (4.75, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (5.00, 0) {\texttt{\texttt{\_}}};

		\node[anchor=south west] at (5.25, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (5.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (5.75, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (6.00, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (6.25, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (6.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (6.75, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (7.00, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (7.25, 0) {\texttt{\texttt{\_}}};

		\node[anchor=south west] at (7.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (7.75, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (8.00, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (8.25, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (8.50, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (8.75, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (9.00, 0) {\texttt{\texttt{0}}};
		\node[anchor=south west] at (9.25, 0) {\texttt{\texttt{0}}};


		\draw (0.50, 0) -- (0.95, 0) node [midway, below=1mm] {sign};
		\draw (1.05, 0) -- (3.15, 0) node [midway, below=1mm] {exponent};
		\draw (3.30, 0) -- (9.70, 0) node [midway, below=1mm] {fraction};
	\end{tikzpicture}
\end{center}

In other words:
\begin{itemize}[itemsep = 1mm]
	\item The first bit denotes the sign of the float's value. We'll label it $s$. \par
	If $s = 1$, this float is negative; if $s = 0$, it is positive.

	\item The next 8 bits represent the \textit{exponent} of this float. \par
	We'll call the value of these eight bits $E$. \par
	Naturally, $0 \leq E \leq 255$

	\item The remaining 23 bits represent the \textit{fraction} of this float. \par
	These 23 bits are interpreted as the fractional part of a binary decimal. \par
	For example, the bits \texttt{0b1010000\_00000000\_00000000} represent $0.5 + 0.125 = 0.625$.
\end{itemize}


\vspace{2mm}

The final value of a float with sign $s$, exponent $E$, and fraction $F$ is
\begin{equation*}
	(-1)^s ~\times~ 2^{E - 127} ~\times~ \left(1 + \frac{F}{2^{23}}\right)
\end{equation*}


Notice that this is very similar to decimal scientific notation, which is written as
\begin{equation*}
	(\pm 1) ~\times~ 10^{e} ~\times~ (f)
\end{equation*}

\vfill
\pagebreak

\problem{}
What is the value of \texttt{0b01000001\_10101000\_00000000\_00000000} if it is interpreted as a float? \par
\hint{$21 \div 16 = 1.3125$}

\begin{solution}
	This is 21:
	\begin{align*}
		&=~ 2^{131 - 127} \times \biggl(1 + \frac{2^{21} + 2^{19}}{2^{23}}\biggr) \\
		&=~ 2^{4} \times (1 + 0.25 + 0.0625) \\
		&=~ 16 \times (1.3125) \\
		&=~ 21
	\end{align*}
\end{solution}

\vfill

\problem{}
Encode $12.5$ as a float. \par
\hint{$12.5 \div 8 = 1.5625$}

\vspace{2mm}

What is the value of the resulting 32 bits if they are interpreted as a long? \par
\hint{A sum of powers of two is fine.}

\begin{solution}
	\begin{align*}
		12.5
		&=~ 8 \times 1.5625 \\
		&=~ 2^{3} \times \biggl(1 + (0.5 + 0.0625)\biggr) \\
		&=~ 2^{130 - 127} \times \biggl(1 + \frac{2^{22} + 2^{19}}{2^{23}}\biggr)
	\end{align*}

	\linehack{}

	This is \texttt{0b01000001\_01001000\_00000000\_00000000}, \par
	which is $2^{30} + 2^{24} + 2^{22} + 2^{19} = 1{,}095{,}237{,}632$
\end{solution}


\vfill
\pagebreak