From 850928f9a92703dd469e90d6e2ad8a69e7b9a8b6 Mon Sep 17 00:00:00 2001 From: Mark Date: Sun, 29 Jan 2023 22:10:13 -0800 Subject: [PATCH] Added regex handout --- Misc/Warm-Ups/regex.tex | 145 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 Misc/Warm-Ups/regex.tex diff --git a/Misc/Warm-Ups/regex.tex b/Misc/Warm-Ups/regex.tex new file mode 100644 index 0000000..4eb6be7 --- /dev/null +++ b/Misc/Warm-Ups/regex.tex @@ -0,0 +1,145 @@ +% use [nosolutions] flag to hide solutions. +% use [solutions] flag to show solutions. +\documentclass[ + nosolutions +]{../../resources/ormc_handout} + + +\usepackage{xcolor} +\usepackage{soul} + +\definecolor{Light}{gray}{.90} +\sethlcolor{Light} +\newcommand{\htexttt}[1]{\texttt{\hl{#1}}} + +\begin{document} + + \maketitle + {The Regex Warm-Up} + {Prepared by Mark on \today} + + Yesterday we discussed Deterministic Finite Automata. One interesting application of these mathematical objects is pattern matching, usually in the form of Regular Expressions. \\ + (abbreviated \say{regex}, which is pronounced like \say{gif}) + + \vspace{2mm} + + Regex is a language used to specify patterns in a string. You can think of it as a concise way to define a DFA, using text instead of a huge graph. \\ + + Often enough, a clever regex pattern can do the work of a few hundred lines of code. \\ + + \vspace{2mm} + + Like the DFAs we have studied, a regex pattern \textit{accepts} or \textit{rejects} a string. However, we don't usually use this terminology when discussing regex, instead opting to say a pattern \textit{matches} or \textit{doesn't match} a string. \\ + + \vspace{5mm} + + \textbf{Quantifiers} \\ + Quantifiers tell us how many of a character to match. 
\\ + There are four of them: + \htexttt{+}, \htexttt{*}, \htexttt{?}, and \htexttt{\{ \}} + + \vspace{2mm} + + \htexttt{+} means \say{match one or more of the preceding token} \\ + \htexttt{*} means \say{match zero or more of the preceding token} \\ + + For example, the pattern \htexttt{ca+t} will match the following strings: + \begin{itemize} + \item \texttt{cat} + \item \texttt{caat} + \item \texttt{caaaaaaaat} + \end{itemize} + \htexttt{ca+t} will \textbf{not} match the string \texttt{ct}. \\ + The pattern \htexttt{ca*t} will match all the strings above, including \texttt{ct}. + \vspace{2mm} + + + \htexttt{?} means \say{match one or none of the preceding token} \\ + The pattern \htexttt{linea?r} will match only \texttt{linear} and \texttt{liner}. \\ + \vspace{2mm} + + Braces \htexttt{\{min, max\}} are the most flexible quantifier. \\ + They specify exactly how many tokens to match: \\ + \htexttt{ab\{2\}a} will match only \texttt{abba}. \\ + \htexttt{ab\{1,3\}a} will match \texttt{aba}, \texttt{abba}, \texttt{abbba}. \\ + \htexttt{ab\{2,\}a} will match any \texttt{ab...ba} with at least two \texttt{b}s. + + \vspace{5mm} + + \problem{} + Write the patterns \htexttt{a*} and \htexttt{a+} using only \htexttt{\{ \}}. + \vfill + + \problem{} + Draw a DFA equivalent to the regex pattern \htexttt{01*0}. + \vfill + + \pagebreak + + + + + + + \textbf{Characters, Sets, and Groups} + + Characters tell us what to match. + + Usually we specify them literally, as shown above: \\ + \htexttt{a+} means \say{one or more \texttt{a} characters} \\ + + \vspace{2mm} + + There are, however, other ways we can specify characters. \\ + + \vspace{2mm} + + The first such way is the \textit{set}, denoted \htexttt{[ ]}. A set can pretend to be any character inside it. \\ + For example, \htexttt{m[aoy]th} will match \texttt{math}, \texttt{moth}, or \texttt{myth}. \\ + \htexttt{a[01]+b} will match \texttt{a0b}, \texttt{a111b}, \texttt{a1100110b}, and any other similar string. 
\\ + + \vspace{2mm} + + If we want to keep characters together, we can use the \textit{group}, denoted \htexttt{( )}. \\ + + Groups work exactly as you'd expect, representing an atomic\footnotemark{} group of characters. \\ + \htexttt{a(01)+b} will match \texttt{a01b} and \texttt{a010101b}, but will \textbf{not} match \texttt{a0b}, \texttt{a1b}, or \texttt{a1100110b}. \\ + + \footnotetext{In other words, \say{unbreakable}} + + + \problem{}\label{regex} + You are now familiar with most of the tools regex has to offer. \\ + Match the following strings: + \begin{enumerate} + \item An ISO-8601 date, like \texttt{2022-10-29}. \\ + Invalid dates like \texttt{2022-13-29} should also be matched. \\ + + \item A hexadecimal integer of any length. + \item A UCLA room number, like \texttt{MS 5118} or \texttt{Kinsey 1220B}. + \item Any ISBN-10 of the form \texttt{0-316-00395-7}. \\ + Remember that the check digit can be an \texttt{X}. \\ + Dashes are optional. + \end{enumerate} + + + + \vfill + + + + + + \problem{} + If you'd like to know more, check out \texttt{regexr.com}. There's an interactive regex prompt that provides explanations, as well as a cheatsheet that explains every regex token there is. You can find a nice set of challenges at \texttt{http://regex.alf.nu} \\ + I especially encourage you to look into this if you are interested in computer science. + \pagebreak + + \problem{} + Draw a DFA for each of the following regex strings. \\ + \begin{itemize} + \item Your solution to \ref{regex}, Part 2 + \item Your solution to \ref{regex}, Part 3 + \item Your solution to \ref{regex}, Part 4 + \end{itemize} +\end{document} \ No newline at end of file