% PS239T.tex

% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames*,x11names*}{xcolor}
%
\documentclass[
]{book}
\usepackage{lmodern}
\usepackage{amssymb,amsmath}
% Engine-dependent font setup.
% ifxetex/ifluatex supply the \ifxetex and \ifluatex switches. The \ifnum test
% below assembles a number from them: a leading 0, followed by a 1 for each
% switch that is true. The result equals 0 only when neither switch is true,
% i.e. when the document is compiled with pdfTeX.
\usepackage{ifxetex,ifluatex}
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
  % 8-bit engine: select the T1 font encoding and declare UTF-8 input.
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
  \usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
  % Unicode engines: unicode-math for math fonts, then default font features
  % (match lowercase height globally; TeX ligatures and scale 1 for \rmfamily).
  \usepackage{unicode-math}
  \defaultfontfeatures{Scale=MatchLowercase}
  \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
  \usepackage[]{microtype}
  \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph layout: separate paragraphs by vertical space instead of indentation.
% Three cases are handled:
%   - non-KOMA class with parskip.sty installed: load the parskip package;
%   - non-KOMA class without parskip.sty: set \parindent and \parskip by hand;
%   - KOMA class (\KOMAClassName is defined): use the class option parskip=half.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
  \IfFileExists{parskip.sty}{%
    \usepackage{parskip}
  }{% else
    \setlength{\parindent}{0pt}
    \setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
  \KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
% PDF metadata (title, author, creator) and coloured hyperlinks.
% Internal and file links use Maroon; citation and URL links use Blue.
% NOTE(review): these colour names are presumably resolved through the named
% colour sets requested for xcolor at the top of the file -- confirm if the
% xcolor options are ever changed.
\hypersetup{
  pdftitle={Computational Thinking for Social Scientists},
  pdfauthor={Jae Yeon Kim},
  colorlinks=true,
  linkcolor=Maroon,
  filecolor=Maroon,
  citecolor=Blue,
  urlcolor=Blue,
  pdfcreator={LaTeX via pandoc}}
\urlstyle{same} % disable monospaced font for URLs
\usepackage{color}
% ---------------------------------------------------------------------------
% Syntax highlighting support for the code listings in the body.
% ---------------------------------------------------------------------------
\usepackage{fancyvrb}
% \VerbBar expands to a literal vertical bar; \VERB is fancyvrb's \Verb with
% backslash and braces kept active (commandchars) so markup works inside it.
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
% Highlighting: a Verbatim environment in which \, { and } keep their command
% meaning, so the \...Tok macros defined below can be used inside listings.
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
% Shaded: wraps a listing in framed's snugshade environment; the background
% colour is the one named shadecolor, defined here as RGB 248,248,248.
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% One macro per token class. Each takes the token text as #1 and applies a
% colour and/or bold/italic shape; classes defined as plain #1 (BuiltIn,
% Extension, Import, Normal, RegionMarker) are typeset unstyled.
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\usepackage{longtable,booktabs}
% Correct order of tables after \paragraph or \subparagraph
% \patchcmd (etoolbox) rewrites the first \par inside \longtable so that, when
% \if@noskipsec is true, an empty \mbox is emitted before the \par. The two
% empty trailing arguments mean nothing extra runs whether or not the patch
% applies.
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
% Prefer footnotehyper when it is installed, otherwise fall back to footnote;
% \makesavenoteenv then registers longtable as a footnote-saving environment.
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
% \maxwidth and \maxheight expand to the image's natural size
% (\Gin@nat@width / \Gin@nat@height) unless that size exceeds \linewidth /
% \textheight, in which case they expand to that limit instead.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
% keepaspectratio ensures the two limits never distort the image.
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
% (\fps@figure holds the placement used by figure environments that give no
% explicit specifier.)
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist is issued at the start of compact lists in the body; it zeroes
% \itemsep and \parskip. \providecommand leaves any earlier definition intact.
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
% Number sectioning levels down to depth 5.
\setcounter{secnumdepth}{5}
\usepackage{booktabs}
\usepackage{amsthm}
% Redefine \thm@space@setup (an @-protected internal, hence \makeatletter) so
% that theorem-like environments get 8pt of stretchable space above
% (\thm@preskip) and the same amount below (\thm@postskip).
% NOTE(review): this relies on amsthm, loaded just above, consulting
% \thm@space@setup when it sets up theorem spacing -- confirm against amsthm.
\makeatletter
\def\thm@space@setup{%
  \thm@preskip=8pt plus 2pt minus 4pt
  \thm@postskip=\thm@preskip
}
\makeatother
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
% fontspec requires XeTeX or LuaTeX and aborts with a fatal error under pdfTeX.
% The preamble otherwise supports pdfTeX (see the engine test near the top of
% the file), so load fontspec only on the Unicode engines. The \ifnum test
% reads 01 (> 0) when \ifxetex or \ifluatex is true and 0 otherwise.
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi>0
  \usepackage{fontspec}
\fi
\usepackage{multicol}
\usepackage{hhline}
\usepackage{hyperref}
\usepackage[]{natbib}
\bibliographystyle{apalike}

\title{Computational Thinking for Social Scientists}
\author{\href{https://jaeyk.github.io/}{Jae Yeon Kim}}
\date{2022-01-30}

\begin{document}
\maketitle

{
\hypersetup{linkcolor=}
\setcounter{tocdepth}{1}
\tableofcontents
}
\hypertarget{hello-world}{%
\chapter{Hello World}\label{hello-world}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{print}\NormalTok{(}\StringTok{"Hello, World!"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World!"
\end{verbatim}

\begin{quote}
Make simple things simple, and complex things possible. - \href{https://www.quora.com/What-is-the-story-behind-Alan-Kay-s-adage-Simple-things-should-be-simple-complex-things-should-be-possible}{Alan Kay}
\end{quote}

This is the website for \emph{Computational Thinking for Social Scientists}. This open-access book intends to help social scientists think computationally and develop proficiency with computational tools and techniques to research computational social science. Mastering these tools and techniques not only enables social scientists to collect, wrangle, analyze, and interpret data with less pain and more fun, but also lets them work on research projects that would previously have seemed impossible.

Horace Mann, the first great American advocate of public education, claimed that ``Education, then, beyond all other devices of human origin, is a great equalizer of conditions of men---the balance wheel of the social machinery.'' I believe in this potential of education; however, I also fully acknowledge that quality education is not accessible equally. Often, the gap between education and technology is greater among historically disadvantaged groups than advantaged groups. As an educator, I offer this book as my small contribution to making this democratic vision of education possible, at least in the emerging field of computational social science.

That said, this book is not intended to be a comprehensive guide for computational social science or any particular programming language, computational tool, or technique. If you are interested in a general introduction to computational social science, I highly recommend \href{http://www.princeton.edu/~mjs3/}{Matthew Salganik}'s \href{https://www.bitbybitbook.com/}{Bit By Bit (2017)}. Salganik's book is comprehensive, accessible, and pedagogically friendly.

The book comprises two main subjects (fundamentals and applications) and eight main sessions.

\hypertarget{part-i-fundamentals}{%
\section{Part I Fundamentals}\label{part-i-fundamentals}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  \protect\hyperlink{motivation}{Why computational thinking}
\item
  \protect\hyperlink{git_bash}{Best practices in data and code management using Git and Bash}
\item
  \protect\hyperlink{tidy_data}{How to wrangle, model, and visualize data easier and faster}
\item
  \protect\hyperlink{functional_programming}{How to use functional programming to automate repeated things}
\item
  \protect\hyperlink{products}{How to develop data products (e.g., packages and shiny apps)}
\end{enumerate}

\hypertarget{part-ii-applications}{%
\section{Part II Applications}\label{part-ii-applications}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{5}
\item
  \protect\hyperlink{semi_structured_data}{How to collect and parse semi-structured data at scale (e.g., using APIs and web scraping)}
\item
  \protect\hyperlink{machine_learning}{How to analyze high-dimensional data (e.g., text) using machine learning}
\item
  \protect\hyperlink{big_data}{How to access, query, and manage big data using SQL}
\end{enumerate}

The book teaches how to do all of these, mostly in \href{https://www.r-project.org/about.html}{\textbf{R}}, and sometimes in \href{https://www.gnu.org/software/bash/}{\textbf{bash}} and \href{https://www.python.org/about/}{\textbf{Python}}.

\begin{itemize}
\item
  Why R? R is free, easy to learn (thanks to \href{https://www.tidyverse.org/}{\texttt{tidyverse}} and \href{https://rstudio.com/}{RStudio}), fast (thanks to \href{https://cran.r-project.org/web/packages/Rcpp/index.html}{\texttt{Rcpp}}), runs everywhere (Mac/Windows/Linux), open (16,000+ packages; counting only ones \href{https://cran.r-project.org/web/packages/}{available at CRAN}), and has a growing, large, and inclusive community (\href{https://twitter.com/search?q=\%23rstats\&src=typed_query}{\texttt{\#rstats}}).
\item
  Why R + Python + bash?

  \begin{quote}
  \begin{quote}
  ``For R and Python, Python is first and foremost a programming language. And that has a lot of good features, but it tends to mean, that if you are going to do data science in Python, you have to first learn how to program in Python. Whereas I think you are going to get up and running faster with R, than with Python because there's just a bunch more stuff built in and you don't have to learn as many programming concepts. You can focus on being a great political scientist or whatever you do and learning enough R that you don't have to become an expert programmer as well to get stuff done.'' - Hadley Wickham
  \end{quote}
  \end{quote}

  \begin{itemize}
  \tightlist
  \item
    However, this feature of the R community also raises a challenge.
  \end{itemize}

  \begin{quote}
  \begin{quote}
  Compared to other programming languages, the R community tends to be more focused on results instead of processes. Knowledge of software engineering best practices is patchy: for instance, not enough R programmers use source code control or automated testing. Inconsistency is rife across contributed packages, even within base R. You are confronted with over 20 years of evolution every time you use R. R is not a particularly fast programming language, and poorly written R code can be terribly slow. R is also a profligate user of memory. - Hadley Wickham
  \end{quote}
  \end{quote}

  \begin{itemize}
  \item
    RStudio, especially the tidyverse team, has made heroic efforts to overcome the limitations mentioned above. Readers will learn these recent advances in the R ecosystem and complement R with Python and Bash.
  \item
    Nevertheless, if you're serious about programming, I highly recommend learning Python. Learning Python also helps you fill gaps in software engineering that could be useful to be highly proficient in R.
  \end{itemize}
\end{itemize}

\hypertarget{special-thanks}{%
\section{Special thanks}\label{special-thanks}}

This book is collected as much as it is authored. It is a remix version of \href{https://github.com/rochelleterman/PS239T}{PS239T}, a graduate-level computational methods course at UC Berkeley, originally developed by \href{http://rochelleterman.com/}{Rochelle Terman} (Assistant Professor of Political Science, Chicago) then revised by \href{http://rachelbernhard.com/}{Rachel Bernhard} (Assistant Professor of Political Science, UC Davis). I have taught \href{https://github.com/PS239T/spring_2021}{PS239T} as lead instructor in Spring 2019 and TA in Spring 2018 and taught it with \href{https://nicholaskuipers.com/}{Nick Kuipers} (Postdoc, Stanford) in Spring 2020. Other teaching materials draw from the workshops I have created for \href{https://dlab.berkeley.edu/}{D-Lab} and \href{https://data.berkeley.edu/research/discovery-program-home}{Data Science Discovery Program} at UC Berkeley and \href{https://sicss.io/2021/howard-mathematica/}{the Summer Institute in Computational Social Science hosted by Howard University and Mathematica}. I also have cited all the other references whenever I am aware of related books, articles, slides, blog posts, or YouTube video clips.

\hypertarget{suggestions-questions-or-comments}{%
\section{Suggestions, questions, or comments}\label{suggestions-questions-or-comments}}

Please feel free to \href{https://github.com/jaeyk/PS239T/issues}{create issues}; if you find typos, errors, missing citations, please report them via the GitHub repository associated with this book.

\hypertarget{license}{%
\section{License}\label{license}}

\includegraphics{https://licensebuttons.net/l/by/4.0/88x31.png} This work is licensed under a \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International License}.

\hypertarget{motivation}{%
\chapter{Computational thinking}\label{motivation}}

\hypertarget{why-computational-thinking}{%
\section{Why computational thinking}\label{why-computational-thinking}}

If social scientists want to know how to work smart and not just hard, they need to take full advantage of the power of modern programming languages, and that power is \textbf{automation}.

Let's think about the following two cases (these examples come from \href{https://dlab.berkeley.edu/blog/why-teaching-social-scientists-how-code-professional-important}{the column} I contributed to the D-Lab website)

\begin{itemize}
\tightlist
\item
  Case 1: Suppose a social scientist needs to collect data on civic organizations in the United States from websites, Internal Revenue Service reports, and social media posts. As the number of these organizations is large, the researcher could not collect a large volume of data from diverse sources, so they would hire undergraduates and distribute tasks. This is a typical data collection plan in social science research, and it is labor-intensive. Automation is not part of the game plan. Yet, it is critical for so many reasons. Because the process is costly, no one is likely to replicate or update the data collection effort.
\end{itemize}

Case 1 illustrates that it is challenging to be reproducible and scalable without efficient data analytics pipelines.

\begin{itemize}
\tightlist
\item
  Case 2: An alternative is to write computer programs that collect such data automatically, parse them, and store them in interconnected databases. Additionally, someone may need to maintain and validate the quality of the data infrastructure. Nevertheless, this approach lowers the cost of the data collection process, thereby substantially increasing the \textbf{reproducibility} and \textbf{scalability}. Furthermore, the researcher can document their code and publicly share it using their GitHub repository or even gather some of the functions they used and distribute them as open-source libraries.
\end{itemize}

Case 2 illustrates the power of automation and how it benefits the academic community and the general public.

To reap these benefits, one needs to learn how to program. In the era of data science, programming is as valuable a skill as writing in social science research because the extent to which a researcher can automate the research process can determine its efficiency, reproducibility, and scalability.

Below is an insightful quote from Hadley Wickham, who won the 2019 COPSS Presidents' Award for his outstanding contribution to statistics via developing tidyverse R packages that have transformed how people wrangle, analyze, and visualize data. Even if you don't do big data or machine learning, you can still benefit from learning how to program and applying the programming skills to data analysis because it's a ``force multiplier.''

\begin{quote}
Every modern statistical and data analysis problem needs code to solve it. You shouldn't learn just the basics of programming, spend some time gaining mastery. Improving your programming skills pays off because code is a \textbf{force multiplier}: once you've solved a problem once, code allows you to solve it much faster in the future. As your programming skill increases, the generality of your solutions improves: you solve not just the precise problem you encountered, but a wider class of related problems (in this way programming skill is very much like mathematical skill). Finally, sharing your code with others allows them to benefit from your experience. - \href{https://imstat.org/2014/12/16/hadley-wickham-impact-the-world-by-being-useful/}{Hadley Wickham}
\end{quote}

However, I also do not claim that social scientists should learn programming like software engineers learn the subject. For social scientists, programming is a means, not an end. I encourage readers to think about what aspects of the social science research process can be automated. Again, programming is just a way to teach a machine to perform these tasks and get them done.

\begin{figure}
\centering
\includegraphics{https://bam.files.bbci.co.uk/bam/live/content/znmb87h/large}
\caption{From BBC Bitesize}
\end{figure}

Teaching a computer to perform a particular task requires computational thinking: ``formulating a problem and expressing its solution in a way that a computer---human or machine---can effectively carry out'' (defined by \href{http://www.cs.cmu.edu/afs/cs/usr/wing/www/publications/Wing06.pdf}{Jeannette M. Wing}). Specifically, this means readers need to get familiar with how computers think about data and handle them.

\hypertarget{how-to-teach-and-learn-computational-thinking}{%
\section{How to teach and learn computational thinking}\label{how-to-teach-and-learn-computational-thinking}}

This book teaches how you learn this art in incremental steps.

\begin{itemize}
\item
  From graphic user interface to command-line interface (ch 3)
\item
  From short programs to long programs (ch 4-5)
\item
  The ultimate goal is to solve complex problems at scale using computation (ch 6-7)
\end{itemize}

I will cover programming concepts, but I will emphasize practicing them more. As the following quote from John Chambers indicates, this approach helps you learn computational thinking and apply it in particular contexts by coding and solving problems.

\begin{quote}
``{[}W{]}e wanted users to be able to begin in an interactive environment, where they did not consciously think of themselves as programming. Then as their needs became clearer and their sophistication increased, they should be able to slide gradually into programming, when the language and system aspects would become more important.'' - \emph{Stages in the Evolution of S} by John Chambers (S is the progenitor of R)
\end{quote}

Here are also some valuable reminders.

\begin{itemize}
\item
  Beginners! Learning programming is a long game. The essential component of learning (for almost any subject) is consistency. Never stop writing code, even though your current code may fall far short of perfection.
  \includegraphics{misc/wickham.png}
\item
  Intermediate programmers! Try to empower, not intimidate, newbies. The most important rule in the computational social science community (at least, in my opinion) is being nice. Please read David Robinson's \href{http://varianceexplained.org/programming/bad-code/}{``A Million Lines of Bad Code''} for more insights.
\end{itemize}

\includegraphics{http://imgs.xkcd.com/comics/code_quality.png}

Finally, have fun. I've talked about how learning programming pays off. But I've taught long enough to know that this will not convince people to learn to program, especially those who've had negative experiences learning STEM.

Instead, I will try to make the materials as accessible as possible by emphasizing the following two ideas in teaching: showing the \textbf{BIG PICTURE} and walking through the \textbf{WORKFLOW.} With \href{https://media.illinois.edu/margaret-yee-man-ng}{Margaret Ng} (Assistant Professor of Journalism, UIUC), I wrote about why these two concepts are pedagogically important for teaching computational social science for all. \href{https://osf.io/preprints/socarxiv/pf7n6/?fbclid=IwAR2ZI0yw_pehS0mxAmeUBOGpzIhiO2LMUPGBzBLTLNo4C2HrJSoH9uZhgTY}{The article} is forthcoming in \emph{PS: Political Science and Politics.} If you are interested in my full argument, please read the article.

Here is a quick summary of why I think they matter for social science students' inclusive teaching of programming.

Showing the big picture: Every time you teach a new skill or technique, remind students what the input and output data type is. Students from either Excel, SPSS, or Stata backgrounds are not used to thinking about data structure when working on data. So, providing these guideposts is crucial to help them avoid making an obvious mistake (e.g., providing a character vector when a numeric vector is needed for the input data) and seeing the connection between different skills (e.g., using API and web scraping).

Walking through the workflow: Break down the steps involved in moving from the input to the output data. This way helps students feel less overwhelmed by learning the complex steps required to solve a particular task. It also helps students learn how to formulate a workflow when they encounter the same problem in a different context. Although the exact context is not identical, they can find patterns across them. Finally, teaching the workflow means breaking down these steps as well as putting them together, ideally using functions. Acquiring this skill is critical for students to advance from beginners to intermediate programmers who can write readable and reusable code.

\hypertarget{git_bash}{%
\chapter{Managing data and code}\label{git_bash}}

\hypertarget{the-command-line}{%
\section{The Command Line}\label{the-command-line}}

\hypertarget{the-big-picture}{%
\subsection{The Big Picture}\label{the-big-picture}}

As William Shotts the author of \emph{\href{http://linuxcommand.org/tlcl.php}{The Linux Command Line}} put it:

\begin{quote}
graphical user interfaces make easy tasks easy, while command-line interfaces make difficult tasks possible.
\end{quote}

\hypertarget{why-bother-using-the-command-line}{%
\subsection{Why bother using the command line?}\label{why-bother-using-the-command-line}}

Suppose that we want to create a plain text file that contains the word ``test.'' To do this in the command line, we need to know the following commands.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{echo}: ``Write arguments to the standard output'' This is equivalent to using a text editor (e.g., nano, vim, emacs) and writing something.
\item
  \texttt{\textgreater{}\ test} Save the expression in a file named test.
\end{enumerate}

We can put these commands together like the following:

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{echo} \StringTok{"sth"} \OperatorTok{\textgreater{}}\NormalTok{ test }
\end{Highlighting}
\end{Shaded}

Don't worry about memorizing these and other commands. Memorization is a far less important aspect of learning programming. In general, if you don't know what a command does, just type \texttt{\textless{}command\ name\textgreater{}\ -\/-help}. You can do \texttt{man\ \textless{}command\ name\textgreater{}} to obtain further information. Here, \texttt{man} stands for manual. If you need more user-friendly information, please consider using \href{https://tldr.sh/}{\texttt{tldr}}.

Let's make this simple case complex by scaling up. Suppose we want to make 100 duplicates of the \texttt{test} file. Below is the one-line code that performs the task!

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{for} \ExtensionTok{i}\NormalTok{ in }\DataTypeTok{\{1..100\}}\KeywordTok{;} \KeywordTok{do} \FunctionTok{cp}\NormalTok{ test }\StringTok{"test\_}\VariableTok{$i}\StringTok{"}\KeywordTok{;} \KeywordTok{done}  
\end{Highlighting}
\end{Shaded}

Let me break down the seemingly complex workflow.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  \texttt{for\ i\ in\ \{1..100\}}: This is a for loop. The numbers \texttt{1..100} inside the curly braces \texttt{\{\}} indicate the range of integers from 1 to 100. In R, this is equivalent to \texttt{for\ (i\ in\ 1:100)\ \{\}}.
\item
  \texttt{;} is used to run multiple commands without making line breaks. \texttt{;} works in the same way in R.
\item
  \texttt{\$var} returns the value associated with a variable. Type \texttt{name=\textless{}Your\ name\textgreater{}}. Then, type \texttt{echo\ \$name}. You should see your name printed. Variable assignment is one of the most basic things you'll learn in any programming language. In R, we do this by using \texttt{\textless{}-}.
\end{enumerate}

If you have zero experience in programming, I might have provided too many concepts too early, like variable assignment and for loop. However, you don't need to worry about them at this point. We will cover them in the next chapter.

I will give you one more example to illustrate how powerful the command line is. Suppose we want to find which file contains the character ``COVID.'' This is equivalent to finding a needle in a haystack. It's a daunting task for humans, but not for computers. Commands are verbs. So, to express this problem in a language that computers could understand, let's first find what command we should use. Often, a simple Google or \href{https://stackoverflow.com/}{Stack Overflow} search leads to an answer.

In this case, \texttt{grep} is the answer (there's also \texttt{grep} in R). This command finds PATTERNS in each FILE. What follows \texttt{-} are options (called flags): \texttt{r} (recursive), \texttt{n} (line number), \texttt{w} (match only whole words), \texttt{e} (use patterns for matching). \texttt{rnw} are for output control and \texttt{e} is for pattern selection.

So, to perform the task above, you just need one-line code: \texttt{grep\ -r\ -n\ -w\ -e\ "COVID"}

\textbf{Quick reminders}

\begin{itemize}
\tightlist
\item
  \texttt{grep}: command
\item
  \texttt{-rnw\ -e}: flags
\item
  \texttt{COVID}: argument (here, the search pattern; arguments are usually files or file paths)
\end{itemize}

Let's remove (=\texttt{rm}) all the duplicate files and the original file. \texttt{*} (any number of characters) is a wildcard (if you want to match exactly one character, use \texttt{?}). It finds every file whose name starts with \texttt{test\_}.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{rm}\NormalTok{ test\_* test }
\end{Highlighting}
\end{Shaded}

Enough with demonstrations. What is this black magic? Can you do the same thing using a graphical interface? Which method is more efficient? I hope that my demonstrations give you enough sense of why learning the command line could be incredibly useful. In my experience, mastering the command line helps automate your research process from end to end. For instance, you don't need to write files from a website using your web browser. Instead, you can run the \texttt{wget} command in the terminal. Better yet, you don't even need to run the command for the second time. You can write a Shell script (\texttt{*.sh}) that automates downloading, moving, and sorting multiple files.

\hypertarget{unix-shell}{%
\subsection{UNIX Shell}\label{unix-shell}}

The other thing you might have noticed is that there are many overlaps between the commands and base R functions (R functions that can be used without installing additional packages). This connection is not a coincidence. UNIX preceded and influenced many programming languages, including R.

The following materials on UNIX and Shell are adapted from \href{https://bids.GitHub.io/2015-06-04-berkeley/shell/00-intro.html}{the Software Carpentry}.

\hypertarget{unix}{%
\subsubsection{Unix}\label{unix}}

UNIX is an \textbf{operating system + a set of tools (utilities)}. It was developed by AT\&T employees at Bell Labs (1969-1971). From Mac OS X to Linux, many of the current operating systems are some versions of UNIX. Command-line INTERFACE is a way to communicate with your OS by typing, not pointing and clicking.

For this reason, if you're using macOS, then you don't need to do anything else to experience UNIX. You're already all set.

If you're using Windows, you need to install either Git Bash (a good option if you only use Bash for Git and GitHub) or Windows Subsystem for Linux (WSL; highly recommended if your use case goes beyond Git and GitHub). For more information, see \href{https://GitHub.com/PS239T/spring_2021/blob/main/B_Install.md}{this installation guideline} from the course repo. If you're a Windows user and don't use Windows 10, I recommend installing \href{https://www.virtualbox.org/}{VirtualBox}.

UNIX is old, but it is still mainstream, and it will be. Moreover, \href{https://en.wikipedia.org/wiki/Unix_philosophy}{the UNIX philosophy} (``Do One Thing And Do It Well'')---minimalist, modular software development---is highly and widely influential.

\begin{figure}
\centering
\includegraphics{https://upload.wikimedia.org/wikipedia/commons/1/1b/Ken_Thompson_and_Dennis_Ritchie--1973.jpg}
\caption{Ken Thompson and Dennis Ritchie, key proponents of the Unix philosophy}
\end{figure}

\hypertarget{kernel}{%
\subsubsection{Kernel}\label{kernel}}

The kernel of UNIX is the hub of the operating system: it allocates time and memory to programs. It handles the \href{http://users.ox.ac.uk/~martinw/unix/chap3.html}{filestore} (e.g., files and directories) and communications in response to system calls.

\hypertarget{shell}{%
\subsubsection{Shell}\label{shell}}

The shell is an interactive program that provides an interface between the user and the kernel. The shell interprets commands entered by the user or supplied by a Shell script and passes them to the kernel for execution.

\hypertarget{human-computer-interfaces}{%
\subsubsection{Human-Computer interfaces}\label{human-computer-interfaces}}

At a high level, computers do four things:

\begin{itemize}
\tightlist
\item
  run programs
\item
  store data
\item
  communicate with each other
\item
  interact with us (through either CLI or GUI)
\end{itemize}

\hypertarget{the-command-line-1}{%
\subsubsection{The Command Line}\label{the-command-line-1}}

This kind of interface is called a \textbf{command-line interface}, or CLI,
to distinguish it from the \textbf{graphical user interface}, or GUI, that most people now use.

The heart of a CLI is a \textbf{read-evaluate-print loop}, or REPL: when the user types a command and then presses the enter (or return) key, the computer reads it, executes it, and prints its output. The user then types another command, and so on until the user logs off.

If you're using RStudio, you can use terminal inside RStudio (next to the ``Console''). (For instance, type Alt + Shift + M)

\hypertarget{the-shell}{%
\subsubsection{The Shell}\label{the-shell}}

This description makes it sound as though the user sends commands directly to the computer and the computer sends the output directly to the user. In fact, there is usually a program in between called a \textbf{command shell}.

\begin{figure}
\centering
\includegraphics{https://miro.medium.com/max/1032/1*GuB5q_bWOSZa-8sDg1lEDA.png}
\caption{Source: Prashant Lakhera}
\end{figure}

What the user types goes into the shell; it figures out what commands to run and orders the computer to execute them.

Note, the shell is called \emph{the shell}: it encloses the operating system to hide some of its complexity and make it simpler to interact with.

A shell is a program like any other. What's special about it is that its job is to run other programs rather than do calculations itself. The commands are themselves programs: when they terminate, the shell gives the user another prompt (\$ on our systems).

\hypertarget{bash}{%
\subsubsection{Bash}\label{bash}}

The most popular Unix shell is \textbf{Bash}, the Bourne Again Shell (so-called because it's derived from a shell written by Stephen Bourne --- this is what passes for wit among programmers). Bash is the default shell on most modern implementations of \textbf{Unix} and in most packages that provide Unix-like tools for Windows.

\hypertarget{why-shell}{%
\subsubsection{Why Shell?}\label{why-shell}}

Using Bash or any other shell sometimes feels more like programming than like using a mouse. Commands are terse (often only a couple of characters long), their names are frequently cryptic, and their output is lines of text rather than something visual like a graph.

On the other hand, the shell allows us to combine existing tools in powerful ways with only a few keystrokes and set up pipelines to handle large volumes of data automatically.

In addition, the command line is often the easiest way to interact with remote machines (which explains why we learn Bash before learning Git and GitHub). If you work in a team and your team manages data on a remote server, you will likely need to access the server via something like \texttt{ssh} (I will explain this when I explain \texttt{git}) and access a SQL database (this is the subject of the final chapter).

\hypertarget{our-first-command}{%
\subsubsection{Our first command}\label{our-first-command}}

The part of the operating system responsible for managing files and directories is called the \textbf{file system}. It organizes our data into files, which hold information, and directories (also called ``folders''), which hold files or other directories.

Several commands are frequently used to create, inspect, rename, and delete files and directories. To start exploring them, let's open a shell window:

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{jae@jae{-}X705UDR}\NormalTok{:\textasciitilde{}$ }
\end{Highlighting}
\end{Shaded}

Let's demystify the output above. There's nothing complicated.

\begin{itemize}
\tightlist
\item
  jae: a specific user name
\item
  jae-X705UDR: your computer/server name
\item
  \texttt{\textasciitilde{}}: current directory (\texttt{\textasciitilde{}} = home)
\item
  \texttt{\$}: a \textbf{prompt}, which shows us that the shell is waiting for input; your shell may show something more elaborate.
\end{itemize}

Type the command \texttt{whoami}, then press the Enter key (sometimes marked Return) to send the command to the shell.

The command's output is the ID of the current user, i.e., it shows us who the shell thinks we are:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{whoami}

\CommentTok{\# Should be your user name }
\ExtensionTok{jae} 
\end{Highlighting}
\end{Shaded}

More specifically, when we type \texttt{whoami}, the shell does the following behind the scenes:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Finds a program called \texttt{whoami},
\item
  Runs that program,
\item
  Displays that program's output, then
\item
  Displays a new prompt to tell us that it's ready for more commands.
\end{enumerate}

\hypertarget{communicating-to-other-systems}{%
\subsubsection{Communicating to other systems}\label{communicating-to-other-systems}}

In the next unit, we'll focus on the structure of our own operating systems. But our operating systems rarely work in isolation; we often rely on the Internet to communicate with others! You can visualize this sort of communication within your own shell by asking your computer to \texttt{ping} (based on the old term for submarine sonar) an IP address provided by Google (8.8.8.8); in effect, this will test whether your Internet is working.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{ping}\NormalTok{ 8.8.8.8}
\end{Highlighting}
\end{Shaded}

Note: Windows users may have to try a slightly different alternative:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{ping}\NormalTok{ {-}t 8.8.8.8}
\end{Highlighting}
\end{Shaded}

(Thanks \href{http://www.paulthissen.org/}{Paul Thissen} for the suggestion!)

\hypertarget{file-system-organization}{%
\subsubsection{File system organization}\label{file-system-organization}}

Next, let's find out where we are by running a \texttt{pwd} command (\textbf{print working directory}).

At any moment, our \textbf{current working directory} is our current default directory, i.e., the directory that the computer assumes we want to run commands in unless we explicitly specify something else.

Here, the computer's response is \texttt{/home/jae}, which is the \textbf{home directory}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\BuiltInTok{pwd}

\ExtensionTok{/home/jae}
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips}

You can also download files to your computer in the terminal.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Install wget utility
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# sudo = super user }
\FunctionTok{sudo}\NormalTok{ apt{-}get install wget }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Download target files
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{wget}\NormalTok{ https://download1.rstudio.org/desktop/bionic/amd64/rstudio{-}1.4.1103{-}amd64.deb}
\end{Highlighting}
\end{Shaded}

\includegraphics{misc/wget.png}

\begin{quote}
\hypertarget{home-directory}{%
\subsubsection{Home Directory}\label{home-directory}}

The home directory path will look different on different operating systems. For example, on Linux, it will look like \texttt{/home/jae}, and on Windows, it will be similar to \texttt{C:\textbackslash{}Documents\ and\ Settings\textbackslash{}jae}. Note that it may look slightly different for different versions of Windows.
\end{quote}

\begin{quote}
\hypertarget{whoami}{%
\subsubsection{whoami}\label{whoami}}

If the command to find out who we are is \texttt{whoami}, the command to find out where we are ought to be called \texttt{whereami}, so why is it \texttt{pwd} instead? The usual answer is that in the early 1970s, when Unix was first being developed, every keystroke counted: the devices of the day were slow, and backspacing on a teletype was so painful that cutting the number of keystrokes to cut the number of typing mistakes was a win for usability. The reality is that commands were added to Unix one by one, without any master plan, by people who were immersed in its jargon.

The good news: because these basic commands were so integral to the development of early Unix, they have stuck around and appear (in some form) in almost all programming languages.
\end{quote}

\begin{quote}
If you're working on a Mac, the file structure will look similar, but not identical. The following image shows a file system graph for the typical Mac.
\end{quote}

\begin{figure}
\centering
\includegraphics{https://swcarpentry.GitHub.io/shell-novice/fig/home-directories.svg}
\caption{File Directory}
\end{figure}

We know that our current working directory \texttt{/home/jae} is stored inside \texttt{/home} because \texttt{/home} is the first part of its name. Similarly, we know that \texttt{/home} is stored inside the root directory \texttt{/} because its name begins with \texttt{/}.

\hypertarget{listing}{%
\subsubsection{Listing}\label{listing}}

Let's see what's in your home directory by running \texttt{ls} (\textbf{list files and directories}):

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{ls}

\ExtensionTok{Applications}\NormalTok{        Dropbox         Pictures}
\ExtensionTok{Creative}\NormalTok{ Cloud Files    Google Drive        Public}
\ExtensionTok{Desktop}\NormalTok{         Library         Untitled.ipynb}
\ExtensionTok{Documents}\NormalTok{       Movies          anaconda}
\ExtensionTok{Downloads}\NormalTok{       Music           file.txt}
\end{Highlighting}
\end{Shaded}

\texttt{ls} prints the names of the files and directories in the current directory in alphabetical order, arranged neatly into columns.

We can make \texttt{ls} more useful by adding flags. For instance, the following command lists the contents of the root directory (\texttt{/}). Here, the \texttt{-F} flag classifies the entries by type by appending an indicator character to each name. For example, a trailing \texttt{/} indicates a directory.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{ls}\NormalTok{ {-}F /}
\end{Highlighting}
\end{Shaded}

The leading \texttt{/} tells the computer to follow the path from the file system's root, so it always refers to exactly one directory, no matter where we are when we run the command.

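If you want to see only directories in the current working directory, you can do the following. (Remember \texttt{\^{}}? In a regular expression, this anchor matches the beginning of a line. In this case, \texttt{grep} keeps only the lines that begin with `d', which is how \texttt{ls\ -l} marks directories.)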

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{ls}\NormalTok{ {-}l }\KeywordTok{|} \FunctionTok{grep} \StringTok{"\^{}d"}
\end{Highlighting}
\end{Shaded}

What if we want to change our current working directory? Before we do this, \texttt{pwd} shows us that we're in \texttt{/home/jae}, and \texttt{ls} without any arguments shows us that directory's contents:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\BuiltInTok{pwd}

\ExtensionTok{/home/jae}

\NormalTok{$ }\FunctionTok{ls}

\ExtensionTok{Applications}\NormalTok{        Dropbox         Pictures}
\ExtensionTok{Creative}\NormalTok{ Cloud Files    Google Drive        Public}
\ExtensionTok{Desktop}\NormalTok{         Library         Untitled.ipynb}
\ExtensionTok{Documents}\NormalTok{       Movies          anaconda}
\ExtensionTok{Downloads}\NormalTok{       Music           file.txt}
\end{Highlighting}
\end{Shaded}

Use relative paths (e.g., \texttt{../spring\_2021/references.md}) whenever possible so that your code does not depend on how your system is configured.

\textbf{Additional tips}

How can I find pdf files in \texttt{Downloads} using the terminal? Remember \texttt{*} wildcard?

\begin{Shaded}
\begin{Highlighting}[]
\BuiltInTok{cd}\NormalTok{ Downloads/ }

\FunctionTok{find}\NormalTok{ *.pdf}
\end{Highlighting}
\end{Shaded}

Also, note that you don't need to type every character. Type the first few characters, then press TAB (autocomplete). This is called \textbf{tab-completion}, and we will see it in R as we go on.

\hypertarget{moving-around}{%
\subsubsection{Moving around}\label{moving-around}}

We can use \texttt{cd} (\textbf{change directory}) followed by a directory name to change our working directory.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\BuiltInTok{cd}\NormalTok{ Desktop}
\end{Highlighting}
\end{Shaded}

\texttt{cd} doesn't print anything, but if we run \texttt{pwd} after it, we can see that we are now in \texttt{/home/jae/Desktop}.

If we run \texttt{ls} without arguments now, it lists the contents of \texttt{/home/jae/Desktop}, because that's where we now are:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\BuiltInTok{pwd}

\ExtensionTok{/home/jae/Desktop}
\end{Highlighting}
\end{Shaded}

We now know how to go down the directory tree: how do we go up? We could use an absolute path:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\BuiltInTok{cd}\NormalTok{ /home/jae/}
\end{Highlighting}
\end{Shaded}

but it's almost always simpler to use \texttt{cd\ ..} to go up one level:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\BuiltInTok{pwd}

\ExtensionTok{/home/jae/Desktop}

\NormalTok{$ }\BuiltInTok{cd}\NormalTok{ ..}
\end{Highlighting}
\end{Shaded}

\texttt{..} is a special directory name meaning ``the directory containing this one,'' or more succinctly, the \textbf{parent} of the current directory. Sure enough, if we run \texttt{pwd} after running \texttt{cd\ ..}, we're back in \texttt{/home/jae/}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\BuiltInTok{pwd}

\ExtensionTok{/home/jae/}
\end{Highlighting}
\end{Shaded}

The special directory \texttt{..} doesn't usually show up when we run \texttt{ls}. If we want to display it, we can give \texttt{ls} the \texttt{-a} flag:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{ls}\NormalTok{ {-}a}

\BuiltInTok{.}       \ExtensionTok{.localized}\NormalTok{  Shared}
\ExtensionTok{..}\NormalTok{      Guest       rachel}
\end{Highlighting}
\end{Shaded}

\texttt{-a} stands for ``show all''; it forces \texttt{ls} to show us file and directory names that begin with \texttt{.}, such as \texttt{..}.

\begin{quote}
\hypertarget{hidden-files-for-your-own-protection}{%
\subsubsection{Hidden Files: For Your Own Protection}\label{hidden-files-for-your-own-protection}}

As you can see, many other items just appeared when we entered \texttt{ls\ -a}. These files and directories begin with \texttt{.} followed by a name. Usually, these files and directories hold important programmatic information. They are kept hidden so that users don't accidentally delete or edit them without knowing what they're doing.
\end{quote}

As you can see, it also displays another special directory that's just called \texttt{.}, which means ``the current working directory''. It may seem redundant to have a name for it, but we'll see some uses for it soon.

\textbf{Additional tips}

The above navigation exercises help us learn the \texttt{cd} command, but they are not very exciting. So let's do something more concrete and potentially useful. Let's say you downloaded a file using your web browser and want to locate that file. How could you do that?

Your first step should be learning more about the \texttt{ls} command. You can do that by Googling or typing \texttt{ls\ -\/-help}. By looking at the documentation, you can recognize that you need to add \texttt{-t} (sort by time). Then, what's \texttt{\textbar{}}? It's called a pipe, and it chains commands. For instance, if you type \texttt{\textless{}command\ 1\textgreater{}\ \textbar{}\ \textless{}command\ 2\textgreater{}}, then the output of command 1 will be the input of command 2. \texttt{head} lists the first ten lines of a file. The \texttt{-n1} flag makes it show only the first line of the output.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Don\textquotesingle{}t forget to use TAB completion}
\BuiltInTok{cd}\NormalTok{ Downloads/ }

\FunctionTok{ls}\NormalTok{ {-}t }\KeywordTok{|} \FunctionTok{head}\NormalTok{ {-}n1}
\end{Highlighting}
\end{Shaded}

Yeah! We can do more cool things. For example, how can you find the most recently downloaded PDF file? You can do this by combining the two neat tricks you learned earlier.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{ls}\NormalTok{ {-}t }\KeywordTok{|} \FunctionTok{find}\NormalTok{ *.pdf }\KeywordTok{|} \FunctionTok{head}\NormalTok{ {-}n1 }
\end{Highlighting}
\end{Shaded}

\hypertarget{creating-copying-removing-and-renaming-files}{%
\subsubsection{Creating, copying, removing, and renaming files}\label{creating-copying-removing-and-renaming-files}}

\hypertarget{creating-files}{%
\paragraph{Creating files}\label{creating-files}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  First, let's create an empty directory named exercise
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\FunctionTok{mkdir}\NormalTok{ exercise }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\item
  You can check whether the directory is created by typing \texttt{ls}. If the print format is challenging to read, add \texttt{-l} flag. Did you notice the difference?
\item
  Let's move to the \texttt{exercise} subdirectory and create a file named test
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\BuiltInTok{cd}\NormalTok{ exercise }\KeywordTok{;} \FunctionTok{touch}\NormalTok{ test }\KeywordTok{;} \FunctionTok{ls} 
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Read test
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\FunctionTok{cat}\NormalTok{ test }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{4}
\tightlist
\item
  Hmm. It's empty. Let's add something there. \texttt{\textgreater{}} = overwrite
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\BuiltInTok{echo} \StringTok{"something"} \OperatorTok{\textgreater{}}\NormalTok{ test }\KeywordTok{;} \FunctionTok{cat}\NormalTok{ test }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{5}
\tightlist
\item
  Yeah! Can you add more? \texttt{\textgreater{}\textgreater{}} = append
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\BuiltInTok{echo} \StringTok{"anything"} \OperatorTok{\textgreater{}\textgreater{}}\NormalTok{ test }\KeywordTok{;} \FunctionTok{cat}\NormalTok{ test }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{6}
\tightlist
\item
  Removing ``anything'' from \texttt{test} is a little bit more complex because you need to know how to use \texttt{grep} (remember that we used this command in the very first example). Here, I just demonstrate that you can do this task using Bash, and let's dig into this more when we talk about working with text files.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\FunctionTok{grep}\NormalTok{ {-}v }\StringTok{\textquotesingle{}anything\textquotesingle{}}\NormalTok{ test}
\end{Highlighting}
\end{Shaded}

\hypertarget{copying-and-removing-files}{%
\paragraph{Copying and Removing Files}\label{copying-and-removing-files}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Can we make a copy of \texttt{test}? Yes!
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\FunctionTok{cp}\NormalTok{ test test\_1}\KeywordTok{;} \FunctionTok{cat} 
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Can we make 100 copies of \texttt{test}? Yes!
\end{enumerate}

You can do this

\begin{Shaded}
\begin{Highlighting}[]

\FunctionTok{cp}\NormalTok{ test test\_1 }
\FunctionTok{cp}\NormalTok{ test test\_2}
\FunctionTok{cp}\NormalTok{ test test\_3 }

\ExtensionTok{...} 
\end{Highlighting}
\end{Shaded}

or

\begin{Shaded}
\begin{Highlighting}[]

\KeywordTok{for} \ExtensionTok{i}\NormalTok{ in }\DataTypeTok{\{1..100\}}\KeywordTok{;} \KeywordTok{do} \FunctionTok{cp}\NormalTok{ test }\StringTok{"test\_}\VariableTok{$i}\StringTok{"}\KeywordTok{;} \KeywordTok{done}  
\end{Highlighting}
\end{Shaded}

Which one do you like? (Again, don't focus on for loop. We'll learn it and other similar tools to deal with iterations in the later chapters.)

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Can you remove all of the \texttt{test\_} files?
\end{enumerate}

You can do this

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{rm}\NormalTok{ test\_1}
\FunctionTok{rm}\NormalTok{ test\_2}
\FunctionTok{rm}\NormalTok{ test\_3 }

\ExtensionTok{...}
\end{Highlighting}
\end{Shaded}

or

\begin{verbatim}
rm test_*
\end{verbatim}

Which one do you like?

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Let's remove the directory.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\BuiltInTok{cd}\NormalTok{ .. }

\FunctionTok{rm}\NormalTok{ exercise/}
\end{Highlighting}
\end{Shaded}

The \texttt{rm} command should not work because \texttt{exercise} is not a file. Type \texttt{rm\ -\/-help} and see which flag will be helpful. It might be `-d' (remove empty directories).

\begin{verbatim}
rm -d exercise/  
\end{verbatim}

Oops. Still not working because the directory is not empty. Try this. Now, it works.

\begin{verbatim}
rm -r exercise/ 
\end{verbatim}

What's \texttt{-r}? It stands for recursive: \texttt{rm\ -r} removes a directory and everything inside it. Recursion is a very powerful idea in programming and helps solve complex problems. We'll come back to it many times (e.g., \texttt{purrr::reduce()} in R).

\hypertarget{renaming-files}{%
\paragraph{Renaming files}\label{renaming-files}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Using \texttt{mv}
\end{enumerate}

First, we will learn how to move files and see how it's relevant for renaming files.

\begin{Shaded}
\begin{Highlighting}[]

\CommentTok{\# Create two directories }
\FunctionTok{mkdir}\NormalTok{ exercise\_1 }\KeywordTok{;} \FunctionTok{mkdir}\NormalTok{ exercise\_2 }

\CommentTok{\# Check whether they were indeed created }
\FunctionTok{find}\NormalTok{ exer*}

\CommentTok{\# Create an empty file }
\FunctionTok{touch}\NormalTok{ exercise\_1/test }

\CommentTok{\# Move to exercise\_1 and check }
\BuiltInTok{cd}\NormalTok{ exercise\_1 }\KeywordTok{;} \FunctionTok{ls} 

\CommentTok{\# Move this file to exercise\_2 }
\FunctionTok{mv}\NormalTok{ test ../exercise\_2 }

\CommentTok{\# Move to exercise\_2 and check }
\BuiltInTok{cd}\NormalTok{ exercise\_2 }\KeywordTok{;} \FunctionTok{ls} 
\end{Highlighting}
\end{Shaded}

What does \texttt{mv} have to do with renaming?

\begin{itemize}
\tightlist
\item
  {[}mv{]} {[}source{]} {[}destination{]}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\FunctionTok{mv}\NormalTok{ test new\_test }\KeywordTok{;} \FunctionTok{ls} 
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Using \texttt{rename}
\end{enumerate}

\texttt{mv} is an excellent tool to rename one file. But how about renaming many files? (Note that your pwd is still \texttt{exercise\_2} where you have the \texttt{new\_test} file.)

\begin{Shaded}
\begin{Highlighting}[]

\KeywordTok{for} \ExtensionTok{i}\NormalTok{ in }\DataTypeTok{\{1..100\}}\KeywordTok{;} \KeywordTok{do} \FunctionTok{cp}\NormalTok{ new\_test }\StringTok{"test\_}\VariableTok{$i}\StringTok{.csv"}\KeywordTok{;} \KeywordTok{done}  
\end{Highlighting}
\end{Shaded}

Then install \texttt{rename}. Either \texttt{sudo\ apt-get\ install\ -y\ rename} or \texttt{brew\ install\ rename} (MacOS).

Basic syntax: rename {[}flags{]} perlexpr (Perl Expression) files. Note that \href{https://en.wikipedia.org/wiki/Perl}{Perl} is another programming language.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Rename every csv file to txt file }
\ExtensionTok{rename} \StringTok{\textquotesingle{}s/.csv/.txt/\textquotesingle{}}\NormalTok{ *.csv}

\CommentTok{\# Check }
\FunctionTok{ls}\NormalTok{ {-}l}
\end{Highlighting}
\end{Shaded}

The key part is \texttt{s/.csv/.txt/} = \texttt{s/FIND/REPLACE/}

Can you perform the same task using a GUI? Yes, you can, but it would be more time-consuming. Using the command line, you did this with just a one-liner(!). \href{http://korflab.ucdavis.edu/Bios/bio_keithb.html}{Keith Bradnam} wrote an excellent book titled \href{https://www.amazon.com/Unix-Perl-Rescue-Keith-Bradnam/dp/0521169828}{UNIX and Perl to the Rescue! (Cambridge University Press 2012)} that discusses how to use UNIX and Perl to deal with massively large datasets.

\hypertarget{working-with-csv-and-text-files}{%
\subsubsection{Working with CSV and text files}\label{working-with-csv-and-text-files}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Download a CSV file (Forbes World's Billionaires lists from 1996-2014). For more on the data source, see \href{https://corgis-edu.github.io/corgis/csv/billionaires/}{this site}.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{wget}\NormalTok{ https://corgis{-}edu.github.io/corgis/datasets/csv/billionaires/billionaires.csv}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Read the first two lines. \texttt{cat} prints the file, and \texttt{head} shows the first few rows. \texttt{-n2} limits the number of rows to 2.
\end{enumerate}

\textbf{Additional tips 1}
If you have a large text file, printing everything at once with \texttt{cat} is inconvenient. The alternative is using \texttt{less}.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{cat}\NormalTok{ billionaires.csv }\KeywordTok{|} \FunctionTok{head}\NormalTok{ {-}n2}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Check the size of the dataset (2615 rows). So, there are 2614 observations (n-1 because of the header). \texttt{wc} prints newline, word, and byte counts for each file. If you run \texttt{wc} without the \texttt{-l} flag, you get the following: \texttt{2615\ (line)\ 20433\ (word)\ 607861\ (byte)\ billionaires.csv}
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{wc}\NormalTok{ {-}l billionaires.csv}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  How about the number of columns? \texttt{sed} is a stream editor and very powerful when it's used to filter text in a pipeline. For more information, see \href{https://www.gnu.org/software/sed/manual/sed.html}{this article}. You've already seen \texttt{s/FIND/REPLACE/}. Here, the pattern we are using is \texttt{s/delimiter/\textbackslash{}n/g}. We've seen that the delimiter is \texttt{,} so that's what I plugged into the command below.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{head}\NormalTok{ {-}1 billionaires.csv }\KeywordTok{|} \FunctionTok{sed} \StringTok{\textquotesingle{}s/,/\textbackslash{}n/g\textquotesingle{}} \KeywordTok{|} \FunctionTok{nl}
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips 2}
The other cool command for text parsing is \texttt{awk}. This command is handy for filtering.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  This is the same as using \texttt{cat}. So, what's new?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{awk} \StringTok{\textquotesingle{}\{print\}\textquotesingle{}}\NormalTok{ billionaires.csv }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  This is new.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{awk} \StringTok{\textquotesingle{}/China/ \{print\}\textquotesingle{}}\NormalTok{ billionaires.csv}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Let's see only the first five rows. We filtered rows so that every row in the final dataset contains `China.'
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{awk} \StringTok{\textquotesingle{}/China/ \{print\}\textquotesingle{}}\NormalTok{ billionaires.csv }\KeywordTok{|} \FunctionTok{head}\NormalTok{ {-}n5 }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  You can also get the numbers of these rows.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{awk} \StringTok{\textquotesingle{}/China/ \{print NR\}\textquotesingle{}}\NormalTok{ billionaires.csv }
\end{Highlighting}
\end{Shaded}

\hypertarget{user-roles-and-file-permissions}{%
\subsubsection{User roles and file permissions}\label{user-roles-and-file-permissions}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  If you need admin access, use \texttt{sudo}. For instance, \texttt{sudo\ apt-get\ install\ \textless{}package\ name\textgreater{}} installs the package.
\item
  To run a Shell script (.sh), you need to change its file mode. You can make the script executable by typing \texttt{chmod\ +x\ \textless{}Shell\ script\textgreater{}}. Then, you can run it by typing \texttt{./pdf\_copy.sh}. Here, \texttt{.} refers to the current working directory. Other options: \texttt{sh\ pdf\_copy.sh} or \texttt{bash\ pdf\_copy.sh}. I use \texttt{./pdf\_copy.sh}.
\end{enumerate}

\hypertarget{writing-your-first-shell-script-.sh}{%
\subsubsection{Writing your first Shell script (.sh)}\label{writing-your-first-shell-script-.sh}}

Finally, we're learning how to write a Shell script (a file that ends with .sh). Here I show how to write a Shell script that creates a subdirectory called \texttt{pdfs} under the \texttt{Downloads} directory, then finds PDF files in \texttt{Downloads} and copies those files to \texttt{pdfs}. Essentially, this Shell script creates a backup. Name this Shell script \texttt{pdf\_copy.sh}.

\begin{Shaded}
\begin{Highlighting}[]

\CommentTok{\#!/bin/sh \# Stating this is a Shell script. }

\FunctionTok{mkdir}\NormalTok{ /home/jae/Downloads/pdfs }\CommentTok{\# Obviously, in your case, this file path should be incorrect.}

\BuiltInTok{cd}\NormalTok{ Download}

\FunctionTok{cp}\NormalTok{ *.pdf pdfs/ }

\BuiltInTok{echo} \StringTok{"Copied pdfs"}
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips}

Using Make {[}TBD{]}

\hypertarget{references}{%
\subsection{References}\label{references}}

\begin{itemize}
\item
  \href{https://seankross.com/the-unix-workbench/}{The Unix Workbench} by Sean Kross
\item
  \href{http://swcarpentry.GitHub.io/shell-novice/}{The Unix Shell}, Software Carpentry
\item
  \href{https://www.datascienceatthecommandline.com/1e/}{Data Science at the Command Line} by Jeroen Janssens
\end{itemize}

\begin{itemize}
\item
  \href{https://missing.csail.mit.edu/2020/shell-tools/}{Shell Tools and Scripting}, ./missing-semester, MIT
\item
  \href{https://missing.csail.mit.edu/2020/command-line/}{Command-line Environment}, ./missing-semester, MIT
\end{itemize}

\hypertarget{git-and-github}{%
\section{Git and GitHub}\label{git-and-github}}

\hypertarget{the-big-picture-1}{%
\subsection{The Big Picture}\label{the-big-picture-1}}

\textbf{The most important point}

\begin{itemize}
\item
  Backup != Version control
\item
  If you do version control, you need to save your \textbf{raw data} in your hard disk, external drive, or cloud, but nothing else. In other words, anything you are going to change should be subject to version control (also, it's not the same as saving your code with names like 20200120\_Kim or something like that). Below, I will explain what version control is and how to do it using Git and GitHub.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://i2.wp.com/cdn-images-1.medium.com/max/399/1*7HHA_UkjUK7wp7qP4CYu1g.png?zoom=1.75\&w=456\&ssl=1}
\caption{Why you should do version control}
\end{figure}

\hypertarget{version-control-system}{%
\subsection{Version control system}\label{version-control-system}}

According to \href{https://guides.GitHub.com}{GitHub Guides}, a version control system ``tracks the history of changes as people and teams collaborate on projects together.'' Specifically, it helps to track the following information:

\begin{itemize}
\tightlist
\item
  Which changes were made?
\item
  Who made the changes?
\item
  When were the changes made?
\item
  Why were changes needed?
\end{itemize}

Git is a case of a \href{https://en.wikipedia.org/wiki/Distributed_version_control}{distributed version control system}, common in open source and commercial software development. This is no surprise given that Git \href{https://lkml.org/lkml/2005/4/6/121}{was originally created} to deal with Linux kernel development.

The following images, from \href{https://git-scm.com/book/en/v2}{Pro Git}, show how a centralized (e.g., CVS, Subversion, and Perforce) and decentralized VCS (e.g., Git, Mercurial, Bazaar or Darcs) work differently.

\begin{figure}
\centering
\includegraphics{https://git-scm.com/book/en/v2/images/centralized.png}
\caption{Centralized version control system}
\end{figure}

Figure 2. Centralized VCS.

\begin{figure}
\centering
\includegraphics{https://git-scm.com/book/en/v2/images/distributed.png}
\caption{Decentralized version control system}
\end{figure}

Figure 3. Decentralized VCS.

For more information on the varieties of version control systems, please read \href{https://pdfs.semanticscholar.org/4490/4c70bc91e1bed4fe02b9e2282f031b7c90ea.pdf}{Petr Baudis's review} on that subject.

\begin{figure}
\centering
\includegraphics{https://plain-text.co/figures/git-basic.png}
\caption{Figure 2.1. A schematic git workflow from Healy's ``The Plain Person's Guide to Plain Text Social Science''}
\end{figure}

For more information, watch the following video:

\hypertarget{setup}{%
\subsection{Setup}\label{setup}}

\hypertarget{signup}{%
\subsubsection{Signup}\label{signup}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Make sure you have installed Git (\href{https://happygitwithr.com/install-git.html\#install-git}{{[}tutorial{]}}).
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{git}\NormalTok{ {-}{-}version }
\CommentTok{\# git version 2.xx.x}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  If you haven't, please sign up for a GitHub account: \url{https://github.com/}
\end{enumerate}

\begin{itemize}
\tightlist
\item
  If you're a student, please also sign up for GitHub Student Developer Pack: \url{https://education.github.com/pack} Basically, you can get a GitHub pro account for free (so why not?).
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Access GitHub using Hypertext Transfer Protocol Secure (HTTPS) or Secure Shell (SSH).
\end{enumerate}

\textbf{HTTPS}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Create a personal access token. Follow this guideline: \url{https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token}
\item
  Store your credential somewhere safe. You can use R packages like \href{https://gitcreds.r-lib.org/}{gitcreds} and \href{https://docs.ropensci.org/credentials/}{credentials} to do so.
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(gitcreds)}

\CommentTok{\# First time only }
\KeywordTok{gitcreds\_set}\NormalTok{()}

\CommentTok{\# Check }
\KeywordTok{gitcreds\_get}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  If you get asked to provide your password when you pull or push, the password should be your GitHub token (to be precise, personal access token).
\end{enumerate}

\textbf{SSH}

If possible, I highly recommend using SSH. Using SSH is safer and also makes connecting to GitHub easier. SSH has two keys (public and private). The public key could be stored on any server (e.g., GitHub) and the private key could be saved in your client (e.g., your laptop). The system unlocks only when the two keys match.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  First, read \href{https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh}{this tutorial} and create SSH keys.
\item
  Second, read \href{https://happygitwithr.com/ssh-keys.html}{this tutorial} and check the keys and provide the public key to GitHub and add the private key to ssh-agent.
\end{enumerate}

Next time, if you want to use SSH, remember the following.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# SSH}
\ExtensionTok{git@github.com}\NormalTok{:}\OperatorTok{\textless{}}\NormalTok{user}\OperatorTok{\textgreater{}}\NormalTok{/}\OperatorTok{\textless{}}\NormalTok{repo}\OperatorTok{\textgreater{}}\NormalTok{.git}

\CommentTok{\# HTTPS}
\ExtensionTok{https}\NormalTok{://github.com/}\OperatorTok{\textless{}}\NormalTok{user}\OperatorTok{\textgreater{}}\NormalTok{/}\OperatorTok{\textless{}}\NormalTok{repo}\OperatorTok{\textgreater{}}\NormalTok{.git}
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips}

When you try to clone a git repo, you can get the links like the above by clicking the CODE action button on GitHub.

\includegraphics{misc/ssh.png}

\hypertarget{configurations}{%
\subsubsection{Configurations}\label{configurations}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Method 1: using the terminal
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]

\CommentTok{\# User name and email }
\NormalTok{$ }\FunctionTok{git}\NormalTok{ config {-}{-}global user.name }\StringTok{"Firstname Lastname"}
\NormalTok{$ }\FunctionTok{git}\NormalTok{ config {-}{-}global user.email username@school.extension}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Method 2: using RStudio (if you insist on using R)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(usethis)}
\KeywordTok{use\_git\_config}\NormalTok{(}\DataTypeTok{user.name =} \StringTok{"\textless{}Firstname Lastname\textgreater{}"}\NormalTok{,}
               \DataTypeTok{user.email =} \StringTok{"\textless{}username@school.extension\textgreater{}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

You're all set!

\hypertarget{cloning-a-repository}{%
\subsection{Cloning a repository}\label{cloning-a-repository}}

Let's clone a repository. The following address is the course I co-taught in Spring 2021.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{git}\NormalTok{ clone https://github.com/PS239T/spring\_2021}
\end{Highlighting}
\end{Shaded}

If you \texttt{cd\ spring\_2021/} you can move to the cloned course repository. Cloning: copying a public GitHub repo (remote) -\textgreater{} Your machine

If I made some changes in the remote repo, you can apply them to your local copy by typing \texttt{git\ pull}. You may get prompted to provide a password. Then type the following to switch the remote URL's address from HTTPS to SSH.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{git}\NormalTok{ remote set{-}url origin git@github.com:[user]/[repo]}
\end{Highlighting}
\end{Shaded}

If this doesn't work and you get an error, try the following command (assuming that your SSH key was removed). If you're using a Mac, try this instead: \texttt{ssh-add\ -K\ \textasciitilde{}/.ssh/id\_rsa}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{ssh{-}add}\NormalTok{ \textasciitilde{}/.ssh/id\_rsa}
\end{Highlighting}
\end{Shaded}

If you still face difficulties, see \href{https://stackoverflow.com/questions/13509293/git-fatal-could-not-read-from-remote-repository}{this stack overflow thread}.

If you screwed something up in your local copy, you can just overwrite the local copy using the remote repo and make it look exactly like the latter.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Download content from a remote repo }
\FunctionTok{git}\NormalTok{ fetch origin}

\CommentTok{\# Going back to origin/main}
\FunctionTok{git}\NormalTok{ reset {-}{-}hard origin/main }

\CommentTok{\# Remove local files }
\FunctionTok{git}\NormalTok{ clean {-}f}
\end{Highlighting}
\end{Shaded}

Note that the default branch name changed from master to main: \url{https://github.com/github/renaming} (Finally!) For this reason, if you're interacting with old repositories, the main branch name is likely to be master.

\textbf{Additional tips}

You can see cloning and forking on GitHub, and they sound similar. Let me differentiate them.

\begin{itemize}
\item
  Cloning: creating a local copy of a \textbf{public} GitHub repo. In this case, you have writing access to the repo.
\item
  Forking (for open source projects): creating a copy of a \textbf{public} GitHub repo to your GitHub account, then you can clone it. In this case, you don't have writing access to the repo. You need to create pull requests if you want your changes reflected in the original repo. Don't worry about pull requests, as I will explain the concept shortly. For more information, see \href{https://docs.github.com/en/desktop/contributing-and-collaborating-using-github-desktop/cloning-and-forking-repositories-from-github-desktop}{this documentation}.
\end{itemize}

\hypertarget{making-a-repository}{%
\subsection{Making a repository}\label{making-a-repository}}

Create a new directory and move there.
Then initialize a Git repository in it.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# new directory }
\NormalTok{$ }\FunctionTok{mkdir}\NormalTok{ code\_exercise}
\CommentTok{\# move }
\NormalTok{$ }\BuiltInTok{cd}\NormalTok{ code\_exercise }
\CommentTok{\# initialize}
\NormalTok{$ }\FunctionTok{git}\NormalTok{ init }
\end{Highlighting}
\end{Shaded}

Alternatively, you can create a Git repository via GitHub and then clone it on your local machine. Perhaps, it is an easier path for new users (I also do this all the time). I highly recommend adding README (more on why we do this in the following subsection).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ clone /path/to/repository}
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips}

If you're unfamiliar with basic Git commands, please refer to \href{http://rogerdudler.GitHub.io/git-guide/files/git_cheat_sheet.pdf}{this Git cheat sheet}.

\hypertarget{commit-changes}{%
\subsection{Commit changes}\label{commit-changes}}

These features show how Git works as a version control system.

If you edited files or added new ones, you need to update your repository. In Git terms, this action is called committing changes.

My current pwd is \texttt{spring\_2021}. I created a text file named \texttt{test} containing the text \texttt{jae}. You can check that the file exists by typing \texttt{find\ "test"}.

The following is a typical workflow to reflect this change to the remote.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ status }\CommentTok{\# check what\textquotesingle{}s changed. }
\NormalTok{$ }\FunctionTok{git}\NormalTok{ add . }\CommentTok{\# update every change. In Git terms, you\textquotesingle{}re staging. }
\NormalTok{$ }\FunctionTok{git}\NormalTok{ add file\_name }\CommentTok{\# or stage a specific file.}
\NormalTok{$ }\FunctionTok{git}\NormalTok{ commit {-}m }\StringTok{"your comment"} \CommentTok{\# your comment for the commit. }
\NormalTok{$ }\FunctionTok{git}\NormalTok{ push origin main }\CommentTok{\# push the commit to the remote. Origin is the default name Git gives to the remote server. \textasciigrave{}origin main\textasciigrave{} is optional. }
\end{Highlighting}
\end{Shaded}

Another image from \href{https://git-scm.com/about/staging-area}{Pro Git} nicely illustrates this process.

\begin{figure}
\centering
\includegraphics{https://git-scm.com/images/about/index1@2x.png}
\caption{Git Workflow}
\end{figure}

If you made a mistake, don't panic. You can revert the process.

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{git}\NormalTok{ reset {-}{-}soft HEAD\textasciitilde{}1 }\CommentTok{\# if you still want to keep the change, but you go back to t{-}1 }
\FunctionTok{git}\NormalTok{ reset {-}{-}hard HEAD\textasciitilde{}1 }\CommentTok{\# if you\textquotesingle{}re sure the change is unnecessary }
\end{Highlighting}
\end{Shaded}

Writing an informative commit is essential. To learn how to do this better, see the following video:

\hypertarget{push-and-pull-or-fetch}{%
\subsection{Push and pull (or fetch)}\label{push-and-pull-or-fetch}}

These features show how Git works as a collaboration tool.

If you have not already done it, let's clone the PS239T directory on your local machine.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ clone https://github.com/PS239T/spring\_2021 }\CommentTok{\# clone }
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips 1}

If you try to remove \texttt{spring\_2021} using \texttt{rm\ -r\ spring\_2021/}, you will get an error about the write-protected regular file. Then, try \texttt{rm\ -rf\ spring\_2021/}.

Then, let's learn more about the repository.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ remote {-}v }
\end{Highlighting}
\end{Shaded}

You should see something like the following:

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{origin}\NormalTok{  git@github.com:PS239T/spring\_2021 (fetch)}
\ExtensionTok{origin}\NormalTok{  git@github.com:PS239T/spring\_2021 (push)}
\end{Highlighting}
\end{Shaded}

If you want to see more information, then type \texttt{git\ remote\ show\ origin.}

Previously, we learned how to send the data saved on your local machine to the remote (the GitHub server). You can do that by editing or creating files, committing, and typing \textbf{git push}.

Conversely, if you want to update your local data with the remote data, you can type \textbf{git pull origin} (something like pwd in bash). When you do that, Git retrieves the data and merges it into your local data. Alternatively, you can use fetch, which only retrieves the data from the remote without merging it.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ fetch origin}
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips 2}

Developers usually use PR to refer to pull requests. When you are making PRs, it's recommended to scope them down (small PRs) because they are easier to review and test. To learn about how to accomplish this, see \href{https://www.netlify.com/blog/2020/03/31/how-to-scope-down-prs/}{this blog post} by Sarah Drasner.

\hypertarget{branching}{%
\subsection{Branching}\label{branching}}

It's an advanced feature of Git's version control system that allows developers to ``diverge from the main line of development and continue to do work without messing with that main line,'' according to \href{https://git-scm.com/book/en/v1/Git-Branching}{Scott Chacon and Ben Straub}.

If you start working on a new feature, create a new branch.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ branch new\_features}
\NormalTok{$ }\FunctionTok{git}\NormalTok{ checkout new\_features}
\end{Highlighting}
\end{Shaded}

You can see the newly created branch by typing \textbf{git branch}.

In short, branching makes Git \href{https://git-scm.com/book/en/v2/Getting-Started-Git-Basics}{work like} a mini file system.

\hypertarget{other-useful-commands}{%
\subsection{Other useful commands}\label{other-useful-commands}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  For tracking history
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ diff }\CommentTok{\# to see what changed (e.g., inside a file)}
\NormalTok{$ }\FunctionTok{git}\NormalTok{ log }\CommentTok{\# to track who committed what}
\NormalTok{$ }\FunctionTok{git}\NormalTok{ log {-}S }\OperatorTok{\textless{}}\NormalTok{pattern}\OperatorTok{\textgreater{}} \CommentTok{\# you can find a log that contains the pattern }
\NormalTok{$ }\FunctionTok{git}\NormalTok{ checkout }\CommentTok{\# to recover old files }
\NormalTok{$ }\FunctionTok{git}\NormalTok{ revert }\CommentTok{\# revert to the previous commit }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  For removing and renaming files
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{$ }\FunctionTok{git}\NormalTok{ rm file\_name }\CommentTok{\# remove }
\NormalTok{$ }\FunctionTok{git}\NormalTok{ mv old\_file\_name new\_file\_name }\CommentTok{\# rename a file }
\end{Highlighting}
\end{Shaded}

How about removing a directory only from GitHub but not local?

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{git}\NormalTok{ rm {-}r {-}{-}cached }\OperatorTok{\textless{}}\NormalTok{directory}\OperatorTok{\textgreater{}}
\FunctionTok{git}\NormalTok{ commit {-}m }\StringTok{"\textless{}message\textgreater{}"}
\FunctionTok{git}\NormalTok{ push}
\end{Highlighting}
\end{Shaded}

\hypertarget{collaborations}{%
\subsection{Collaborations}\label{collaborations}}

Two options.

\begin{itemize}
\tightlist
\item
  Sharing a repository (suitable for a private project).
\item
  Fork and pull (suitable for an open-source project).

  \begin{itemize}
  \tightlist
  \item
    The one who maintains the repository becomes the maintainer.
  \item
    The others can \href{https://help.GitHub.com/articles/about-forks/}{fork}, make changes, and even \href{https://help.GitHub.com/articles/about-pull-requests/}{pull} them back.
  \end{itemize}
\end{itemize}

\hypertarget{deployment-github-pages}{%
\subsection{Deployment: GitHub Pages}\label{deployment-github-pages}}

Useful to deploy websites. I used the GitHub page to deploy this book.

\hypertarget{tracking-progress-github-issues}{%
\subsection{Tracking progress: GitHub Issues}\label{tracking-progress-github-issues}}

Useful to collect and respond to questions and suggestions (e.g., bug reports and feature suggestions) on the projects on which you're working.

\hypertarget{project-management-github-dashboards}{%
\subsection{Project management: GitHub Dashboards}\label{project-management-github-dashboards}}

I use GitHub dashboards for almost every project that I have done.

\hypertarget{using-git-clients}{%
\subsection{Using Git clients}\label{using-git-clients}}

Okay. Using command-line Git helps you understand how Git works. Also, you know that I'd love to do almost everything in the terminal if possible. However, using the command-line Git is sometimes too complicated or too buggy. An alternative is using Git clients (GUI).

I use \href{https://www.gitkraken.com/}{Git Kraken} because it's free, works on almost every OS (Windows, Mac, and Linux), and is quite versatile. It's instrumental in tracing and understanding your development process (e.g., commits, branches, pull requests, comments). I also heard positive things about \href{https://desktop.github.com/}{GitHub Desktop} (Windows and Mac) and \href{https://www.sourcetreeapp.com/}{Sourcetree} (Windows).

\hypertarget{getting-started-in-r}{%
\section{Getting started in R}\label{getting-started-in-r}}

\hypertarget{the-big-picture-2}{%
\subsection{The Big Picture}\label{the-big-picture-2}}

When you are reading this section, please note that you've already grasped some key concepts behind R programming language (functions and objects).

UNIX Commands (\texttt{cat}) = R Functions (\texttt{print})

Files = R Objects

\hypertarget{rstudio}{%
\subsection{RStudio}\label{rstudio}}

There are two main ways of interacting with R: using the console or using script files (plain text files containing your code).

If R is ready to accept commands, the R console shows a \texttt{\textgreater{}} prompt. If it receives a command (by typing, copy-pasting, or sent from the script editor using \texttt{Ctrl-Enter}; \texttt{Command-Enter} will also work on Macs), R will try to execute it, and when ready, show the results and come back with a new \texttt{\textgreater{}}-prompt to wait for further commands. This is the equivalent of the \texttt{\$} in your terminal.

\hypertarget{basic-syntax}{%
\subsection{Basic Syntax}\label{basic-syntax}}

\textbf{Comments}

Use \texttt{\#} signs to comment. Comment liberally in your R scripts. Anything to the right of a \texttt{\#} is ignored by R. For those of you familiar with other languages, there is no doc string, or equivalent to \texttt{"""}, in R.

\textbf{Assignment operator}

\texttt{\textless{}-} is the assignment operator. It assigns values on the right to objects on the left. So, after executing \texttt{x\ \textless{}-\ 3}, \texttt{x} value is \texttt{3}. The arrow can be read as 3 \textbf{goes into} \texttt{x}. You can also use \texttt{=} for assignments.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{USweird \textless{}{-}}\StringTok{ "Why use lb for pound!"} \CommentTok{\# Use this}

\StringTok{"Why use lb for pound!"}\NormalTok{ =}\StringTok{ }\NormalTok{USweird}
\end{Highlighting}
\end{Shaded}

Nonetheless, \emph{can} does not mean you \emph{should}. It is good practice to use \texttt{\textless{}-} for assignments. \texttt{=} should only be used to specify the values of arguments of functions. This is what Google and Hadley Wickham recommend as well. If they don't convince you enough, here's \href{https://csgillespie.wordpress.com/2010/11/16/assignment-operators-in-r-vs/}{a real example}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{mean}\NormalTok{(}\DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{) }\CommentTok{\# Does it save x?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 5.5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in rm(x): object 'x' not found
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{mean}\NormalTok{(x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{) }\CommentTok{\# Does it save x?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 5.5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\textbf{Printing}

In R, an object's contents can be printed by simply executing the object name or calling the \texttt{print()} function.

\textbf{Help}

\begin{itemize}
\tightlist
\item
  \texttt{?} + object opens a help page for that specific object
\item
  \texttt{??} + object searches help pages containing the name of the object
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{?mean}
\NormalTok{??mean}
\KeywordTok{help}\NormalTok{(mean)}

\CommentTok{\# The above three will do the same. }

\KeywordTok{example}\NormalTok{(ls) }\CommentTok{\# provides an example(s) for how to use ls }

\KeywordTok{help.search}\NormalTok{(}\StringTok{"visualization"}\NormalTok{) }\CommentTok{\# search functions and packages that have "visualization" in their descriptions}
\end{Highlighting}
\end{Shaded}

\hypertarget{environment}{%
\subsection{Environment}\label{environment}}

Environment = a collection of pairs

\hypertarget{objects}{%
\subsubsection{Objects}\label{objects}}

\begin{itemize}
\tightlist
\item
  List objects in your current environment
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a numeric object }
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{,}\DecValTok{2}\NormalTok{,}\DecValTok{3}\NormalTok{,}\DecValTok{4}\NormalTok{,}\DecValTok{5}\NormalTok{)}

\CommentTok{\# List the object }
\KeywordTok{ls}\NormalTok{()}

\CommentTok{\# Remove the object }
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Remove objects from your current environment
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create an object }
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{5}

\CommentTok{\# Remove the object }
\KeywordTok{rm}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Remove all objects from your current environment
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create an object }
\NormalTok{a \textless{}{-}}\StringTok{ }\DecValTok{7}

\NormalTok{b \textless{}{-}}\StringTok{ }\DecValTok{3}

\CommentTok{\# Remove the object }
\KeywordTok{rm}\NormalTok{(}\DataTypeTok{list =} \KeywordTok{ls}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Force memory release
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Garbage collect; for more information, type ?gc() }

\KeywordTok{gc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\hypertarget{packages}{%
\subsubsection{Packages}\label{packages}}

\texttt{install.packages(package-name)} will download a package from one of the CRAN mirrors, assuming that a binary is available for your operating system.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# From CRAN}
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{) }

\CommentTok{\# Load package }
\KeywordTok{library}\NormalTok{(dplyr)}

\CommentTok{\# From GitHub }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install\_github}\NormalTok{(}\StringTok{"jaeyk/tidytweetjson"}\NormalTok{) }\CommentTok{\# my own package }

\CommentTok{\# Unload package }
\CommentTok{\# detach("package:stats", unload=TRUE)}
\end{Highlighting}
\end{Shaded}

\textbf{Tips}

If you have multiple packages to install, then please consider using the \texttt{pacman} package. The following is an example. First, you install \texttt{pacman}. Then, you load several libraries by using \texttt{p\_load()} method.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}

\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  ggplot2,}
\NormalTok{  dplyr, }
\NormalTok{  broom}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

If you don't like to use \texttt{pacman}, the other option is to create a list (we're going to learn what a list is soon).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pkgs \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"ggplot2"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{, }\StringTok{"broom"}\NormalTok{)}

\KeywordTok{install.packages}\NormalTok{(pkgs)}
\end{Highlighting}
\end{Shaded}

Still, we have to write two lines. The more straightforward, the better, right? Here's another approach that can simplify the code further.

Note that \texttt{lapply()} applies (there's a family of apply functions) a function to a list. In this case, library to pkgs. \texttt{apply} is an advanced concept related to anonymous functions. We will learn about it later when we study functions.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{inst \textless{}{-}}\StringTok{ }\KeywordTok{lapply}\NormalTok{(pkgs, library, }
               \DataTypeTok{character.only =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{project-oriented-research}{%
\section{Project-oriented research}\label{project-oriented-research}}

\hypertarget{the-big-picture-3}{%
\subsection{The Big Picture}\label{the-big-picture-3}}

\textbf{Computational reproducibility}

\begin{itemize}
\item
  Replication = code + data
\item
  Computational reproducibility = code + data + environment + distribution
\item
  Reproducibility checklist by \href{http://www.biostat.jhsph.edu/~rpeng/}{Roger Peng}

  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \item
    Start with science (avoid vague questions and concepts)
  \item
    Don't do things by hand (not only about automation but also documentation)
  \item
    Don't point and click (same problem)
  \item
    Teach a computer (automation also solves documentation to some extent)
  \item
    Use some version control
  \item
    Don't save output (instead, keep the input and code)
  \item
    Set your seed
  \item
    Think about the entire pipeline
  \end{enumerate}
\end{itemize}

\hypertarget{setup-1}{%
\subsubsection{Setup}\label{setup-1}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  tidyverse, }\CommentTok{\# tidyverse}
\NormalTok{  here }\CommentTok{\# computational reproducibility}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{motivation-1}{%
\subsubsection{Motivation}\label{motivation-1}}

Why do you need to make your research computationally reproducible?: for your self-interest and public benefits.

\includegraphics{https://GitHub.com/dlab-berkeley/efficient-reproducible-project-management-in-R/blob/master/misc/screenshot.png?raw=true}

\hypertarget{how-to-organize-files-in-a-project}{%
\subsubsection{How to organize files in a project}\label{how-to-organize-files-in-a-project}}

You won't be able to reproduce your project unless it is efficiently organized.

Step 1. \href{https://environments.rstudio.com/}{\textbf{Environment}} is part of your project. If someone can't reproduce your environment, they won't be able to run your code.

\begin{itemize}
\tightlist
\item
  Launch R Studio. Tools \textgreater{} Global Options. You \textbf{should not} check Restore .RData into workspace at startup. Also, set the saving workspace option to \textbf{NEVER.}
\end{itemize}

Step 2. For each project, create a project directory named after the project.

name\_of\_the\_project

\begin{itemize}
\tightlist
\item
  data:

  \begin{itemize}
  \tightlist
  \item
    raw
  \item
    processed (all processed, cleaned, and tidied)
  \end{itemize}
\item
  figures
\item
  packrat (optional)
\item
  reports (PDF, HTML, TEX, etc.,)
\item
  results (model outcomes, etc.,)
\item
  scripts (i.e., functions)
\item
  .gitignore (for Git)
\item
  name\_of\_project.Rproj (for R)
\item
  README.md (for Git)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Don\textquotesingle{}t name it a project. Instead, use a more informative name. For instance, \textasciigrave{}us\_election\textasciigrave{}, not \textasciigrave{}my\_project.\textasciigrave{}}

\KeywordTok{dir.create}\NormalTok{(}\StringTok{"../us\_election"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Step 3. Launch R Studio. Choose File \textgreater{} New project \textgreater{} Browse existing directories \textgreater{} Create project. This allows each project to have its own workspace.

Step 4. Organize files by putting them in separate subdirectories and sensibly naming them.

\begin{itemize}
\item
  Treat raw data as read-only (raw data should be RAW!) and put it in the \texttt{data} subdirectory.

  \begin{itemize}
  \tightlist
  \item
    Again, note that version control does not need to replace backup. You still need to back up your raw data.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"data"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Separate read-only data from processed data and put in the \texttt{processed\_data} subdirectory.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"processed\_data"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Put your code in the \texttt{src} subdirectory.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"src"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Put generated outputs (e.g., tables, figures) in the \texttt{outputs} subdirectory and treat them as disposable.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"outputs"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Put your custom functions in the \texttt{functions} subdirectory. Then, you can gather some of these functions and distribute them as an open-source library.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dir.create}\NormalTok{(here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"us\_election"}\NormalTok{, }\StringTok{"functions"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

Are you tired of creating these directories one by one? Why not automate? See the following example. You can save this function as an R script (e.g., \texttt{setup.r}) and run it in the terminal using \texttt{Rscript\ \textless{}script\ name\textgreater{}}.

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(pacman)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}

\CommentTok{\# Load here}
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  purrr, }\CommentTok{\# functional programming}
\NormalTok{  here }\CommentTok{\# computational reproducibility}
\NormalTok{)}

\CommentTok{\# Custom function}
\NormalTok{create\_dirs \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(name) \{}
  \KeywordTok{dir.create}\NormalTok{(}\KeywordTok{here}\NormalTok{(name))}
\NormalTok{\}}

\CommentTok{\# Apply function }
\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"processed\_data"}\NormalTok{, }\StringTok{"src"}\NormalTok{, }\StringTok{"outputs"}\NormalTok{, }\StringTok{"functions"}\NormalTok{), create\_dirs)}
\end{Highlighting}
\end{Shaded}

\textbf{Challenge}

Set a project structure for a project named ``starwars.''

\hypertarget{how-to-organize-code-in-an-r-markdown-file}{%
\subsubsection{How to organize code in an R markdown file}\label{how-to-organize-code-in-an-r-markdown-file}}

\begin{itemize}
\item
  In addition to environment, \textbf{workflow} is an essential component of project efficiency and reproducibility.
\item
  What is R Markdown? It is an R package, developed by \href{https://yihui.org/en/}{Yihui Xie}, that provides an authoring framework for data science. Xie is also the developer of many widely popular R packages, such as \texttt{knitr}, \href{https://GitHub.com/yihui/xaringan}{\texttt{xaringan}} (cool kids use xaringan, not \href{https://en.wikipedia.org/wiki/Beamer_(LaTeX)}{Beamer}, these days), \texttt{blogdown} (used to create \href{https://jaeyk.GitHub.io/}{my personal website}), and \texttt{bookdown} (used to create this book), among many others.

  \begin{itemize}
  \tightlist
  \item
    Many applications: \href{https://rstudio.GitHub.io/distill/basics.html}{reports}, \href{https://bookdown.org/yihui/rmarkdown/xaringan.html}{presentations}, \href{https://rmarkdown.rstudio.com/flexdashboard/}{dashboards}, \href{https://bookdown.org/yihui/rmarkdown/websites.html}{websites}
  \item
    Check out \href{https://ysc-rmarkdown.netlify.app/}{Communicating with R markdown workshop} by \href{https://alison.rbind.io/}{Alison Hill} (RStudio)

    \begin{itemize}
    \tightlist
    \item
      Alison Hill is a co-author of \href{https://bookdown.org/yihui/blogdown/}{\texttt{blogdown:\ Creating\ Websites\ with\ R\ Markdown.}}
    \end{itemize}
  \item
    Key strengths: dynamic reporting + reproducible science + easy deployment
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://GitHub.com/rstudio/concept-maps/raw/master/en/rmarkdown.svg}
\caption{Concept map for R Markdown. By Gabriela Sandoval, Florencia D'Andrea, Yanina Bellini Saibene, Monica Alonso.}
\end{figure}

\begin{itemize}
\tightlist
\item
  R Markdown basic syntax
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Header 1}
\CommentTok{\#\# Header 2}
\CommentTok{\#\#\# Header 3}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Use these section headers to indicate workflow.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Import packages and data}
\CommentTok{\# Tidy data}
\CommentTok{\# Wrangle data}
\CommentTok{\# Model data}
\CommentTok{\# Visualize data}
\end{Highlighting}
\end{Shaded}

Press \texttt{ctrl\ +\ shift\ +\ o}. You can see a document outline based on these headers. This is a nice feature for finding the code you need to focus on.

If your project's scale is large, split these sections into separate, numbered files and save them in the \texttt{src} subdirectory.

\begin{itemize}
\tightlist
\item
  01\_wrangling.Rmd
\item
  02\_modeling.Rmd
  \ldots{}
\end{itemize}

\hypertarget{making-a-project-computationally-reproducible}{%
\subsubsection{Making a project computationally reproducible}\label{making-a-project-computationally-reproducible}}

\begin{itemize}
\item
  \texttt{setwd()}: set a working directory.
\item
  Note that using \texttt{setwd()} is not a reproducible way to set up your project. For instance, no one except me will be able to run the following code.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Set a working directory }
\KeywordTok{setwd}\NormalTok{(}\StringTok{"/home/jae/starwars"}\NormalTok{)}

\CommentTok{\# Do something }
\KeywordTok{ggplot}\NormalTok{(mtcars, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ mpg, }\DataTypeTok{y =}\NormalTok{ wt)) }\OperatorTok{+}
\StringTok{   }\KeywordTok{geom\_point}\NormalTok{()}

\CommentTok{\# Export the object. }
\CommentTok{\# dot means the working directory set by setwd()}
\KeywordTok{ggsave}\NormalTok{(}\StringTok{"./outputs/example.png"}\NormalTok{) }\CommentTok{\# This is called relative path }
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Instead, learn how to use \texttt{here()}.

  \begin{itemize}
  \item
    Key idea: separate workflow (e.g., workspace information) from products (code and data). For more information, please read Jenny Bryan's excellent piece on \href{https://www.tidyverse.org/blog/2017/12/workflow-vs-script/}{project-oriented workflow}.
  \item
    Example
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New: Reproducible }

\KeywordTok{ggplot}\NormalTok{(mtcars, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ mpg, }\DataTypeTok{y =}\NormalTok{ wt)) }\OperatorTok{+}
\StringTok{   }\KeywordTok{geom\_point}\NormalTok{()}

\KeywordTok{ggsave}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"project"}\NormalTok{, }\StringTok{"outputs"}\NormalTok{, }\StringTok{"example.png"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  How \texttt{here} works
\end{itemize}

The \texttt{here()} function shows the top-level project directory.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{here}\OperatorTok{::}\KeywordTok{here}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Build a path including subdirectories
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{here}\OperatorTok{::}\KeywordTok{here}\NormalTok{(}\StringTok{"project"}\NormalTok{, }\StringTok{"outputs"}\NormalTok{)}
           \CommentTok{\#depth 1   \#depth 2}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  How \texttt{here} defines the top-level project directory. The following list comes from \href{https://GitHub.com/jennybc/here_here}{the here package vignette}.

  \begin{itemize}
  \item
    Is a file named .here present?
  \item
    Is this an RStudio Project? (\textbf{Note that we already set up an RStudio Project!} So, if you use RStudio's project feature, then you are ready to use \texttt{here}.)
  \item
    Is this an R package? Does it have a DESCRIPTION file?
  \item
    Is this a remake project? Does it have a file named \texttt{remake.yml}?
  \item
    Is this a projectile project? Does it have a file named \texttt{.projectile}?
  \item
    Is this a checkout from a version control system? Does it have a directory named \texttt{.git} or \texttt{.svn}? Currently, only Git and Subversion are supported.
  \item
    If there's no match then use \texttt{set\_here()} to create an empty \texttt{.here} file.
  \end{itemize}
\end{itemize}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Can you define computational reproducibility?
\item
  Can you explain why sharing code and data is not enough for computational reproducibility?
\end{enumerate}

\hypertarget{references-1}{%
\subsection{References}\label{references-1}}

\begin{itemize}
\item
  Code and data management

  \begin{itemize}
  \tightlist
  \item
    \href{https://web.stanford.edu/~gentzkow/research/CodeAndData.pdf}{``Code and Data for the Social Sciences: A Practitioner's Guide''} by Matthew Gentzkow and Jesse M. Shapiro
  \end{itemize}
\item
  Project-oriented research

  \begin{itemize}
  \item
    Computational reproducibility

    \begin{itemize}
    \item
      \href{https://GitHub.com/swcarpentry/good-enough-practices-in-scientific-computing/blob/gh-pages/good-enough-practices-for-scientific-computing.pdf}{``Good Enough Practices in Scientific Computing''} by PLOS
    \item
      \href{https://swcarpentry.GitHub.io/r-novice-gapminder/02-project-intro/}{Project Management with RStudio} by Software Carpentry
    \item
      \href{https://kbroman.org/steps2rr/}{Initial steps toward reproducible research} by Karl Broman
    \end{itemize}
  \item
    Version control

    \begin{itemize}
    \item
      \href{https://swcarpentry.GitHub.io/git-novice/}{Version Control with Git} by Software Carpentry
    \item
      \href{http://plain-text.co/}{The Plain Person's Guide to Plain Text Social Science} by Kieran Healy
    \end{itemize}
  \end{itemize}
\end{itemize}

\hypertarget{writing-code-how-to-code-like-a-professional}{%
\section{Writing code: How to code like a professional}\label{writing-code-how-to-code-like-a-professional}}

\hypertarget{the-bic-piture}{%
\subsection{The Big Picture}\label{the-bic-piture}}

\begin{itemize}
\tightlist
\item
  What is code style?
\end{itemize}

\begin{quote}
Every major open-source project has its style guide: a set of conventions (sometimes arbitrary) about writing code for that project. It is much easier to understand a large codebase when all the code in it is in a consistent style. - \href{https://google.GitHub.io/styleguide/}{Google Style Guides}
\end{quote}

\begin{itemize}
\item
  How to avoid smelly code?

  \begin{itemize}
  \tightlist
  \item
    Check out \href{https://GitHub.com/jennybc/code-smells-and-feels\#readme}{the code-smells Git repository} by Jenny Bryan.
  \end{itemize}
\end{itemize}

\hypertarget{write-readable-code}{%
\subsection{Write readable code}\label{write-readable-code}}

\begin{itemize}
\item
  Naming matters

  \begin{itemize}
  \tightlist
  \item
    When naming files, remember the following three rules:

    \begin{itemize}
    \tightlist
    \item
      Machine-readable (avoid spaces, punctuation, periods, and any other special characters except \_ and -)
    \item
      Human-readable (should be meaningful; no text1, image1, etc.)
    \item
      Ordering (e.g., 01, 02, 03, \ldots{} )
    \end{itemize}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\NormalTok{fit\_models.R}

\CommentTok{\# Bad}
\NormalTok{fit models.R}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  When naming objects:

  \begin{itemize}
  \tightlist
  \item
    Don't use special characters.
  \item
    Don't capitalize.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good }
\NormalTok{day\_one}
    
\CommentTok{\# Bad }
\NormalTok{DayOne}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  When naming functions:

  \begin{itemize}
  \tightlist
  \item
    Don't use special characters.
  \item
    Don't capitalize.
  \item
    Use \texttt{verbs} instead of \texttt{nouns}. (Functions do something!)
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good }
\NormalTok{run\_rdd }

\CommentTok{\# Bad }
\NormalTok{rdd}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Spacing
\end{itemize}

Some people do spacing by pressing the Tab key, and others do it by pressing the Space key multiple times (and this is a serious subject).

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\NormalTok{x[, }\DecValTok{1}\NormalTok{] }

\KeywordTok{mean}\NormalTok{(x, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{) }

\CommentTok{\# Bad}

\NormalTok{x[,}\DecValTok{1}\NormalTok{]}

\KeywordTok{mean}\NormalTok{ (x, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Indenting
\end{itemize}

Indent at least 4 spaces. Note that some people, including none other than \href{https://simplystatistics.org/2018/07/27/why-i-indent-my-code-8-spaces/}{Roger Peng}, indent 8 spaces. The below example shows how you can change the default indentation setting using the RStudio configuration.

\begin{figure}
\centering
\includegraphics{https://pbs.twimg.com/media/CuHHs7yXgAAFWeh?format=jpg\&name=360x360}
\caption{Roger Peng's tweet}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\ControlFlowTok{if}\NormalTok{ (y }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{}
  \KeywordTok{message}\NormalTok{(}\StringTok{"y is negative"}\NormalTok{)}
\NormalTok{\}}

\CommentTok{\# Bad}
\ControlFlowTok{if}\NormalTok{ (y }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{}
\KeywordTok{message}\NormalTok{(}\StringTok{"Y is negative"}\NormalTok{)\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Long lines
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\KeywordTok{do\_something\_very\_complicated}\NormalTok{(}
  \DataTypeTok{something =} \StringTok{"that"}\NormalTok{,}
  \DataTypeTok{requires =}\NormalTok{ many,}
  \DataTypeTok{arguments =} \StringTok{"some of which may be long"}
\NormalTok{)}

\CommentTok{\# Bad}
\KeywordTok{do\_something\_very\_complicated}\NormalTok{(}\StringTok{"that"}\NormalTok{, }\DataTypeTok{requires =}\NormalTok{ many, }\DataTypeTok{arguments =}
                              \StringTok{"some of which may be long"}
\NormalTok{                              )}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Comments

  \begin{itemize}
  \tightlist
  \item
    Use comments to explain your decisions.
  \item
    But show your code; do not try to explain your code with comments.
  \item
    Also, try to comment out rather than delete the code you experiment with.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Average sleep hours of Jae}
\NormalTok{jae }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# By week}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(week) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Mean sleep hours }
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{week\_sleep =} \KeywordTok{mean}\NormalTok{(sleep, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Pipes (chaining commands)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Good}
\NormalTok{iris }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(Species) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize\_if}\NormalTok{(is.numeric, mean) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ungroup}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(measure, value, }\OperatorTok{{-}}\NormalTok{Species) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(value)}

\CommentTok{\# Bad}
\NormalTok{iris }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{group\_by}\NormalTok{(Species) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{summarize\_all}\NormalTok{(mean) }\OperatorTok{\%\textgreater{}\%}
\NormalTok{ungroup }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{gather}\NormalTok{(measure, value, }\OperatorTok{{-}}\NormalTok{Species) }\OperatorTok{\%\textgreater{}\%}
\KeywordTok{arrange}\NormalTok{(value)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Additional tips
\item
  Use \texttt{lintr} to check whether your code complies with a recommended style guideline (e.g., \texttt{tidyverse}) and \texttt{styler} package to format your code according to the style guideline.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://camo.GitHubusercontent.com/6cb80270269165a8d3046d2da03cbf2b8f19ee2f/687474703a2f2f692e696d6775722e636f6d2f61635632374e562e676966}
\caption{How lintr works}
\end{figure}

\hypertarget{write-reusable-code}{%
\subsection{Write reusable code}\label{write-reusable-code}}

\begin{itemize}
\tightlist
\item
  Pasting
\end{itemize}

\begin{quote}
Copy-and-paste programming, sometimes referred to as just pasting, is the production of highly repetitive computer programming code, as produced by copy and paste operations. It is primarily a pejorative term; those who use the term are often implying a lack of programming competence. It may also be the result of technology limitations (e.g., an insufficiently expressive development environment) as subroutines or libraries would normally be used instead. However, there are occasions when copy-and-paste programming is considered acceptable or necessary, such as for boilerplate, loop unrolling (when not supported automatically by the compiler), or certain programming idioms, and it is supported by some source code editors in the form of snippets. - \href{https://en.wikipedia.org/wiki/Copy-and-paste_programming}{Wikipedia}
\end{quote}

\begin{itemize}
\item
  It's okay to copy and paste on your first attempt to solve a problem. But if you copy and paste three times (a.k.a. the \href{https://en.wikipedia.org/wiki/Rule_of_three_(computer_programming)}{Rule of Three} in programming), something's wrong. You're working too hard. You need to be lazy. What do I mean, and how can you do that?
\item
  The following exercise was inspired by \href{http://adv-r.had.co.nz/Functional-programming.html}{Wickham's example}.
\item
  Let's imagine \texttt{df} is a survey dataset.

  \begin{itemize}
  \item
    \texttt{a,\ b,\ c,\ d} = Survey questions
  \item
    \texttt{-99}: non-responses
  \item
    Your goal: replace \texttt{-99} with \texttt{NA}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Data}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility }

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}\StringTok{"a"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{),}
             \StringTok{"b"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{),}
             \StringTok{"c"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{),}
             \StringTok{"d"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{ , }\DataTypeTok{replace=} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Copy and paste }
\NormalTok{df}\OperatorTok{$}\NormalTok{a[df}\OperatorTok{$}\NormalTok{a }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{b[df}\OperatorTok{$}\NormalTok{b }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{c[df}\OperatorTok{$}\NormalTok{c }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{d[df}\OperatorTok{$}\NormalTok{d }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Using a function

  \begin{itemize}
  \tightlist
  \item
    function: input + computation + output
  \item
    If you write a function, you gain efficiency because you don't need to copy and paste the computation part.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a custom function}
\NormalTok{fix\_missing \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x) \{ }\CommentTok{\# INPUT}
\NormalTok{  x[x }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA} \CommentTok{\# COMPUTATION}
\NormalTok{  x }\CommentTok{\# OUTPUT }
\NormalTok{\}}

\CommentTok{\# Apply the function to each column (vector)}
\CommentTok{\# This iterated part can and should be automated.}
\NormalTok{df}\OperatorTok{$}\NormalTok{a \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{a)}
\NormalTok{df}\OperatorTok{$}\NormalTok{b \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{b)}
\NormalTok{df}\OperatorTok{$}\NormalTok{c \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{c)}
\NormalTok{df}\OperatorTok{$}\NormalTok{d \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{d)}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Automation

  \begin{itemize}
  \tightlist
  \item
    Many options for automation in R: \texttt{for\ loop}, \texttt{apply} family, etc.
  \item
    Here's a tidy solution that comes from the \texttt{purrr} package.
  \item
    The power and joy of a one-liner.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map\_df}\NormalTok{(df, fix\_missing) }\CommentTok{\# What is this magic? We will unpack the blackbox (\textasciigrave{}map\_df()\textasciigrave{}) later.}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Takeaways
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Your code becomes more reusable when it is easier to \textbf{change, debug, and scale up}. Don't repeat yourself and embrace the power of lazy programming.
\end{enumerate}

\begin{quote}
Lazy, because only lazy programmers will want to write the kind of tools that might replace them in the end. Lazy, because only a lazy programmer will avoid writing monotonous, repetitive code---thus avoiding redundancy, the enemy of software maintenance and flexible refactoring. Mostly, the tools and processes that come out of this endeavor fired by laziness will speed up the production. - \href{http://blogoscoped.com/archive/2005-08-24-n14.html}{Philipp Lenssen}
\end{quote}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Only when your code becomes \textbf{reusable} will you become \textbf{efficient} in your data work. Otherwise, you need to start from scratch or copy and paste whenever you work on a new project.
\end{enumerate}

\begin{quote}
Code reuse aims to save time and resources and reduce redundancy by taking advantage of assets that have already been created in some form within the software product development process. The key idea in reuse is that parts of a computer program written at one time can be or should be used in the construction of other programs written at a later time. - Wikipedia
\end{quote}

\hypertarget{test-your-code-systematically}{%
\subsection{Test your code systematically}\label{test-your-code-systematically}}

I strongly recommend switching from ad hoc testing to formal automated testing (i.e., unit testing).

\begin{quote}
Whenever you are tempted to type something into a print statement or a debugger expression, write it as a test instead. --- Martin Fowler, the author of \emph{Refactoring}
\end{quote}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(testthat)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"testthat"}\NormalTok{)}

\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(testthat)}

\KeywordTok{context}\NormalTok{(}\StringTok{"Variable check"}\NormalTok{)}

\KeywordTok{test\_that}\NormalTok{(}\StringTok{"Check whether instructor variable is setup correctly"}\NormalTok{, \{}
  
\NormalTok{  instructors \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Jae"}\NormalTok{, }\StringTok{"Nick"}\NormalTok{)}

  \KeywordTok{expect\_equal}\NormalTok{(}\KeywordTok{class}\NormalTok{(instructors), }\StringTok{"character"}\NormalTok{)}

\NormalTok{\}}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Test passed
\end{verbatim}

Inspired by an example in Hadley Wickham's \href{https://journal.r-project.org/archive/2011-1/RJournal_2011-1_Wickham.pdf}{R Journal paper} (2011).

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{context}\NormalTok{(}\StringTok{"Model check"}\NormalTok{)}

\KeywordTok{test\_that}\NormalTok{(}\StringTok{"Check whether the model is lm"}\NormalTok{, \{}
  
\NormalTok{  model \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(mpg }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{wt, }\DataTypeTok{data =}\NormalTok{ mtcars)}
  
  \CommentTok{\# Passes}
  \KeywordTok{expect\_that}\NormalTok{(model, }\KeywordTok{is\_a}\NormalTok{(}\StringTok{"lm"}\NormalTok{))}

  \CommentTok{\# Fails}
  \KeywordTok{expect\_that}\NormalTok{(model, }\KeywordTok{is\_a}\NormalTok{(}\StringTok{"glm"}\NormalTok{))}

\NormalTok{\})}
\end{Highlighting}
\end{Shaded}

\hypertarget{run-tests}{%
\subsubsection{Run tests}\label{run-tests}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{test\_file}\NormalTok{(}\KeywordTok{file.choose}\NormalTok{()) }\CommentTok{\# file }

\KeywordTok{test\_dir}\NormalTok{() }\CommentTok{\# directory}

\KeywordTok{auto\_test}\NormalTok{() }\CommentTok{\# the test code tested when you save the file }
\end{Highlighting}
\end{Shaded}

\hypertarget{asking-questions-minimal-reproducible-example}{%
\subsection{Asking questions: Minimal reproducible example}\label{asking-questions-minimal-reproducible-example}}

\begin{itemize}
\item
  Chances are you're going to use StackOverflow a lot to solve a pressing problem you face. However, others can't understand/be interested in your problem unless you provide an example that they can understand with minimal effort. Such an example is called a minimal reproducible example (MRE).
\item
  Read \href{https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example}{this StackOverflow post} to understand the concept and best practices.
\item
  Simply put, an MRE consists of the following items:

  \begin{itemize}
  \tightlist
  \item
    A minimal dataset
  \item
    The minimal runnable code
  \item
    The necessary information on packages, R version, and system (use \texttt{sessionInfo()})
  \item
    A seed for reproducibility (\texttt{set.seed()}), if you used a random process.
  \end{itemize}
\end{itemize}

In practice, use the \texttt{reprex} package to create the code component of the MRE.

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(reprex)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"reprex"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Copy the following code and type \texttt{reprex()} in the console.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gpa \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{, }\DecValTok{4}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\KeywordTok{mean}\NormalTok{(gpa)}
\end{Highlighting}
\end{Shaded}

\hypertarget{references-2}{%
\subsection{References}\label{references-2}}

\begin{itemize}
\item
  Writing code

  \begin{itemize}
  \tightlist
  \item
    Style guides

    \begin{itemize}
    \tightlist
    \item
      R

      \begin{itemize}
      \tightlist
      \item
        \href{https://google.GitHub.io/styleguide/Rguide.xml}{Google's R style guide}
      \item
        \href{http://r-pkgs.had.co.nz/r.html}{R code style guide} by Hadley Wickham
      \item
        \href{http://style.tidyverse.org/}{The tidyverse style guide} by Hadley Wickham
      \end{itemize}
    \item
      Python

      \begin{itemize}
      \tightlist
      \item
        \href{https://GitHub.com/google/styleguide/blob/gh-pages/pyguide.md}{Google Python Style Guide}
      \item
        \href{https://docs.python-guide.org/writing/style/\#zen-of-python}{Code Style} by the Hitchhiker's Guide to Python
      \end{itemize}
    \end{itemize}
  \end{itemize}
\end{itemize}

\hypertarget{tidy_data}{%
\chapter{Tidy data and its friends}\label{tidy_data}}

\hypertarget{setup-2}{%
\section{Setup}\label{setup-2}}

\begin{itemize}
\tightlist
\item
  Check that your \texttt{dplyr} package is up to date by typing \texttt{packageVersion("dplyr")}. If the currently installed version is lower than 1.0, then update by typing \texttt{update.packages("dplyr")}. You may need to restart R to make it work.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ifelse}\NormalTok{(}\KeywordTok{packageVersion}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{) }\OperatorTok{\textgreater{}=}\StringTok{ }\DecValTok{1}\NormalTok{,}
  \StringTok{"The installed version of dplyr package is greater than or equal to 1.0.0"}\NormalTok{, }\KeywordTok{update.packages}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "The installed version of dplyr package is greater than or equal to 1.0.0"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  tidyverse, }\CommentTok{\# the tidyverse framework}
\NormalTok{  skimr, }\CommentTok{\# skimming data}
\NormalTok{  here, }\CommentTok{\# computational reproducibility}
\NormalTok{  infer, }\CommentTok{\# statistical inference}
\NormalTok{  tidymodels, }\CommentTok{\# statistical modeling}
\NormalTok{  gapminder, }\CommentTok{\# toy data}
\NormalTok{  nycflights13, }\CommentTok{\# for exercise}
\NormalTok{  ggthemes, }\CommentTok{\# additional themes}
\NormalTok{  ggrepel, }\CommentTok{\# arranging ggplots}
\NormalTok{  patchwork, }\CommentTok{\# arranging ggplots}
\NormalTok{  broom, }\CommentTok{\# tidying model outputs}
\NormalTok{  waldo }\CommentTok{\# side{-}by{-}side code comparison}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{base-r-data-structure}{%
\section{Base R data structure}\label{base-r-data-structure}}

The rest of the chapter follows the basic structure in \href{https://rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf}{the Data Wrangling Cheat Sheet} created by RStudio.

To make the best use of the R language, you'll need a strong understanding of the basic data types and data structures and how to operate on those. R is an \textbf{object-oriented} language, so the importance of this cannot be overstated.

It is \textbf{critical} to understand because these are the objects you will manipulate on a day-to-day basis in R, and they are not always as easy to work with as they sound at the outset. Dealing with object conversions is one of the most common sources of frustration for beginners.

\begin{quote}
To understand computations in R, two slogans are helpful:

\begin{itemize}
\tightlist
\item
  Everything that exists is an object.
\item
  Everything that happens is a function call.
\end{itemize}
\end{quote}

\begin{quote}
--- \textbf{John Chambers}, the creator of S (the mother of R)
\end{quote}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  \protect\hyperlink{main-classes}{Main Classes} introduces you to R's one-dimensional or atomic classes and data structures. R has five basic atomic classes: logical, integer, numeric, complex, character. Social scientists don't use complex classes.
\item
  \protect\hyperlink{attributes}{Attributes} takes a small detour to discuss attributes, R's flexible metadata specification. Here, you'll learn about factors, an important data structure created by setting attributes of an atomic vector. R has many data structures: vector, list, matrix, data frame, factors, tables.
\end{enumerate}

\begin{figure}
\centering
\includegraphics{https://github.com/rstudio/concept-maps/raw/master/en/data-types.svg}
\caption{Concept map for data types. By Meghan Sposato, Brendan Cullen, Monica Alonso.}
\end{figure}

\hypertarget{d-data-vectors}{%
\subsection{1D data: Vectors}\label{d-data-vectors}}

\hypertarget{atomic-classes}{%
\subsubsection{Atomic classes}\label{atomic-classes}}

\texttt{R}'s main atomic classes are:

\begin{itemize}
\tightlist
\item
  character (or a ``string'' in Python and Stata)
\item
  numeric (integer or float)
\item
  integer (just integer)
\item
  logical (booleans)
\end{itemize}

\begin{longtable}[]{@{}ll@{}}
\toprule
Example & Type\tabularnewline
\midrule
\endhead
``a'', ``swc'' & character\tabularnewline
2, 15.5 & numeric\tabularnewline
\texttt{2L} (you must add an \texttt{L} at the end to denote an integer) & integer\tabularnewline
\texttt{TRUE}, \texttt{FALSE} & logical\tabularnewline
\bottomrule
\end{longtable}

Like Python, R is dynamically typed. There are a few differences in terminology, however, that are pertinent.

\begin{itemize}
\tightlist
\item
  First, ``types'' in Python are referred to as ``classes'' in R.
\end{itemize}

What is a class?

\begin{figure}
\centering
\includegraphics{https://ds055uzetaobb.cloudfront.net/brioche/uploads/pJZt3mh3Ht-prettycars.png?width=2400}
\caption{from \url{https://brilliant.org/}}
\end{figure}

\begin{itemize}
\tightlist
\item
  Second, R has different names for the types string, integer, and float --- specifically \textbf{character}, \textbf{integer} (not different), and \textbf{numeric}. Because there is no ``float'' class in R, users tend to default to the ``numeric'' class when working with numerical data.
\end{itemize}

The function for recovering object classes is \texttt{class()}. Use the \texttt{L} suffix to qualify any number with the intent of making it an explicit integer. See more from the \href{https://cran.r-project.org/doc/manuals/R-lang.html}{R language definition}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(}\DecValTok{3}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(3L)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "integer"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(}\StringTok{"Three"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "character"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(F)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "logical"
\end{verbatim}

\hypertarget{data-structures}{%
\subsection{Data structures}\label{data-structures}}

R's base data structures can be organized by their dimensionality (1d, 2d, or nd) and whether they're homogeneous (all contents must be of the same type) or heterogeneous (the contents can be of different types). This gives rise to the five data types most often used in data analysis:

\begin{longtable}[]{@{}lll@{}}
\toprule
& Homogeneous & Heterogeneous\tabularnewline
\midrule
\endhead
1d & Atomic vector & List\tabularnewline
2d & Matrix & Data frame\tabularnewline
nd & Array &\tabularnewline
\bottomrule
\end{longtable}

Each data structure has its specifications and behavior. For our purposes, an important thing to remember is that R is always \textbf{faster} (more efficient) working with homogeneous (\textbf{vectorized}) data.

\hypertarget{vector-properties}{%
\subsubsection{Vector properties}\label{vector-properties}}

Vectors have three common properties:

\begin{itemize}
\tightlist
\item
  Class, \texttt{class()}, or what type of object it is (same as \texttt{type()} in Python).
\item
  Length, \texttt{length()}, how many elements it contains (same as \texttt{len()} in Python).
\item
  Attributes, \texttt{attributes()}, additional arbitrary metadata.
\end{itemize}

Atomic vectors and lists differ in the types of their elements: all atomic vector elements must be the same type, whereas the elements of a list can have different types.

\hypertarget{creating-different-types-of-atomic-vectors}{%
\subsubsection{Creating different types of atomic vectors}\label{creating-different-types-of-atomic-vectors}}

Remember, there are four common types of vectors:

\begin{itemize}
\tightlist
\item
  \texttt{logical}
\item
  \texttt{integer}
\item
  \texttt{numeric} (same as \texttt{double})
\item
  \texttt{character}
\end{itemize}

You can create an empty vector with \texttt{vector()}. (By default, the mode is \texttt{logical}. You can be more explicit, as shown in the examples below.) It is more common to use direct constructors such as \texttt{character()}, \texttt{numeric()}, etc.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{vector}\NormalTok{()}

\CommentTok{\# with a length and type}
\KeywordTok{vector}\NormalTok{(}\StringTok{"character"}\NormalTok{, }\DataTypeTok{length =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "" "" "" "" "" "" "" "" "" ""
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\# character vector of length 5}
\KeywordTok{character}\NormalTok{(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "" "" "" "" ""
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{numeric}\NormalTok{(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0 0 0 0 0
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{logical}\NormalTok{(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE FALSE FALSE FALSE FALSE
\end{verbatim}

Atomic vectors are usually created with \texttt{c()}, which is short for concatenate:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{length}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\texttt{x} is a numeric vector. These are the most common kind. You can also have logical vectors.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\OtherTok{TRUE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{)}

\NormalTok{y}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1]  TRUE  TRUE FALSE FALSE
\end{verbatim}

Finally, you can have character vectors:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{kim\_family \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Jae"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{)}

\KeywordTok{is.integer}\NormalTok{(kim\_family) }\CommentTok{\# integer?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.character}\NormalTok{(kim\_family) }\CommentTok{\# character?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.atomic}\NormalTok{(kim\_family) }\CommentTok{\# atomic?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{typeof}\NormalTok{(kim\_family) }\CommentTok{\# what\textquotesingle{}s the type?}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "character"
\end{verbatim}

\textbf{Short exercise: Create and examine your vector}

Create a character vector called \texttt{fruit} containing 4 of your favorite fruits. Then evaluate its structure using the commands below.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# First, create your fruit vector}
\CommentTok{\# YOUR CODE HERE}
\NormalTok{fruit \textless{}{-}}

\CommentTok{\# Examine your vector}
\KeywordTok{length}\NormalTok{(fruit)}
\KeywordTok{class}\NormalTok{(fruit)}
\KeywordTok{str}\NormalTok{(fruit)}
\end{Highlighting}
\end{Shaded}

\textbf{Add elements}

You can add elements to the end of a vector by passing the original vector into the \texttt{c} function, like the following:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{z \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Beyonce"}\NormalTok{, }\StringTok{"Kelly"}\NormalTok{, }\StringTok{"Michelle"}\NormalTok{, }\StringTok{"LeToya"}\NormalTok{)}

\NormalTok{z \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(z, }\StringTok{"Farrah"}\NormalTok{)}

\NormalTok{z}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Beyonce"  "Kelly"    "Michelle" "LeToya"   "Farrah"
\end{verbatim}

More examples of vectors

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{0.5}\NormalTok{, }\FloatTok{0.7}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{, }\StringTok{"d"}\NormalTok{, }\StringTok{"e"}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{9}\OperatorTok{:}\DecValTok{100}
\end{Highlighting}
\end{Shaded}

You can also create vectors as a sequence of numbers:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{series \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\NormalTok{series}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1  2  3  4  5  6  7  8  9 10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{seq}\NormalTok{(}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1  2  3  4  5  6  7  8  9 10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{seq}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DataTypeTok{by =} \FloatTok{0.1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1.0  1.1  1.2  1.3  1.4  1.5  1.6  1.7  1.8  1.9  2.0  2.1  2.2  2.3  2.4
## [16]  2.5  2.6  2.7  2.8  2.9  3.0  3.1  3.2  3.3  3.4  3.5  3.6  3.7  3.8  3.9
## [31]  4.0  4.1  4.2  4.3  4.4  4.5  4.6  4.7  4.8  4.9  5.0  5.1  5.2  5.3  5.4
## [46]  5.5  5.6  5.7  5.8  5.9  6.0  6.1  6.2  6.3  6.4  6.5  6.6  6.7  6.8  6.9
## [61]  7.0  7.1  7.2  7.3  7.4  7.5  7.6  7.7  7.8  7.9  8.0  8.1  8.2  8.3  8.4
## [76]  8.5  8.6  8.7  8.8  8.9  9.0  9.1  9.2  9.3  9.4  9.5  9.6  9.7  9.8  9.9
## [91] 10.0
\end{verbatim}

Atomic vectors are always flat, even if you nest \texttt{c()}'s:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# the same as}
\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4
\end{verbatim}

\textbf{Types and Tests}

Given a vector, you can determine its class with \texttt{class}, or check if it's a specific type with an ``is'' function: \texttt{is.character()}, \texttt{is.numeric()}, \texttt{is.integer()}, \texttt{is.logical()}, or, more generally, \texttt{is.atomic()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{char\_var \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"harry"}\NormalTok{, }\StringTok{"sally"}\NormalTok{)}

\KeywordTok{class}\NormalTok{(char\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "character"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.character}\NormalTok{(char\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.atomic}\NormalTok{(char\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{num\_var \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\FloatTok{2.5}\NormalTok{, }\FloatTok{4.5}\NormalTok{)}

\KeywordTok{class}\NormalTok{(num\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.numeric}\NormalTok{(num\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.atomic}\NormalTok{(num\_var)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

NB: \texttt{is.vector()} does not test if an object is a vector. Instead, it returns \texttt{TRUE} only if the object is a vector with no attributes apart from names. Use \texttt{is.atomic(x)\ \textbar{}\textbar{}\ is.list(x)} to test if an object is actually a vector.

\textbf{Coercion}

All atomic vector elements must be the same type, so when you attempt to combine different types, they will be \textbf{coerced} to the \textbf{most flexible type.} Types from least to most flexible are: logical \textless{} integer \textless{} double \textless{} character.

For example, combining a character and an integer yields a character:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  chr [1:2] "a" "1"
\end{verbatim}

\textbf{Guess what the following do without running them first}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{c}\NormalTok{(}\FloatTok{1.7}\NormalTok{, }\StringTok{"a"}\NormalTok{)}

\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\DecValTok{2}\NormalTok{)}

\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Notice that when a logical vector is coerced to an integer or double, \texttt{TRUE} becomes 1, and \texttt{FALSE} becomes 0. This is very useful in conjunction with \texttt{sum()} and \texttt{mean()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{FALSE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{, }\OtherTok{TRUE}\NormalTok{)}

\KeywordTok{as.numeric}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0 0 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Total number of TRUEs}
\KeywordTok{sum}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Proportion that is TRUE}
\KeywordTok{mean}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0.3333333
\end{verbatim}

Coercion often happens automatically. This is called implicit coercion. Most mathematical functions (\texttt{+}, \texttt{log}, \texttt{abs}, etc.) will coerce to a numeric or integer, and most logical operations (\texttt{\&}, \texttt{\textbar{}}, \texttt{any}, etc.) will coerce to a logical. You will usually get a warning message if the coercion might lose information.

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{1} \OperatorTok{\textless{}}\StringTok{ "2"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\StringTok{"1"} \OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

You can also coerce vectors explicitly with \texttt{as.character()}, \texttt{as.numeric()}, \texttt{as.integer()}, or \texttt{as.logical()}. Example:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{0}\OperatorTok{:}\DecValTok{6}

\KeywordTok{as.numeric}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0 1 2 3 4 5 6
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{as.logical}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{as.character}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "0" "1" "2" "3" "4" "5" "6"
\end{verbatim}

Sometimes coercions, especially nonsensical ones, won't work.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}

\KeywordTok{as.numeric}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: NAs introduced by coercion
\end{verbatim}

\begin{verbatim}
## [1] NA NA NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{as.logical}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] NA NA NA
\end{verbatim}

\textbf{Short Exercise}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 1. Create a vector of a sequence of numbers from 1 to 10.}

\CommentTok{\# 2. Coerce that vector into a character vector}

\CommentTok{\# 3. Add the element "11" to the end of the vector}

\CommentTok{\# 4. Coerce it back to a numeric vector.}
\end{Highlighting}
\end{Shaded}

\hypertarget{lists}{%
\subsubsection{Lists}\label{lists}}

Lists are also vectors, but different from atomic vectors because their elements can be of any type. In short, they are generic vectors. You construct lists by using \texttt{list()} instead of \texttt{c()}.

Lists are sometimes called recursive vectors, because a list can contain other lists. This makes them fundamentally different from atomic vectors:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\StringTok{"a"}\NormalTok{, }\OtherTok{TRUE}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{4}\NormalTok{, }\DecValTok{5}\NormalTok{, }\DecValTok{6}\NormalTok{))}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] 1
## 
## [[2]]
## [1] "a"
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] 4 5 6
\end{verbatim}

You can coerce other objects using \texttt{as.list()}. You can test for a list with \texttt{is.list()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{as.list}\NormalTok{(x)}

\KeywordTok{is.list}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{length}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 10
\end{verbatim}

\texttt{c()} will combine several lists into one. If given a combination of atomic vectors and lists, \texttt{c()} (con\textbf{c}atenate) will coerce the vectors to lists before combining them. Compare the results of \texttt{list()} and \texttt{c()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{))}

\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{))}

\KeywordTok{str}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## List of 2
##  $ :List of 2
##   ..$ : num 1
##   ..$ : num 2
##  $ : num [1:2] 3 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## List of 4
##  $ : num 1
##  $ : num 2
##  $ : num 3
##  $ : num 4
\end{verbatim}

You can turn a list into an atomic vector with \texttt{unlist()}. If the elements of a list have different types, \texttt{unlist()} uses the same coercion rules as \texttt{c()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{))}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [[1]][[1]]
## [1] 1
## 
## [[1]][[2]]
## [1] 2
## 
## 
## [[2]]
## [1] 3 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{unlist}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4
\end{verbatim}

Lists are used to build up many of the more complicated data structures in R. For example, both data frames and linear model objects (as produced by \texttt{lm()}) are lists:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.list}\NormalTok{(mtcars)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mod \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(mpg }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{wt, }\DataTypeTok{data =}\NormalTok{ mtcars)}

\KeywordTok{is.list}\NormalTok{(mod)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

For this reason, lists are handy inside functions. You can ``staple'' together many different kinds of results into a single object that a function can return.

A list does not print to the console like a vector. Instead, each element of the list starts on a new line.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x.vec \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{x.list \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{x.vec}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x.list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3
\end{verbatim}

For lists, elements are \textbf{indexed by double brackets}. Single brackets will still return a(nother) list. (We'll talk more about subsetting and indexing in the fourth lesson.)

\textbf{Exercises}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  What are the four basic types of atomic vectors? How does a list differ from an atomic vector?
\item
  Why is \texttt{1\ ==\ "1"} true? Why is \texttt{-1\ \textless{}\ FALSE} true? Why is \texttt{"one"\ \textless{}\ 2} false?
\item
  Create three vectors and then combine them into a list.
\item
  If \texttt{x} is a list, what is the class of \texttt{x{[}1{]}}? How about \texttt{x{[}{[}1{]}{]}}?
\end{enumerate}

\hypertarget{attributes}{%
\subsection{Attributes}\label{attributes}}

Attributes provide additional information about the data to you, the user, and to R. We've already seen the following three attributes in action:

\begin{itemize}
\item
  Names (\texttt{names(x)}), a character vector giving each element a name.
\item
  Dimensions (\texttt{dim(x)}), used to turn vectors into matrices.
\item
  Class (\texttt{class(x)}), used to implement the S3 object system.
\end{itemize}

\textbf{Additional tips}

In an object-oriented system, a \href{https://www.google.com/search?q=what+is+class+programming\&oq=what+is+class+programming\&aqs=chrome.0.0l6.3543j0j4\&sourceid=chrome\&ie=UTF-8}{class} (an extensible program-code template) defines a type of object: what its properties are, how it behaves, and how it relates to other types of objects. Therefore, technically, an object is an \href{https://en.wikipedia.org/wiki/Instance_(computer_science)}{instance} (or occurrence) of a class. A method is a function associated with a particular type of object.

\hypertarget{names}{%
\subsubsection{Names}\label{names}}

You can name a vector when you create it:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DataTypeTok{a =} \DecValTok{1}\NormalTok{, }\DataTypeTok{b =} \DecValTok{2}\NormalTok{, }\DataTypeTok{c =} \DecValTok{3}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

You can also modify an existing vector:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}

\KeywordTok{names}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## NULL
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{names}\NormalTok{(x) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"e"}\NormalTok{, }\StringTok{"f"}\NormalTok{, }\StringTok{"g"}\NormalTok{)}

\KeywordTok{names}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "e" "f" "g"
\end{verbatim}

Names don't have to be unique. However, character subsetting, described in the next lesson, is the most important reason to use names, and it is most useful when the names are unique. (For Python users: when names are unique, a named vector behaves much like a Python dictionary, with the names serving as keys.)

Not all elements of a vector need to have a name. If some names are missing, \texttt{names()} will return an empty string for those elements. If all names are missing, \texttt{names()} will return \texttt{NULL}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DataTypeTok{a =} \DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}

\KeywordTok{names}\NormalTok{(y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "a" ""  ""
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{z \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}

\KeywordTok{names}\NormalTok{(z)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## NULL
\end{verbatim}

You can create a new vector without names using \texttt{unname(x)}, or remove names in place with \texttt{names(x)\ \textless{}-\ NULL}.

\hypertarget{factors}{%
\subsubsection{Factors}\label{factors}}

Factors are special vectors that represent categorical data. Factors can be ordered (ordinal variable) or unordered (nominal or categorical variable) and are important for modeling functions such as \texttt{lm()} and \texttt{glm()} and also in plot methods.

\textbf{Quiz}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  If you want to enter dummy variables (Democrats = 1, Non-democrats = 0) in your regression model, should you use a numeric or factor variable?
\end{enumerate}

Factors can only contain pre-defined values. Set allowed values using the \texttt{levels()} attribute. Note that a factor's levels will always be character values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{)}

\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{))}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] a b b a
## Levels: a b
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "factor"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{levels}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "a" "b"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# You can\textquotesingle{}t use values that are not in the levels}
\NormalTok{x[}\DecValTok{2}\NormalTok{] \textless{}{-}}\StringTok{ "c"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in `[<-.factor`(`*tmp*`, 2, value = "c"): invalid factor level, NA
## generated
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# NB: c() combines factors and their levels (since R 4.1.0)}
\KeywordTok{c}\NormalTok{(}\KeywordTok{factor}\NormalTok{(}\StringTok{"a"}\NormalTok{), }\KeywordTok{factor}\NormalTok{(}\StringTok{"b"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] a b
## Levels: a b
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rep}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{5}\NormalTok{, }\KeywordTok{rep}\NormalTok{(}\DecValTok{6}\NormalTok{, }\DecValTok{5}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 5 5 5 5 5 5
\end{verbatim}

Factors are pretty much integers that have labels on them. Underneath, it's really numbers (1, 2, 3\ldots).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{))}

\KeywordTok{str}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  Factor w/ 2 levels "a","b": 1 2 2 1
\end{verbatim}

They are better than using simple integer labels because factors are what are called self-describing. For example, \texttt{democrat} and \texttt{republican} are more descriptive than \texttt{1}s and \texttt{2}s.

Factors are useful when you know the possible values a variable may take, even if you don't see all values in a given dataset. Using a factor instead of a character vector makes it obvious when some groups contain no observations:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{party\_char \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"democrat"}\NormalTok{, }\StringTok{"democrat"}\NormalTok{, }\StringTok{"democrat"}\NormalTok{)}

\NormalTok{party\_char}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "democrat" "democrat" "democrat"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{party\_factor \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(party\_char, }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"democrat"}\NormalTok{, }\StringTok{"republican"}\NormalTok{))}

\NormalTok{party\_factor}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] democrat democrat democrat
## Levels: democrat republican
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{table}\NormalTok{(party\_char) }\CommentTok{\# shows only democrats}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## party_char
## democrat 
##        3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{table}\NormalTok{(party\_factor) }\CommentTok{\# shows republicans too}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## party_factor
##   democrat republican 
##          3          0
\end{verbatim}

Sometimes factors can be left unordered. Example: \texttt{democrat}, \texttt{republican}.

Other times you might want factors to be ordered (or ranked). Example: \texttt{low}, \texttt{medium}, \texttt{high}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"low"}\NormalTok{, }\StringTok{"medium"}\NormalTok{, }\StringTok{"high"}\NormalTok{))}

\KeywordTok{str}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  Factor w/ 3 levels "high","low","medium": 2 3 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.ordered}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{ordered}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"low"}\NormalTok{, }\StringTok{"medium"}\NormalTok{, }\StringTok{"high"}\NormalTok{), }\DataTypeTok{levels =} \KeywordTok{c}\NormalTok{(}\StringTok{"high"}\NormalTok{, }\StringTok{"medium"}\NormalTok{, }\StringTok{"low"}\NormalTok{))}

\KeywordTok{is.ordered}\NormalTok{(y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

While factors look (and often behave) like character vectors, they are integers. So be careful when treating them like strings. Some string methods (like \texttt{gsub()} and \texttt{grepl()}) will coerce factors to strings, while others (like \texttt{nchar()}) will throw an error, and still others (like \texttt{c()}) will use the underlying integer values.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"a"}\NormalTok{)}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "a" "b" "b" "a"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.factor}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{as.factor}\NormalTok{(x)}

\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] a b b a
## Levels: a b
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{c}\NormalTok{(x, }\StringTok{"c"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "1" "2" "2" "1" "c"
\end{verbatim}

For this reason, it's usually best to explicitly convert factors to character vectors if you need string-like behavior. There was a memory advantage to using factors instead of character vectors in early versions of R, but this is no longer the case.

Unfortunately, in versions of R before 4.0.0, most data loading functions automatically convert character vectors to factors. This is suboptimal, because there's no way for those functions to know the set of all possible levels or their optimal order. If this becomes a problem, use the argument \texttt{stringsAsFactors\ =\ FALSE} to suppress this behavior, and then manually convert character vectors to factors using your knowledge of the data.

\textbf{More attributes}

All R objects can have arbitrary additional attributes used to store metadata about the object. Attributes can be considered a named list (with unique names). Attributes can be accessed individually with \texttt{attr()} or all at once (as a list) with \texttt{attributes()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{y \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\KeywordTok{attr}\NormalTok{(y, }\StringTok{"my\_attribute"}\NormalTok{) \textless{}{-}}\StringTok{ "This is a vector"}

\KeywordTok{attr}\NormalTok{(y, }\StringTok{"my\_attribute"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "This is a vector"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# attributes() returns all attributes as a list; str() displays its structure}
\KeywordTok{str}\NormalTok{(}\KeywordTok{attributes}\NormalTok{(y))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## List of 1
##  $ my_attribute: chr "This is a vector"
\end{verbatim}

\textbf{Exercises}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  What happens to a factor when you modify its levels?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{f1 \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(letters)}

\KeywordTok{levels}\NormalTok{(f1) \textless{}{-}}\StringTok{ }\KeywordTok{rev}\NormalTok{(}\KeywordTok{levels}\NormalTok{(f1))}

\NormalTok{f1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] z y x w v u t s r q p o n m l k j i h g f e d c b a
## Levels: z y x w v u t s r q p o n m l k j i h g f e d c b a
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  What does this code do? How do \texttt{f2} and \texttt{f3} differ from \texttt{f1}?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{f2 \textless{}{-}}\StringTok{ }\KeywordTok{rev}\NormalTok{(}\KeywordTok{factor}\NormalTok{(letters))}

\NormalTok{f3 \textless{}{-}}\StringTok{ }\KeywordTok{factor}\NormalTok{(letters, }\DataTypeTok{levels =} \KeywordTok{rev}\NormalTok{(letters))}
\end{Highlighting}
\end{Shaded}

\hypertarget{d-data-matrices-and-dataframes}{%
\subsection{2D data: Matrices and dataframes}\label{d-data-matrices-and-dataframes}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Matrices: data structures for storing 2d data that is all the same class.
\item
  Dataframes: the most important data structure for storing data in R, because a dataframe can store different kinds of (2d) data.
\end{enumerate}

\hypertarget{matrices}{%
\subsubsection{Matrices}\label{matrices}}

Matrices are created when we combine multiple vectors with the same class (e.g., numeric). This creates a dataset with rows and columns. By definition, if you want to combine multiple classes of vectors, you want a dataframe. You can coerce a matrix to become a dataframe and vice-versa, but as with all vector coercions, the results can be unpredictable, so be sure you know how each variable (column) will convert.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\DataTypeTok{nrow =} \DecValTok{2}\NormalTok{, }\DataTypeTok{ncol =} \DecValTok{2}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2]
## [1,]   NA   NA
## [2,]   NA   NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dim}\NormalTok{(m)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2 2
\end{verbatim}

Matrices are filled column-wise.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{6}\NormalTok{, }\DataTypeTok{nrow =} \DecValTok{2}\NormalTok{, }\DataTypeTok{ncol =} \DecValTok{3}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
\end{verbatim}

Other ways to construct a matrix:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\KeywordTok{dim}\NormalTok{(m) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\DecValTok{5}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dim}\NormalTok{(m) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{5}\NormalTok{, }\DecValTok{2}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
\end{verbatim}

You can transpose a matrix (or dataframe) with \texttt{t()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{m \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{10}

\KeywordTok{dim}\NormalTok{(m) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\DecValTok{5}\NormalTok{)}

\NormalTok{m}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{t}\NormalTok{(m)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
## [3,]    5    6
## [4,]    7    8
## [5,]    9   10
\end{verbatim}

Another way is to bind columns or rows using \texttt{cbind()} and \texttt{rbind()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}

\NormalTok{y \textless{}{-}}\StringTok{ }\DecValTok{10}\OperatorTok{:}\DecValTok{12}

\KeywordTok{cbind}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      x  y
## [1,] 1 10
## [2,] 2 11
## [3,] 3 12
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# or}

\KeywordTok{rbind}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   [,1] [,2] [,3]
## x    1    2    3
## y   10   11   12
\end{verbatim}

You can also use the \texttt{byrow} argument to specify how the matrix is filled. From R's own documentation:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mdat \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{11}\NormalTok{, }\DecValTok{12}\NormalTok{, }\DecValTok{13}\NormalTok{),}
  \DataTypeTok{nrow =} \DecValTok{2}\NormalTok{,}
  \DataTypeTok{ncol =} \DecValTok{3}\NormalTok{,}
  \DataTypeTok{byrow =} \OtherTok{TRUE}\NormalTok{,}
  \DataTypeTok{dimnames =} \KeywordTok{list}\NormalTok{(}
    \KeywordTok{c}\NormalTok{(}\StringTok{"row1"}\NormalTok{, }\StringTok{"row2"}\NormalTok{),}
    \KeywordTok{c}\NormalTok{(}\StringTok{"C.1"}\NormalTok{, }\StringTok{"C.2"}\NormalTok{, }\StringTok{"C.3"}\NormalTok{)}
\NormalTok{  )}
\NormalTok{)}
\NormalTok{mdat}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      C.1 C.2 C.3
## row1   1   2   3
## row2  11  12  13
\end{verbatim}

Notice that we gave \texttt{names} to the dimensions in \texttt{mdat}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dimnames}\NormalTok{(mdat)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "row1" "row2"
## 
## [[2]]
## [1] "C.1" "C.2" "C.3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rownames}\NormalTok{(mdat)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "row1" "row2"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{colnames}\NormalTok{(mdat)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "C.1" "C.2" "C.3"
\end{verbatim}

\hypertarget{dataframes}{%
\subsubsection{Dataframes}\label{dataframes}}

A data frame is an essential data type in R. It's pretty much the \textbf{de facto} data structure for most tabular data and what we use for statistics.

\hypertarget{creation}{%
\paragraph{Creation}\label{creation}}

You create a data frame using \texttt{data.frame()}, which takes named vectors as input:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   vec1 vec2
## 1    1    a
## 2    2    b
## 3    3    c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    3 obs. of  2 variables:
##  $ vec1: int  1 2 3
##  $ vec2: chr  "a" "b" "c"
\end{verbatim}

Beware: before R 4.0.0, \texttt{data.frame()}'s default behavior was to turn strings into factors. On those versions, remember to use \texttt{stringsAsFactors\ =\ FALSE} to suppress this behavior as needed:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{,}
  \DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{),}
  \DataTypeTok{stringsAsFactors =} \OtherTok{FALSE}
\NormalTok{)}
\KeywordTok{str}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    3 obs. of  2 variables:
##  $ x: int  1 2 3
##  $ y: chr  "a" "b" "c"
\end{verbatim}

In reality, we rarely type up our datasets ourselves, and certainly not in R. The most common way to make a data.frame is by reading in a file using \texttt{read.csv}, \texttt{read.dta} (if you're using a Stata file; this relies on the \texttt{foreign} package), or some other kind of data file input.

\hypertarget{structure-and-attributes}{%
\paragraph{Structure and Attributes}\label{structure-and-attributes}}

Under the hood, a data frame is a list of equal-length vectors. This makes it a 2-dimensional structure, so it shares properties of both the matrix and the list.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}

\KeywordTok{str}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    3 obs. of  2 variables:
##  $ vec1: int  1 2 3
##  $ vec2: chr  "a" "b" "c"
\end{verbatim}

This means that a dataframe has \texttt{names()}, \texttt{colnames()}, and \texttt{rownames()}, although \texttt{names()} and \texttt{colnames()} are the same thing.

\textbf{Summary}

\begin{itemize}
\tightlist
\item
  Set column names: \texttt{names()} in data frame, \texttt{colnames()} in matrix
\item
  Set row names: \texttt{row.names()} in data frame, \texttt{rownames()} in matrix
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}

\CommentTok{\# these two are equivalent}
\KeywordTok{names}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "vec1" "vec2"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{colnames}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "vec1" "vec2"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# change the colnames}
\KeywordTok{colnames}\NormalTok{(df) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Number"}\NormalTok{, }\StringTok{"Character"}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   Number Character
## 1      1         a
## 2      2         b
## 3      3         c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{names}\NormalTok{(df) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Number"}\NormalTok{, }\StringTok{"Character"}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   Number Character
## 1      1         a
## 2      2         b
## 3      3         c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# change the rownames}
\KeywordTok{rownames}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "1" "2" "3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rownames}\NormalTok{(df) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"donut"}\NormalTok{, }\StringTok{"pickle"}\NormalTok{, }\StringTok{"pretzel"}\NormalTok{)}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##         Number Character
## donut        1         a
## pickle       2         b
## pretzel      3         c
\end{verbatim}

The \texttt{length()} of a dataframe is the length of the underlying list and so is the same as \texttt{ncol()}; \texttt{nrow()} gives the number of rows.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec1 \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{3}
\NormalTok{vec2 \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{)}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(vec1, vec2)}

\CommentTok{\# these two are equivalent {-} number of columns}
\KeywordTok{length}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ncol}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# get number of rows}
\KeywordTok{nrow}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# get number of both columns and rows}
\KeywordTok{dim}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3 2
\end{verbatim}

\hypertarget{testing-and-coercion}{%
\paragraph{Testing and coercion}\label{testing-and-coercion}}

To check if an object is a dataframe, use \texttt{class()} or test explicitly with \texttt{is.data.frame()}:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "data.frame"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{is.data.frame}\NormalTok{(df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] TRUE
\end{verbatim}

You can coerce an object to a dataframe with \texttt{as.data.frame()}:

\begin{itemize}
\item
  A vector will create a one-column dataframe.
\item
  A list will create one column for each element; it's an error if they're
  not all the same length.
\item
  A matrix will create a data frame with the same number of columns and rows as the matrix.
\end{itemize}

\hypertarget{combining-dataframes}{%
\paragraph{Combining dataframes}\label{combining-dataframes}}

You can combine dataframes using \texttt{cbind()} and \texttt{rbind()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{,}
  \DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{),}
  \DataTypeTok{stringsAsFactors =} \OtherTok{FALSE}
\NormalTok{)}

\KeywordTok{cbind}\NormalTok{(df, }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{z =} \DecValTok{3}\OperatorTok{:}\DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 1 1 a 3
## 2 2 b 2
## 3 3 c 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{rbind}\NormalTok{(df, }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{x =} \DecValTok{10}\NormalTok{, }\DataTypeTok{y =} \StringTok{"z"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    x y
## 1  1 a
## 2  2 b
## 3  3 c
## 4 10 z
\end{verbatim}

When combining column-wise, the number of rows must match, but row names are ignored. When combining row-wise, both the number and names of columns must match. (If you want to combine rows that don't have the same columns, other functions/packages in R can help.)

It's a common mistake to try and create a dataframe by \texttt{cbind()}ing vectors together. This doesn't work because \texttt{cbind()} will create a matrix unless one of the arguments is already a dataframe. Instead use \texttt{data.frame()} directly:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{bad \textless{}{-}}\StringTok{ }\NormalTok{(}\KeywordTok{cbind}\NormalTok{(}\DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{)))}
\NormalTok{bad}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      x   y  
## [1,] "1" "a"
## [2,] "2" "b"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(bad)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  chr [1:2, 1:2] "1" "2" "a" "b"
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "x" "y"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{good \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{x =} \DecValTok{1}\OperatorTok{:}\DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{),}
  \DataTypeTok{stringsAsFactors =} \OtherTok{FALSE}
\NormalTok{)}
\NormalTok{good}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y
## 1 1 a
## 2 2 b
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{str}\NormalTok{(good)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 'data.frame':    2 obs. of  2 variables:
##  $ x: int  1 2
##  $ y: chr  "a" "b"
\end{verbatim}

The conversion rules for \texttt{cbind()} are complicated and best avoided by ensuring all inputs are of the same type.

\textbf{Other objects}

Missing values are specified with \texttt{NA}, which is a logical vector of length 1. \texttt{NA} will always be coerced to the correct type if used inside \texttt{c()}:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\OtherTok{NA}\NormalTok{, }\DecValTok{1}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] NA  1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{typeof}\NormalTok{(}\OtherTok{NA}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "logical"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{typeof}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "double"
\end{verbatim}

\texttt{Inf} is infinity. You can have either positive or negative infinity.

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{1} \OperatorTok{/}\StringTok{ }\DecValTok{0}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] Inf
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{1} \OperatorTok{/}\StringTok{ }\OtherTok{Inf}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 0
\end{verbatim}

\texttt{NaN} means Not a number. It's an undefined value.

\begin{Shaded}
\begin{Highlighting}[]
\DecValTok{0} \OperatorTok{/}\StringTok{ }\DecValTok{0}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] NaN
\end{verbatim}

\hypertarget{subset}{%
\subsection{Subset}\label{subset}}

When working with data, you'll need to subset objects early and often. Luckily, R's subsetting operators are powerful and fast. Mastery of subsetting allows you to succinctly express complex operations in a way that few other languages can match. Subsetting is hard to learn because you need to master several interrelated concepts:

\begin{itemize}
\item
  The three subsetting operators, \texttt{{[}}, \texttt{{[}{[}}, and \texttt{\$}.
\item
  Important differences in behavior for different objects (e.g., vectors, lists, factors, matrices, and data frames).
\item
  The use of subsetting in conjunction with assignment.
\end{itemize}

This unit helps you master subsetting by starting with the simplest type of subsetting: subsetting an atomic vector with \texttt{{[}}. It then gradually extends your knowledge to more complicated data types (like dataframes and lists) and then to the other subsetting operators, \texttt{{[}{[}} and \texttt{\$}. You'll then learn how subsetting and assignment can be combined to modify parts of an object, and, finally, you'll see a large number of useful applications.

\hypertarget{atomic-vectors}{%
\subsubsection{Atomic vectors}\label{atomic-vectors}}

Let's explore the different types of subsetting with a simple vector, \texttt{x}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Note that the number after the decimal point gives the original position in the vector.

\textbf{NB:} In R, positions start at 1, unlike Python, which starts at 0. Fun!

There are five things that you can use to subset a vector:

\hypertarget{positive-integers}{%
\paragraph{Positive integers}\label{positive-integers}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{1}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3.3 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# \textasciigrave{}order(x)\textasciigrave{} gives the positions of smallest to largest values.}
\KeywordTok{order}\NormalTok{(x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 3 2 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{order}\NormalTok{(x)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 3.3 4.2 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{4}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 3.3 4.2 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Duplicated indices yield duplicated values}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 2.1
\end{verbatim}

\hypertarget{negative-integers}{%
\paragraph{Negative integers}\label{negative-integers}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\NormalTok{x[}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{3}\NormalTok{, }\DecValTok{1}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 5.4
\end{verbatim}

You can't mix positive and negative integers in a single subset:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Error in x[c(-1, 2)]: only 0's may be mixed with negative subscripts
\end{verbatim}

\hypertarget{logical-vectors}{%
\paragraph{Logical vectors}\label{logical-vectors}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\OtherTok{TRUE}\NormalTok{, }\OtherTok{TRUE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{, }\OtherTok{FALSE}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2
\end{verbatim}

This is probably the most useful type of subsetting because you write the expression that creates the logical vector.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\CommentTok{\# this returns a logical vector}
\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE  TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# use a conditional statement to create an implicit logical vector}
\NormalTok{x[x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3 5.4
\end{verbatim}

You can combine conditional statements with \texttt{\&} (and), \texttt{\textbar{}} (or), and \texttt{!} (not):

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\CommentTok{\# combining two conditional statements with \&}
\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3} \OperatorTok{\&}\StringTok{ }\NormalTok{x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{5}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{3} \OperatorTok{\&}\StringTok{ }\NormalTok{x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{5}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# combining two conditional statements with |}
\NormalTok{x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{3} \OperatorTok{|}\StringTok{ }\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1]  TRUE FALSE FALSE  TRUE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{3} \OperatorTok{|}\StringTok{ }\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# combining conditional statements with !}
\OperatorTok{!}\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1]  TRUE  TRUE  TRUE FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\OperatorTok{!}\NormalTok{x }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{5}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3
\end{verbatim}

Another way to generate implicit logical vectors is using the \texttt{\%in\%} operator, which works like the \texttt{in} keyword in Python.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# generate implicit logical vectors through the \%in\% operator}
\NormalTok{x }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{3.3}\NormalTok{, }\FloatTok{4.2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] FALSE  TRUE  TRUE FALSE
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[x }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{3.3}\NormalTok{, }\FloatTok{4.2}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4.2 3.3
\end{verbatim}

\hypertarget{character-vectors}{%
\paragraph{Character vectors}\label{character-vectors}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\FloatTok{2.1}\NormalTok{, }\FloatTok{4.2}\NormalTok{, }\FloatTok{3.3}\NormalTok{, }\FloatTok{5.4}\NormalTok{)}

\CommentTok{\# apply names}
\KeywordTok{names}\NormalTok{(x) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"b"}\NormalTok{, }\StringTok{"c"}\NormalTok{, }\StringTok{"d"}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   a   b   c   d 
## 2.1 4.2 3.3 5.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# subset using names}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\StringTok{"d"}\NormalTok{, }\StringTok{"c"}\NormalTok{, }\StringTok{"a"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   d   c   a 
## 5.4 3.3 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Like integer indices, you can repeat indices}
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"a"}\NormalTok{, }\StringTok{"a"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   a   a   a 
## 2.1 2.1 2.1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Careful! names are always matched exactly}
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DataTypeTok{abc =} \DecValTok{1}\NormalTok{, }\DataTypeTok{def =} \DecValTok{2}\NormalTok{)}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## abc def 
##   1   2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\StringTok{"a"}\NormalTok{, }\StringTok{"d"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## <NA> <NA> 
##   NA   NA
\end{verbatim}

\hypertarget{more-on-string-operations}{%
\subparagraph{More on string operations}\label{more-on-string-operations}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{firstName \textless{}{-}}\StringTok{ "Jae Yeon"}
\NormalTok{lastName \textless{}{-}}\StringTok{ "Kim"}
\end{Highlighting}
\end{Shaded}

Unlike in Python, R does not have a reserved operator for string concatenation such as \texttt{+}. Furthermore, using the usual concatenation operator \texttt{c()} on two or more character strings will not create a single character string, but rather a \textbf{vector} of character strings.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(firstName, lastName)}

\KeywordTok{print}\NormalTok{(fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon" "Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{length}\NormalTok{(fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2
\end{verbatim}

To combine two or more character strings into one larger character string, we use the \texttt{paste()} function. This function takes character strings or vectors and collapses their values into a single character string, with each value separated by a character string selected by the user.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName)}

\KeywordTok{print}\NormalTok{(fullName)}

\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName, }\DataTypeTok{sep =} \StringTok{"+"}\NormalTok{)}

\KeywordTok{print}\NormalTok{(fullName)}

\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName, }\DataTypeTok{sep =} \StringTok{"\_\_\_"}\NormalTok{)}
\KeywordTok{print}\NormalTok{(fullName)}
\end{Highlighting}
\end{Shaded}

As with Python, R can also extract substrings based on the index position of its characters. There are, however, three critical differences. First, \textbf{index positions in R start at 1}. This is in contrast to Python, where indexing begins at 0.

Second, \textbf{object subsets using index positions in R contain all the elements in the specified range}. If some object called \texttt{data} contains five elements, \texttt{data{[}2:4{]}} will return the elements at the second, third, and fourth positions. By contrast, the same subset in Python would return the objects at the third and fourth positions (or second and third positions, depending upon whether your index starts at 0 or 1).

Third, \textbf{R does not allow indexing of character strings}. Instead, you must use the \texttt{substr()} function. Note that this function must receive both the \texttt{start} and \texttt{stop} arguments. So if you want to get all the characters between some index and the end of the string, you must use the \texttt{nchar()} function, which will tell you the length of a character string.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(firstName, lastName)}

\CommentTok{\# this won\textquotesingle{}t work like in Python}
\NormalTok{fullName[}\DecValTok{1}\NormalTok{] }\CommentTok{\# R sees the string as a unitary object {-} it can\textquotesingle{}t be indexed this way}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fullName[}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon Kim" NA             NA             NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# So use this instead}
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{1}\NormalTok{, }\DataTypeTok{stop =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Ja"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{5}\NormalTok{, }\DataTypeTok{stop =} \DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Y"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{1}\NormalTok{, }\DataTypeTok{stop =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon K"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{substr}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{start =} \DecValTok{11}\NormalTok{, }\DataTypeTok{stop =} \KeywordTok{nchar}\NormalTok{(fullName))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "im"
\end{verbatim}

Like Python, R has a number of string methods, though these exist as individual rather than ``mix-and-match'' functions. For example:

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{toupper}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "JAE YEON KIM"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tolower}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "jae yeon kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{strsplit}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{split =} \StringTok{" "}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "Jae"  "Yeon" "Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{strsplit}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ fullName, }\DataTypeTok{split =} \StringTok{"n"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "Jae Yeo" " Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{gsub}\NormalTok{(}\DataTypeTok{pattern =} \StringTok{"Kim"}\NormalTok{, }\DataTypeTok{replacement =} \StringTok{"Choi"}\NormalTok{, }\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Jae Yeon Choi"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{gsub}\NormalTok{(}\DataTypeTok{pattern =} \StringTok{"Jae Yeon"}\NormalTok{, }\DataTypeTok{replacement =} \StringTok{"Danny"}\NormalTok{, }\DataTypeTok{x =}\NormalTok{ fullName)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Danny Kim"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Note the importance of cases! This doesn\textquotesingle{}t throw an error, so you won\textquotesingle{}t realize your function didn\textquotesingle{}t work unless you double{-}check several entries.}

\KeywordTok{gsub}\NormalTok{(}\DataTypeTok{pattern =} \StringTok{" "}\NormalTok{, }\DataTypeTok{replacement =} \StringTok{""}\NormalTok{, }\DataTypeTok{x =}\NormalTok{ fullName) }\CommentTok{\# The same function is used for replacements and stripping}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "JaeYeonKim"
\end{verbatim}

\hypertarget{lists-1}{%
\subsubsection{Lists}\label{lists-1}}

Subsetting a list works in the same way as subsetting an atomic vector. Using \texttt{{[}} will always return a list; \texttt{{[}{[}} and \texttt{\$}, as described below, let you pull out the list's components.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\StringTok{"a"}\NormalTok{ =}\StringTok{ }\DecValTok{1}\NormalTok{, }\StringTok{"b"}\NormalTok{ =}\StringTok{ }\DecValTok{2}\NormalTok{)}
\NormalTok{l}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1] 1
## 
## $b
## [1] 2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1] 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l[[}\DecValTok{1}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{l[}\StringTok{"a"}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1] 1
\end{verbatim}

\hypertarget{matrices-1}{%
\subsubsection{Matrices}\label{matrices-1}}

The most common way of subsetting matrices (2d) is a simple generalization of 1d subsetting: you supply a 1d index for each dimension, separated by a comma. Blank subsetting is now useful because it lets you keep all rows or all columns.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a \textless{}{-}}\StringTok{ }\KeywordTok{matrix}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{9}\NormalTok{, }\DataTypeTok{nrow =} \DecValTok{3}\NormalTok{)}
\KeywordTok{colnames}\NormalTok{(a) \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"A"}\NormalTok{, }\StringTok{"B"}\NormalTok{, }\StringTok{"C"}\NormalTok{)}
\NormalTok{a}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A B C
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# rows come first, then columns}
\NormalTok{a[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), ]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A B C
## [1,] 1 4 7
## [2,] 2 5 8
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a[}\KeywordTok{c}\NormalTok{(T, F, T), }\KeywordTok{c}\NormalTok{(}\StringTok{"B"}\NormalTok{, }\StringTok{"A"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      B A
## [1,] 4 1
## [2,] 6 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a[}\DecValTok{0}\NormalTok{, }\DecValTok{{-}2}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A C
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{), }\DecValTok{{-}2}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      A C
## [1,] 1 7
## [2,] 2 8
\end{verbatim}

\hypertarget{data-frames}{%
\subsubsection{Data frames}\label{data-frames}}

Data from data frames can be addressed like matrices (with row and column indicators separated by a comma).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{x =} \DecValTok{4}\OperatorTok{:}\DecValTok{6}\NormalTok{, }\DataTypeTok{y =} \DecValTok{3}\OperatorTok{:}\DecValTok{1}\NormalTok{, }\DataTypeTok{z =}\NormalTok{ letters[}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{])}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 1 4 3 a
## 2 5 2 b
## 3 6 1 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# return only the rows where x == 6}
\NormalTok{df[df}\OperatorTok{$}\NormalTok{x }\OperatorTok{==}\StringTok{ }\DecValTok{6}\NormalTok{, ]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 3 6 1 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# return the first and third row}
\NormalTok{df[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{3}\NormalTok{), ]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y z
## 1 4 3 a
## 3 6 1 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# return the first and third row and the first and second column}
\NormalTok{df[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{3}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x y
## 1 4 3
## 3 6 1
\end{verbatim}

Data frames possess the characteristics of both lists and matrices: if you subset with a single vector, they behave like lists and return only the columns.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# There are two ways to select columns from a data frame}
\CommentTok{\# Like a list:}
\NormalTok{df[}\KeywordTok{c}\NormalTok{(}\StringTok{"x"}\NormalTok{, }\StringTok{"z"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x z
## 1 4 a
## 2 5 b
## 3 6 c
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Like a matrix}
\NormalTok{df[, }\KeywordTok{c}\NormalTok{(}\StringTok{"x"}\NormalTok{, }\StringTok{"z"}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x z
## 1 4 a
## 2 5 b
## 3 6 c
\end{verbatim}

But there's a significant difference when selecting a single column: matrix subsetting simplifies by default, list subsetting does not.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(df[}\StringTok{"x"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   x
## 1 4
## 2 5
## 3 6
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{((df[}\StringTok{"x"}\NormalTok{]))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "data.frame"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{(df[, }\StringTok{"x"}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4 5 6
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{((df[, }\StringTok{"x"}\NormalTok{]))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "integer"
\end{verbatim}

See the bottom section on \protect\hyperlink{simplify-preserve}{Simplifying and Preserving} to learn more.

\hypertarget{subsetting-operators}{%
\subsubsection{Subsetting operators}\label{subsetting-operators}}

There are two other subsetting operators: \texttt{{[}{[}} and \texttt{\$}.

\begin{itemize}
\tightlist
\item
  \texttt{{[}{[}} is similar to \texttt{{[}}, except it can only return a single value, and it allows you to pull pieces out of a list.
\item
  \texttt{\$} is a useful shorthand for \texttt{{[}{[}} combined with character subsetting.
\end{itemize}

\hypertarget{section}{%
\paragraph{\texorpdfstring{\texttt{{[}{[}}}{{[}{[}}}\label{section}}

You need \texttt{{[}{[}} when working with lists. When \texttt{{[}} is applied to a list it always returns a list: it never gives you the list's contents. To get the contents, you need \texttt{{[}{[}}:

\begin{quote}
``If list \texttt{x} is a train carrying objects, then \texttt{x{[}{[}5{]}{]}} is
the object in car 5; \texttt{x{[}4:6{]}} is a train of cars 4-6.''

--- \citet{RLangTip}
\end{quote}

Because data frames are lists of columns, you can use \texttt{{[}{[}} to extract a column from data frames:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# these two are equivalent}
\NormalTok{mtcars[[}\DecValTok{1}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars[, }\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# which differs from this:}
\NormalTok{mtcars[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg
## Mazda RX4           21.0
## Mazda RX4 Wag       21.0
## Datsun 710          22.8
## Hornet 4 Drive      21.4
## Hornet Sportabout   18.7
## Valiant             18.1
## Duster 360          14.3
## Merc 240D           24.4
## Merc 230            22.8
## Merc 280            19.2
## Merc 280C           17.8
## Merc 450SE          16.4
## Merc 450SL          17.3
## Merc 450SLC         15.2
## Cadillac Fleetwood  10.4
## Lincoln Continental 10.4
## Chrysler Imperial   14.7
## Fiat 128            32.4
## Honda Civic         30.4
## Toyota Corolla      33.9
## Toyota Corona       21.5
## Dodge Challenger    15.5
## AMC Javelin         15.2
## Camaro Z28          13.3
## Pontiac Firebird    19.2
## Fiat X1-9           27.3
## Porsche 914-2       26.0
## Lotus Europa        30.4
## Ford Pantera L      15.8
## Ferrari Dino        19.7
## Maserati Bora       15.0
## Volvo 142E          21.4
\end{verbatim}

\hypertarget{section-1}{%
\paragraph{\texorpdfstring{\texttt{\$}}{\$}}\label{section-1}}

\texttt{\$} is a shorthand operator, where \texttt{x\$y} is equivalent to \texttt{x{[}{[}"y",\ exact\ =\ FALSE{]}{]}}. It's often used to access variables in a data frame:

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# these two are equivalent}
\NormalTok{mtcars[[}\StringTok{"cyl"}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars}\OperatorTok{$}\NormalTok{cyl}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
\end{verbatim}

One common mistake with \texttt{\$} is to try and use it when you have the name of a column stored in a variable:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{var \textless{}{-}}\StringTok{ "cyl"}
\CommentTok{\# Doesn\textquotesingle{}t work {-} mtcars$var translated to mtcars[["var"]]}
\NormalTok{mtcars}\OperatorTok{$}\NormalTok{var}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## NULL
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Instead use [[}
\NormalTok{mtcars[[var]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
\end{verbatim}

\hypertarget{subassignment}{%
\subsubsection{Subassignment}\label{subassignment}}

All subsetting operators can be combined with an assignment operator to modify selected values of the input vector.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{1}\OperatorTok{:}\DecValTok{5}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4 5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{)] \textless{}{-}}\StringTok{ }\DecValTok{2}\OperatorTok{:}\DecValTok{3}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2 3 3 4 5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The length of the LHS needs to match the RHS!}
\NormalTok{x[}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] \textless{}{-}}\StringTok{ }\DecValTok{4}\OperatorTok{:}\DecValTok{1}
\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 2 4 3 2 1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x[}\DecValTok{1}\NormalTok{] \textless{}{-}}\StringTok{ }\DecValTok{4}\OperatorTok{:}\DecValTok{1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in x[1] <- 4:1: number of items to replace is not a multiple of
## replacement length
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This is mostly useful when conditionally modifying vectors}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{a =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{, }\OtherTok{NA}\NormalTok{))}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    a
## 1  1
## 2 10
## 3 NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{$}\NormalTok{a[df}\OperatorTok{$}\NormalTok{a }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{5}\NormalTok{] \textless{}{-}}\StringTok{ }\DecValTok{0}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    a
## 1  0
## 2 10
## 3 NA
\end{verbatim}

\hypertarget{tidyverse}{%
\section{Tidyverse}\label{tidyverse}}

\hypertarget{the-big-picture-4}{%
\subsection{The Big Picture}\label{the-big-picture-4}}

\begin{quote}
``Tidy data sets are easy to manipulate, model and visualize, and have a specific structure: each variable is a column, each observation is a row, and each type of observational unit is a table.'' - Hadley Wickham
\end{quote}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Variables -\textgreater{} \textbf{Columns}
\item
  Observations -\textgreater{} \textbf{Rows}
\item
  Values -\textgreater{} \textbf{Cells}
\end{enumerate}

\begin{figure}
\centering
\includegraphics{https://garrettgman.github.io/images/tidy-1.png}
\caption{Tidy Data Example (Source: R for Data Science)}
\end{figure}

If dataframes are tidy, it's easy to transform, visualize, model, and program them using tidyverse packages (a whole workflow).

\begin{figure}
\centering
\includegraphics{https://miro.medium.com/max/960/0*mlPyX0NE0WQwEzpS.png}
\caption{Tidyverse: an opinionated collection of R packages}
\end{figure}

\begin{itemize}
\tightlist
\item
  Nevertheless, don't be \textbf{religious}.
\end{itemize}

\begin{quote}
In summary, tidy data is a useful conceptual idea and is often the right way to go for general, small data sets, but may not be appropriate for all problems. - Jeff Leek
\end{quote}

For instance, in many data science applications, linear algebra-based computations are essential (e.g., \href{https://www.math.upenn.edu/~kazdan/312S13/JJ/PCA-JJ.pdf}{Principal Component Analysis}). These computations are optimized to work on matrices, not tidy data frames (for more information, read \href{https://simplystatistics.org/2016/02/17/non-tidy-data/}{Jeff Leek's blog post}).

This is what tidy data looks like.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{library}\NormalTok{(tidyverse)}

\NormalTok{table1}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\textbf{Additional tips}

There are so many different ways of looking at data in R. Can you discuss the pros and cons of each approach? Which one do you prefer and why?

\begin{itemize}
\item
  \texttt{str(table1)}
\item
  \texttt{glimpse(table1)}: similar to \texttt{str()}, but with cleaner output
\item
  \texttt{skim(table1)}: \texttt{str()} + \texttt{summary()} + more
\item
  The big picture

  \begin{itemize}
  \tightlist
  \item
    Tidying data with \textbf{tidyr}
  \item
    Processing data with \textbf{dplyr}
  \end{itemize}
\end{itemize}

These two packages don't do anything new but simplify most common tasks in data manipulation. Plus, they are fast, consistent, and more readable.

Practically, this approach is right because you will have consistency in data format across all the projects you're working on. Also, tidy data works well with key packages (e.g., \texttt{dplyr}, \texttt{ggplot2}) in R.

Computationally, this approach is useful for vectorized programming because ``different variables from the same observation are always paired''. Vectorized means that a function applied to a vector operates on each element individually (i.e., the operations work in parallel).

\hypertarget{tidying-tidyr}{%
\section{Tidying (tidyr)}\label{tidying-tidyr}}

\hypertarget{reshaping}{%
\subsection{Reshaping}\label{reshaping}}

\textbf{Signs of messy datasets}

\begin{itemize}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \tightlist
  \item
    Column headers are values, not variable names.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{1}
  \tightlist
  \item
    Multiple variables are stored in one column.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{2}
  \tightlist
  \item
    Variables are stored in both rows and columns.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{3}
  \tightlist
  \item
    Multiple types of observational units are stored in the same table.
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{4}
  \tightlist
  \item
    A single observational unit is stored in multiple tables.
  \end{enumerate}
\end{itemize}

Let's take a look at the cases of untidy data.

\begin{figure}
\centering
\includegraphics{https://garrettgman.github.io/images/tidy-5.png}
\caption{Messy Data Case 1 (Source: R for Data Science)}
\end{figure}

\begin{itemize}
\item
  Make It Longer

  \begin{longtable}[]{@{}lll@{}}
  \toprule
  Col1 & Col2 & Col3\tabularnewline
  \midrule
  \endhead
  & &\tabularnewline
  & &\tabularnewline
  & &\tabularnewline
  \bottomrule
  \end{longtable}
\end{itemize}

\textbf{Challenge}: Why is this data not tidy?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table4a}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Let's pivot (rotate by 90 degrees).
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://github.com/rstudio/concept-maps/raw/master/en/pivoting.svg}
\caption{Concept map for pivoting. By Florian Schmoll, Monica Alonso.}
\end{figure}

\begin{itemize}
\tightlist
\item
  \href{https://tidyr.tidyverse.org/reference/pivot_longer.html}{\texttt{pivot\_longer()}} increases the number of rows (longer) and decreases the number of columns. The inverse function is \texttt{pivot\_wider()}. These functions improve the usability of \texttt{gather()} and \texttt{spread()}.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://www.storybench.org/wp-content/uploads/2019/08/pivot-longer-image.png}
\caption{What pivot\_longer() does (Source: \url{https://www.storybench.org})}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://education.rstudio.com/blog/2020/09/concept-maps/pipe-operator.png}
\caption{Concept map for pipe operator. By Jeroen Janssens, Monica Alonso.}
\end{figure}

\begin{itemize}
\tightlist
\item
  The pipe operator \texttt{\%\textgreater{}\%} originally comes from the \texttt{magrittr} package. The idea behind the pipe operator is \href{https://www.datacamp.com/community/tutorials/pipe-r-tutorial}{similar to} what we learned about chaining functions in high school. Given \(f\colon B \to C\) and \(g\colon A \to B\), their composition can be written as \(f(g(x))\). The pipe operator chains operations. When reading the pipe operator, read it as ``and then'' (Wickham's recommendation). The keyboard shortcut is ctrl + shift + M. The key idea here is to avoid creating temporary variables and to focus on verbs (functions). We'll learn more about this functional programming paradigm later on.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table4a}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way, less intuitive}
\NormalTok{table4a }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(}
    \DataTypeTok{key =} \StringTok{"year"}\NormalTok{, }\CommentTok{\# Current column names}
    \DataTypeTok{value =} \StringTok{"cases"}\NormalTok{, }\CommentTok{\# The values matched to cases}
    \KeywordTok{c}\NormalTok{(}\StringTok{"1999"}\NormalTok{, }\StringTok{"2000"}\NormalTok{)}
\NormalTok{  ) }\CommentTok{\# Selected columns}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way, more intuitive}
\NormalTok{table4a }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(}
    \DataTypeTok{cols =} \KeywordTok{c}\NormalTok{(}\StringTok{"1999"}\NormalTok{, }\StringTok{"2000"}\NormalTok{), }\CommentTok{\# Selected columns}
    \DataTypeTok{names\_to =} \StringTok{"year"}\NormalTok{, }\CommentTok{\# Shorter columns (the columns going to be in one column called year)}
    \DataTypeTok{values\_to =} \StringTok{"cases"}
\NormalTok{  ) }\CommentTok{\# Longer rows (the values are going to be in a separate column named cases)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Afghanistan 2000    2666
## 3 Brazil      1999   37737
## 4 Brazil      2000   80488
## 5 China       1999  212258
## 6 China       2000  213766
\end{verbatim}

\begin{itemize}
\item
  There's another problem. Did you catch it?
\item
  The data type of the \texttt{year} variable should be \texttt{numeric}, not \texttt{character}. By default, \texttt{pivot\_longer()} stores the former column names (here, the years) as character strings.
\item
  You can fix this problem by using the \texttt{names\_transform} argument.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table4a }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(}
    \DataTypeTok{cols =} \KeywordTok{c}\NormalTok{(}\StringTok{"1999"}\NormalTok{, }\StringTok{"2000"}\NormalTok{), }\CommentTok{\# Put two columns together}
    \DataTypeTok{names\_to =} \StringTok{"year"}\NormalTok{, }\CommentTok{\# Shorter columns (the columns going to be in one column called year)}
    \DataTypeTok{values\_to =} \StringTok{"cases"}\NormalTok{, }\CommentTok{\# Longer rows (the values are going to be in a separate column named cases)}
    \DataTypeTok{names\_transform =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{year =}\NormalTok{ readr}\OperatorTok{::}\NormalTok{parse\_number)}
\NormalTok{  ) }\CommentTok{\# Transform the variable}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country      year  cases
##   <chr>       <dbl>  <int>
## 1 Afghanistan  1999    745
## 2 Afghanistan  2000   2666
## 3 Brazil       1999  37737
## 4 Brazil       2000  80488
## 5 China        1999 212258
## 6 China        2000 213766
\end{verbatim}

\textbf{Additional tips}

\texttt{parse\_number()} also keeps only numeric information in a variable.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{parse\_number}\NormalTok{(}\StringTok{"reply1994"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1994
\end{verbatim}

A flat file (e.g., CSV) is a rectangular-shaped combination of strings. \href{https://cran.r-project.org/web/packages/readr/vignettes/readr.html}{Parsing} determines the type of each column and turns it into a vector of a more specific type. The tidyverse has \texttt{parse\_} functions (from the \texttt{readr} package) that are flexible and fast (e.g., \texttt{parse\_integer()}, \texttt{parse\_double()}, \texttt{parse\_logical()}, \texttt{parse\_datetime()}, \texttt{parse\_date()}, \texttt{parse\_time()}, \texttt{parse\_factor()}, etc.).

\begin{itemize}
\tightlist
\item
  Let's do another practice.
\end{itemize}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Why is this data not tidy? (This exercise comes from the \href{https://tidyr.tidyverse.org/articles/pivot.html}{\texttt{pivot} function vignette}.) Too long or too wide?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{billboard}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 317 x 79
##    artist     track date.entered   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
##    <chr>      <chr> <date>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2 Pac      Baby~ 2000-02-26      87    82    72    77    87    94    99    NA
##  2 2Ge+her    The ~ 2000-09-02      91    87    92    NA    NA    NA    NA    NA
##  3 3 Doors D~ Kryp~ 2000-04-08      81    70    68    67    66    57    54    53
##  4 3 Doors D~ Loser 2000-10-21      76    76    72    69    67    65    55    59
##  5 504 Boyz   Wobb~ 2000-04-15      57    34    25    17    17    31    36    49
##  6 98^0       Give~ 2000-08-19      51    39    34    26    26    19     2     2
##  7 A*Teens    Danc~ 2000-07-08      97    97    96    95   100    NA    NA    NA
##  8 Aaliyah    I Do~ 2000-01-29      84    62    51    41    38    35    35    38
##  9 Aaliyah    Try ~ 2000-03-18      59    53    38    28    21    18    16    14
## 10 Adams, Yo~ Open~ 2000-08-26      76    76    74    69    68    67    61    58
## # ... with 307 more rows, and 68 more variables: wk9 <dbl>, wk10 <dbl>,
## #   wk11 <dbl>, wk12 <dbl>, wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>,
## #   wk17 <dbl>, wk18 <dbl>, wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>,
## #   wk23 <dbl>, wk24 <dbl>, wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>,
## #   wk29 <dbl>, wk30 <dbl>, wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>,
## #   wk35 <dbl>, wk36 <dbl>, wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>,
## #   wk41 <dbl>, wk42 <dbl>, wk43 <dbl>, wk44 <dbl>, wk45 <dbl>, wk46 <dbl>, ...
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  How can you fix it? Which pivot?
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}
\NormalTok{billboard }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(}
    \DataTypeTok{key =} \StringTok{"week"}\NormalTok{,}
    \DataTypeTok{value =} \StringTok{"rank"}\NormalTok{,}
    \KeywordTok{starts\_with}\NormalTok{(}\StringTok{"wk"}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Use regular expressions}
\StringTok{  }\KeywordTok{drop\_na}\NormalTok{() }\CommentTok{\# Drop NAs}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5,307 x 5
##    artist         track                   date.entered week   rank
##    <chr>          <chr>                   <date>       <chr> <dbl>
##  1 2 Pac          Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2Ge+her        The Hardest Part Of ... 2000-09-02   wk1      91
##  3 3 Doors Down   Kryptonite              2000-04-08   wk1      81
##  4 3 Doors Down   Loser                   2000-10-21   wk1      76
##  5 504 Boyz       Wobble Wobble           2000-04-15   wk1      57
##  6 98^0           Give Me Just One Nig... 2000-08-19   wk1      51
##  7 A*Teens        Dancing Queen           2000-07-08   wk1      97
##  8 Aaliyah        I Don't Wanna           2000-01-29   wk1      84
##  9 Aaliyah        Try Again               2000-03-18   wk1      59
## 10 Adams, Yolanda Open My Heart           2000-08-26   wk1      76
## # ... with 5,297 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note that \texttt{pivot\_longer()} is more versatile than \texttt{gather()}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}
\NormalTok{billboard }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(}
    \DataTypeTok{cols =} \KeywordTok{starts\_with}\NormalTok{(}\StringTok{"wk"}\NormalTok{), }\CommentTok{\# Use regular expressions}
    \DataTypeTok{names\_to =} \StringTok{"week"}\NormalTok{,}
    \DataTypeTok{values\_to =} \StringTok{"rank"}\NormalTok{,}
    \DataTypeTok{values\_drop\_na =} \OtherTok{TRUE} \CommentTok{\# Drop NAs}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5,307 x 5
##    artist  track                   date.entered week   rank
##    <chr>   <chr>                   <date>       <chr> <dbl>
##  1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk2      82
##  3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk3      72
##  4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk4      77
##  5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk5      87
##  6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk6      94
##  7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk7      99
##  8 2Ge+her The Hardest Part Of ... 2000-09-02   wk1      91
##  9 2Ge+her The Hardest Part Of ... 2000-09-02   wk2      87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02   wk3      92
## # ... with 5,297 more rows
\end{verbatim}

\begin{itemize}
\item
  Make It Wider
\item
  Why is this data not tidy?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table2}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
\end{verbatim}

\begin{itemize}
\item
  Each observation is spread across two rows.
\item
  How can you fix it?: \texttt{pivot\_wider()}.
\end{itemize}

\textbf{Two differences between \texttt{pivot\_longer()} and \texttt{pivot\_wider()}}

\begin{itemize}
\item
  In \texttt{pivot\_longer()}, the arguments are named \texttt{names\_to} and \texttt{values\_to} (\emph{to}).
\item
  In \texttt{pivot\_wider()}, this pattern is opposite. The arguments are named \texttt{names\_from} and \texttt{values\_from} (\emph{from}).
\item
  The number of required arguments for \texttt{pivot\_longer()} is 3 (\texttt{cols}, \texttt{names\_to}, \texttt{values\_to}).
\item
  The number of required arguments for \texttt{pivot\_wider()} is 2 (names\_from, values\_from).
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://www.storybench.org/wp-content/uploads/2019/08/pivot-wider-image.png}
\caption{What pivot\_wider() does (Source: \url{https://www.storybench.org})}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}
\NormalTok{table2 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{spread}\NormalTok{(}
    \DataTypeTok{key =}\NormalTok{ type,}
    \DataTypeTok{value =}\NormalTok{ count}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}
\NormalTok{table2 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_wider}\NormalTok{(}
    \DataTypeTok{names\_from =}\NormalTok{ type, }\CommentTok{\# first}
    \DataTypeTok{values\_from =}\NormalTok{ count }\CommentTok{\# second}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

Sometimes, a consultee comes to me and asks: ``I don't have missing values in my original dataframe. Then R said that I had missing values after doing some data transformations. What happened?''

Here's an answer.

R defines missing values in two ways.

\begin{itemize}
\item
  \emph{Implicit missing values}: simply not present in the data.
\item
  \emph{Explicit missing values}: flagged with NA
\end{itemize}

\textbf{Challenge}

The example comes from \href{https://r4ds.had.co.nz/tidy-data.html}{\emph{R for Data Science}}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{stocks \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{year =} \KeywordTok{c}\NormalTok{(}\DecValTok{2019}\NormalTok{, }\DecValTok{2019}\NormalTok{, }\DecValTok{2019}\NormalTok{, }\DecValTok{2020}\NormalTok{, }\DecValTok{2020}\NormalTok{, }\DecValTok{2020}\NormalTok{),}
  \DataTypeTok{qtr =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{4}\NormalTok{),}
  \DataTypeTok{return =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}\NormalTok{)}
\NormalTok{)}

\NormalTok{stocks}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##    year   qtr return
##   <dbl> <dbl>  <dbl>
## 1  2019     1      1
## 2  2019     2      2
## 3  2019     3      3
## 4  2020     2     NA
## 5  2020     3      2
## 6  2020     4      3
\end{verbatim}

\begin{itemize}
\item
  Where is the explicit missing value?
\item
  Does \texttt{stocks} have implicit missing values?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# implicit missing values become explicit}
\NormalTok{stocks }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_wider}\NormalTok{(}
    \DataTypeTok{names\_from =}\NormalTok{ year,}
    \DataTypeTok{values\_from =}\NormalTok{ return}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 4 x 3
##     qtr `2019` `2020`
##   <dbl>  <dbl>  <dbl>
## 1     1      1     NA
## 2     2      2     NA
## 3     3      3      2
## 4     4     NA      3
\end{verbatim}

\textbf{Challenge}

\begin{itemize}
\item
  This exercise comes from the \href{https://tidyr.tidyverse.org/articles/pivot.html}{\texttt{pivot} function vignette}.
\item
  Could you make \texttt{station} a series of dummy variables using \texttt{pivot\_wider()}?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fish\_encounters}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 114 x 3
##    fish  station  seen
##    <fct> <fct>   <int>
##  1 4842  Release     1
##  2 4842  I80_1       1
##  3 4842  Lisbon      1
##  4 4842  Rstr        1
##  5 4842  Base_TD     1
##  6 4842  BCE         1
##  7 4842  BCW         1
##  8 4842  BCE2        1
##  9 4842  BCW2        1
## 10 4842  MAE         1
## # ... with 104 more rows
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Which pivot should you use?
\item
  Are there explicit missing values?
\item
  How could you turn these NAs into 0s? Check \texttt{values\_fill} argument in the \texttt{pivot\_wider()} function.
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Separate
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://garrettgman.github.io/images/tidy-6.png}
\caption{Messy Data Case 2 (Source: R for Data Science)}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Toy example}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{c}\NormalTok{(}\OtherTok{NA}\NormalTok{, }\StringTok{"Dad.apple"}\NormalTok{, }\StringTok{"Mom.orange"}\NormalTok{, }\StringTok{"Daughter.banana"}\NormalTok{))}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                 x
## 1            <NA>
## 2       Dad.apple
## 3      Mom.orange
## 4 Daughter.banana
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Separate}
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(x, }\DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\StringTok{"Name"}\NormalTok{, }\StringTok{"Preferred\_fruit"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##       Name Preferred_fruit
## 1     <NA>            <NA>
## 2      Dad           apple
## 3      Mom          orange
## 4 Daughter          banana
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Don\textquotesingle{}t need the first variable}

\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(x, }\DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\OtherTok{NA}\NormalTok{, }\StringTok{"Preferred\_fruit"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   Preferred_fruit
## 1            <NA>
## 2           apple
## 3          orange
## 4          banana
\end{verbatim}

\textbf{Practice}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table3}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note the \texttt{sep} argument. You can specify how to separate joined values.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table3 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(rate,}
    \DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\StringTok{"cases"}\NormalTok{, }\StringTok{"population"}\NormalTok{),}
    \DataTypeTok{sep =} \StringTok{"/"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note the \texttt{convert} argument. You can specify whether or not to automatically convert the new values to a more appropriate type.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{table3 }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{separate}\NormalTok{(rate,}
    \DataTypeTok{into =} \KeywordTok{c}\NormalTok{(}\StringTok{"cases"}\NormalTok{, }\StringTok{"population"}\NormalTok{),}
    \DataTypeTok{sep =} \StringTok{"/"}\NormalTok{,}
    \DataTypeTok{convert =} \OtherTok{TRUE}
\NormalTok{  ) }\CommentTok{\# cases and population become integers}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Unite
\end{itemize}

\texttt{pivot\_longer()} \textless-\textgreater{} \texttt{pivot\_wider()}

\texttt{separate()} \textless-\textgreater{} \texttt{unite()}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a toy example}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{name =} \KeywordTok{c}\NormalTok{(}\StringTok{"Jae"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{, }\OtherTok{NA}\NormalTok{),}
  \DataTypeTok{birthmonth =} \KeywordTok{c}\NormalTok{(}\StringTok{"April"}\NormalTok{, }\StringTok{"April"}\NormalTok{, }\StringTok{"June"}\NormalTok{, }\OtherTok{NA}\NormalTok{)}
\NormalTok{)}

\CommentTok{\# Include missing values}
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{unite}\NormalTok{(}
  \StringTok{"contact"}\NormalTok{,}
  \KeywordTok{c}\NormalTok{(}\StringTok{"name"}\NormalTok{, }\StringTok{"birthmonth"}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##     contact
## 1 Jae_April
## 2 Sun_April
## 3 Jane_June
## 4     NA_NA
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Do not include missing values}
\NormalTok{df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{unite}\NormalTok{(}\StringTok{"contact"}\NormalTok{,}
  \KeywordTok{c}\NormalTok{(}\StringTok{"name"}\NormalTok{, }\StringTok{"birthmonth"}\NormalTok{),}
  \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##     contact
## 1 Jae_April
## 2 Sun_April
## 3 Jane_June
## 4
\end{verbatim}

\hypertarget{filling}{%
\subsection{Filling}\label{filling}}

This is a lesser-known function of the tidyr package. However, I have found it very useful for completing time-series data. For instance, how can you replace the NAs in the following example? (This use case is drawn from the \href{https://tidyr.tidyverse.org/reference/fill.html}{tidyr package vignette}.)

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Example}
\NormalTok{stock \textless{}{-}}\StringTok{ }\NormalTok{tibble}\OperatorTok{::}\KeywordTok{tribble}\NormalTok{(}
  \OperatorTok{\textasciitilde{}}\NormalTok{quarter, }\OperatorTok{\textasciitilde{}}\NormalTok{year, }\OperatorTok{\textasciitilde{}}\NormalTok{stock\_price,}
  \StringTok{"Q1"}\NormalTok{, }\DecValTok{2000}\NormalTok{, }\DecValTok{10000}\NormalTok{,}
  \StringTok{"Q2"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{10001}\NormalTok{, }\CommentTok{\# Replace NA with 2000}
  \StringTok{"Q3"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{10002}\NormalTok{, }\CommentTok{\# Replace NA with 2000}
  \StringTok{"Q4"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{10003}\NormalTok{, }\CommentTok{\# Replace NA with 2000}
  \StringTok{"Q1"}\NormalTok{, }\DecValTok{2001}\NormalTok{, }\DecValTok{10004}\NormalTok{,}
  \StringTok{"Q2"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{10005}\NormalTok{, }\CommentTok{\# Replace NA with 2001}
  \StringTok{"Q3"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{10006}\NormalTok{, }\CommentTok{\# Replace NA with 2001}
  \StringTok{"Q4"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{10007}\NormalTok{, }\CommentTok{\# Replace NA with 2001}
\NormalTok{)}

\KeywordTok{fill}\NormalTok{(stock, year)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 8 x 3
##   quarter  year stock_price
##   <chr>   <dbl>       <dbl>
## 1 Q1       2000       10000
## 2 Q2       2000       10001
## 3 Q3       2000       10002
## 4 Q4       2000       10003
## 5 Q1       2001       10004
## 6 Q2       2001       10005
## 7 Q3       2001       10006
## 8 Q4       2001       10007
\end{verbatim}

Let's take a slightly more complex example.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Example}
\NormalTok{yelp\_rate \textless{}{-}}\StringTok{ }\NormalTok{tibble}\OperatorTok{::}\KeywordTok{tribble}\NormalTok{(}
  \OperatorTok{\textasciitilde{}}\NormalTok{neighborhood, }\OperatorTok{\textasciitilde{}}\NormalTok{restraurant\_type, }\OperatorTok{\textasciitilde{}}\NormalTok{popularity\_rate,}
  \StringTok{"N1"}\NormalTok{, }\StringTok{"Chinese"}\NormalTok{, }\DecValTok{5}\NormalTok{,}
  \StringTok{"N2"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{4}\NormalTok{,}
  \StringTok{"N3"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{3}\NormalTok{,}
  \StringTok{"N4"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{2}\NormalTok{,}
  \StringTok{"N1"}\NormalTok{, }\StringTok{"Indian"}\NormalTok{, }\DecValTok{1}\NormalTok{,}
  \StringTok{"N2"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{2}\NormalTok{,}
  \StringTok{"N3"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{3}\NormalTok{,}
  \StringTok{"N4"}\NormalTok{, }\OtherTok{NA}\NormalTok{, }\DecValTok{4}\NormalTok{,}
  \StringTok{"N1"}\NormalTok{, }\StringTok{"Mexican"}\NormalTok{, }\DecValTok{5}
\NormalTok{)}

\KeywordTok{fill}\NormalTok{(yelp\_rate, restraurant\_type) }\CommentTok{\# default is .direction = "down"}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 9 x 3
##   neighborhood restraurant_type popularity_rate
##   <chr>        <chr>                      <dbl>
## 1 N1           Chinese                        5
## 2 N2           Chinese                        4
## 3 N3           Chinese                        3
## 4 N4           Chinese                        2
## 5 N1           Indian                         1
## 6 N2           Indian                         2
## 7 N3           Indian                         3
## 8 N4           Indian                         4
## 9 N1           Mexican                        5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{fill}\NormalTok{(yelp\_rate, restraurant\_type, }\DataTypeTok{.direction =} \StringTok{"up"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 9 x 3
##   neighborhood restraurant_type popularity_rate
##   <chr>        <chr>                      <dbl>
## 1 N1           Chinese                        5
## 2 N2           Indian                         4
## 3 N3           Indian                         3
## 4 N4           Indian                         2
## 5 N1           Indian                         1
## 6 N2           Mexican                        2
## 7 N3           Mexican                        3
## 8 N4           Mexican                        4
## 9 N1           Mexican                        5
\end{verbatim}

\hypertarget{manipulating-dplyr}{%
\section{Manipulating (dplyr)}\label{manipulating-dplyr}}

\begin{figure}
\centering
\includegraphics{https://education.rstudio.com/blog/2020/09/concept-maps/dplyr.png}
\caption{Concept map for dplyr. By Monica Alonso, Greg Wilson.}
\end{figure}

\texttt{dplyr} is better than the base R approaches to data processing:

\begin{itemize}
\tightlist
\item
  fast to run (due to the C++ backend) and intuitive to type
\item
  works well with tidy data and databases (thanks to \href{https://dbplyr.tidyverse.org/}{\texttt{dbplyr}})
\end{itemize}

\hypertarget{rearranging}{%
\subsection{Rearranging}\label{rearranging}}

\begin{itemize}
\item
  Arrange
\item
  Order rows
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{arrange}\NormalTok{(mtcars, mpg) }\CommentTok{\# Low to High (default)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{arrange}\NormalTok{(mtcars, }\KeywordTok{desc}\NormalTok{(mpg)) }\CommentTok{\# High to Low}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
\end{verbatim}

\begin{itemize}
\item
  Rename
\item
  Rename columns
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\DecValTok{2011}\NormalTok{, }\DecValTok{2012}\NormalTok{, }\DecValTok{2013}\NormalTok{))}

\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{rename}\NormalTok{(}
    \DataTypeTok{Year =} \CommentTok{\# NEW name}
\NormalTok{      y}
\NormalTok{  ) }\CommentTok{\# OLD name}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 1
##    Year
##   <dbl>
## 1  2011
## 2  2012
## 3  2013
\end{verbatim}

\hypertarget{subset-observations-rows}{%
\subsection{Subset observations (rows)}\label{subset-observations-rows}}

\begin{itemize}
\item
  Choose row by logical condition
\item
  Single condition
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(gender }\OperatorTok{==}\StringTok{ "feminine"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(height))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 17 x 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Taun We     213  NA   none       grey       black             NA fema~ femin~
##  2 Adi Gal~    184  50   none       dark       blue              NA fema~ femin~
##  3 Ayla Se~    178  55   none       blue       hazel             48 fema~ femin~
##  4 Shaak Ti    178  57   none       red, blue~ black             NA fema~ femin~
##  5 Luminar~    170  56.2 black      yellow     blue              58 fema~ femin~
##  6 Zam Wes~    168  55   blonde     fair, gre~ yellow            NA fema~ femin~
##  7 Jocasta~    167  NA   white      fair       blue              NA fema~ femin~
##  8 Barriss~    166  50   black      yellow     blue              40 fema~ femin~
##  9 Beru Wh~    165  75   brown      light      blue              47 fema~ femin~
## 10 Dormé       165  NA   brown      light      brown             NA fema~ femin~
## 11 Padmé A~    165  45   brown      light      brown             46 fema~ femin~
## 12 Shmi Sk~    163  NA   black      fair       brown             72 fema~ femin~
## 13 Cordé       157  NA   brown      light      brown             NA fema~ femin~
## 14 Leia Or~    150  49   brown      light      brown             19 fema~ femin~
## 15 Mon Mot~    150  NA   auburn     fair       blue              48 fema~ femin~
## 16 R4-P17       96  NA   none       silver, r~ red, blue         NA none  femin~
## 17 Rey          NA  NA   brown      light      hazel             NA fema~ femin~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

The following filtering example was inspired by \href{https://suzan.rbind.io/2018/02/dplyr-tutorial-3/}{Suzan Baert's dplyr blog post}.

\begin{itemize}
\tightlist
\item
  Multiple conditions (numeric)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# First example}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(height }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{180}\NormalTok{, height }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{160}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nrow}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 24
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Same as above}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(height }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{180} \OperatorTok{\&}\StringTok{ }\NormalTok{height }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{160}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nrow}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 24
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Not same as above}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(height }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{180} \OperatorTok{|}\StringTok{ }\NormalTok{height }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{160}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nrow}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 81
\end{verbatim}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{(\arabic{enumi})}
\tightlist
\item
  Use \texttt{filter(between())} to find characters whose heights are between 160 and 180 and (2) count the number of these observations.
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Minimal reproducible example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{heights =} \KeywordTok{c}\NormalTok{(}\DecValTok{160}\OperatorTok{:}\DecValTok{180}\NormalTok{),}
  \DataTypeTok{char =} \KeywordTok{rep}\NormalTok{(}\StringTok{"none"}\NormalTok{, }\KeywordTok{length}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\DecValTok{160}\OperatorTok{:}\DecValTok{180}\NormalTok{)))}
\NormalTok{)}

\NormalTok{df }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(}\KeywordTok{between}\NormalTok{(heights, }\DecValTok{161}\NormalTok{, }\DecValTok{179}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 19 x 2
##    heights char 
##      <int> <chr>
##  1     161 none 
##  2     162 none 
##  3     163 none 
##  4     164 none 
##  5     165 none 
##  6     166 none 
##  7     167 none 
##  8     168 none 
##  9     169 none 
## 10     170 none 
## 11     171 none 
## 12     172 none 
## 13     173 none 
## 14     174 none 
## 15     175 none 
## 16     176 none 
## 17     177 none 
## 18     178 none 
## 19     179 none
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Multiple conditions (character)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Filter names include ars; \textasciigrave{}grepl\textasciigrave{} is a base R function}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(}\KeywordTok{grepl}\NormalTok{(}\StringTok{"ars"}\NormalTok{, }\KeywordTok{tolower}\NormalTok{(name)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 4 x 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Owen Lars    178   120 brown, gr~ light      blue              52 male  mascu~
## 2 Beru Whi~    165    75 brown      light      blue              47 fema~ femin~
## 3 Quarsh P~    183    NA black      dark       brown             62 <NA>  <NA>  
## 4 Cliegg L~    183    NA brown      fair       blue              82 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Or, if you prefer the stringr (tidyverse) way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(}\KeywordTok{str\_detect}\NormalTok{(}\KeywordTok{tolower}\NormalTok{(name), }\StringTok{"ars"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 4 x 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Owen Lars    178   120 brown, gr~ light      blue              52 male  mascu~
## 2 Beru Whi~    165    75 brown      light      blue              47 fema~ femin~
## 3 Quarsh P~    183    NA black      dark       brown             62 <NA>  <NA>  
## 4 Cliegg L~    183    NA brown      fair       blue              82 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Filter brown and black hair\_color}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(hair\_color }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"black"}\NormalTok{, }\StringTok{"brown"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 31 x 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Leia Or~    150  49   brown      light      brown           19   fema~ femin~
##  2 Beru Wh~    165  75   brown      light      blue            47   fema~ femin~
##  3 Biggs D~    183  84   black      light      brown           24   male  mascu~
##  4 Chewbac~    228 112   brown      unknown    blue           200   male  mascu~
##  5 Han Solo    180  80   brown      fair       brown           29   male  mascu~
##  6 Wedge A~    170  77   brown      fair       hazel           21   male  mascu~
##  7 Jek Ton~    180 110   brown      fair       blue            NA   male  mascu~
##  8 Boba Fe~    183  78.2 black      fair       brown           31.5 male  mascu~
##  9 Lando C~    177  79   black      dark       brown           31   male  mascu~
## 10 Arvel C~     NA  NA   brown      fair       brown           NA   male  mascu~
## # ... with 21 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>
\end{verbatim}

\textbf{Challenge}

Use \texttt{str\_detect()} to find characters whose names include ``Han''.

\begin{itemize}
\tightlist
\item
  Choose row by position (row index)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(height)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{6}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Yarael P~    264    NA none       white      yellow            NA male  mascu~
## 2 Tarfful      234   136 brown      brown      blue              NA male  mascu~
## 3 Lama Su      229    88 none       grey       black             NA male  mascu~
## 4 Chewbacca    228   112 brown      unknown    blue             200 male  mascu~
## 5 Roos Tar~    224    82 none       grey       orange            NA male  mascu~
## 6 Grievous     216   159 none       brown, wh~ green, y~         NA male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Sample by a fraction
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# For reproducibility}
\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)}

\CommentTok{\# Old way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{sample\_frac}\NormalTok{(}\FloatTok{0.10}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  ) }\CommentTok{\# Without replacement}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 9 x 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Arvel Cr~     NA    NA brown      fair       brown           NA   male  mascu~
## 2 Sly Moore    178    48 none       pale       white           NA   <NA>  <NA>  
## 3 IG-88        200   140 none       metal      red             15   none  mascu~
## 4 Biggs Da~    183    84 black      light      brown           24   male  mascu~
## 5 Leia Org~    150    49 brown      light      brown           19   fema~ femin~
## 6 Watto        137    NA black      blue, grey yellow          NA   male  mascu~
## 7 Jabba De~    175  1358 <NA>       green-tan~ orange         600   herm~ mascu~
## 8 Darth Va~    202   136 none       white      yellow          41.9 male  mascu~
## 9 Taun We      213    NA none       grey       black           NA   fema~ femin~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice\_sample}\NormalTok{(}
    \DataTypeTok{prop =} \FloatTok{0.10}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 8 x 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Raymus A~    188  79   brown      light      brown           NA   male  mascu~
## 2 Tarfful      234 136   brown      brown      blue            NA   male  mascu~
## 3 Han Solo     180  80   brown      fair       brown           29   male  mascu~
## 4 Mas Amed~    196  NA   none       blue       blue            NA   male  mascu~
## 5 Barriss ~    166  50   black      yellow     blue            40   fema~ femin~
## 6 Darth Va~    202 136   none       white      yellow          41.9 male  mascu~
## 7 Finn          NA  NA   black      dark       dark            NA   male  mascu~
## 8 Boba Fett    183  78.2 black      fair       brown           31.5 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Sample by number
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{sample\_n}\NormalTok{(}\DecValTok{20}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  ) }\CommentTok{\# Without replacement}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 20 x 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Quarsh ~    183    NA black      dark       brown             62 <NA>  <NA>  
##  2 Poe Dam~     NA    NA brown      light      brown             NA male  mascu~
##  3 Mas Ame~    196    NA none       blue       blue              NA male  mascu~
##  4 Zam Wes~    168    55 blonde     fair, gre~ yellow            NA fema~ femin~
##  5 Leia Or~    150    49 brown      light      brown             19 fema~ femin~
##  6 Jango F~    183    79 black      tan        brown             66 male  mascu~
##  7 Ben Qua~    163    65 none       grey, gre~ orange            NA male  mascu~
##  8 Padmé A~    165    45 brown      light      brown             46 fema~ femin~
##  9 Mace Wi~    188    84 none       dark       brown             72 male  mascu~
## 10 R2-D2        96    32 <NA>       white, bl~ red               33 none  mascu~
## 11 Shmi Sk~    163    NA black      fair       brown             72 fema~ femin~
## 12 Ratts T~     79    15 none       grey, blue unknown           NA male  mascu~
## 13 Nute Gu~    191    90 none       mottled g~ red               NA male  mascu~
## 14 Darth M~    175    80 none       red        yellow            54 male  mascu~
## 15 Bib For~    180    NA none       pale       pink              NA male  mascu~
## 16 C-3PO       167    75 <NA>       gold       yellow           112 none  mascu~
## 17 Yarael ~    264    NA none       white      yellow            NA male  mascu~
## 18 Ki-Adi-~    198    82 white      pale       yellow            92 male  mascu~
## 19 BB8          NA    NA none       none       black             NA none  mascu~
## 20 Eeth Ko~    171    NA black      brown      brown             NA male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}

\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice\_sample}\NormalTok{(}
    \DataTypeTok{n =} \DecValTok{20}\NormalTok{,}
    \DataTypeTok{replace =} \OtherTok{FALSE}
\NormalTok{  ) }\CommentTok{\# Without replacement}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 20 x 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Owen La~    178   120 brown, gr~ light      blue              52 male  mascu~
##  2 Ki-Adi-~    198    82 white      pale       yellow            92 male  mascu~
##  3 Captain~     NA    NA unknown    unknown    unknown           NA <NA>  <NA>  
##  4 Gregar ~    185    85 black      dark       brown             NA male  mascu~
##  5 R5-D4        97    32 <NA>       white, red red               NA none  mascu~
##  6 Ackbar      180    83 none       brown mot~ orange            41 male  mascu~
##  7 Wedge A~    170    77 brown      fair       hazel             21 male  mascu~
##  8 Dormé       165    NA brown      light      brown             NA fema~ femin~
##  9 Rey          NA    NA brown      light      hazel             NA fema~ femin~
## 10 IG-88       200   140 none       metal      red               15 none  mascu~
## 11 Roos Ta~    224    82 none       grey       orange            NA male  mascu~
## 12 Shmi Sk~    163    NA black      fair       brown             72 fema~ femin~
## 13 R2-D2        96    32 <NA>       white, bl~ red               33 none  mascu~
## 14 Poe Dam~     NA    NA brown      light      brown             NA male  mascu~
## 15 Obi-Wan~    182    77 auburn, w~ fair       blue-gray         57 male  mascu~
## 16 Plo Koon    188    80 none       orange     black             22 male  mascu~
## 17 Tarfful     234   136 brown      brown      blue              NA male  mascu~
## 18 Lobot       175    79 none       light      blue              37 male  mascu~
## 19 San Hill    191    NA none       grey       gold              NA male  mascu~
## 20 Kit Fis~    196    87 none       green      black             NA male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Top 10 rows ordered by height
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Old way}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{, height)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 10 x 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Darth V~    202   136 none       white      yellow          41.9 male  mascu~
##  2 Chewbac~    228   112 brown      unknown    blue           200   male  mascu~
##  3 Roos Ta~    224    82 none       grey       orange          NA   male  mascu~
##  4 Rugor N~    206    NA none       green      orange          NA   male  mascu~
##  5 Yarael ~    264    NA none       white      yellow          NA   male  mascu~
##  6 Lama Su     229    88 none       grey       black           NA   male  mascu~
##  7 Taun We     213    NA none       grey       black           NA   fema~ femin~
##  8 Grievous    216   159 none       brown, wh~ green, y~       NA   male  mascu~
##  9 Tarfful     234   136 brown      brown      blue            NA   male  mascu~
## 10 Tion Me~    206    80 none       grey       black           NA   male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# New way}
\NormalTok{starwars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{slice\_max}\NormalTok{(height, }\DataTypeTok{n =} \DecValTok{10}\NormalTok{) }\CommentTok{\# Variable first, Argument second}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 10 x 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Yarael ~    264    NA none       white      yellow          NA   male  mascu~
##  2 Tarfful     234   136 brown      brown      blue            NA   male  mascu~
##  3 Lama Su     229    88 none       grey       black           NA   male  mascu~
##  4 Chewbac~    228   112 brown      unknown    blue           200   male  mascu~
##  5 Roos Ta~    224    82 none       grey       orange          NA   male  mascu~
##  6 Grievous    216   159 none       brown, wh~ green, y~       NA   male  mascu~
##  7 Taun We     213    NA none       grey       black           NA   fema~ femin~
##  8 Rugor N~    206    NA none       green      orange          NA   male  mascu~
##  9 Tion Me~    206    80 none       grey       black           NA   male  mascu~
## 10 Darth V~    202   136 none       white      yellow          41.9 male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
\end{verbatim}

\hypertarget{subset-variables-columns}{%
\subsection{Subset variables (columns)}\label{subset-variables-columns}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{names}\NormalTok{(msleep)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "name"         "genus"        "vore"         "order"        "conservation"
##  [6] "sleep_total"  "sleep_rem"    "sleep_cycle"  "awake"        "brainwt"     
## [11] "bodywt"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select only numeric columns
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Only numeric}
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\KeywordTok{where}\NormalTok{(is.numeric))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 6
##    sleep_total sleep_rem sleep_cycle awake  brainwt  bodywt
##          <dbl>     <dbl>       <dbl> <dbl>    <dbl>   <dbl>
##  1        12.1      NA        NA      11.9 NA        50    
##  2        17         1.8      NA       7    0.0155    0.48 
##  3        14.4       2.4      NA       9.6 NA         1.35 
##  4        14.9       2.3       0.133   9.1  0.00029   0.019
##  5         4         0.7       0.667  20    0.423   600    
##  6        14.4       2.2       0.767   9.6 NA         3.85 
##  7         8.7       1.4       0.383  15.3 NA        20.5  
##  8         7        NA        NA      17   NA         0.045
##  9        10.1       2.9       0.333  13.9  0.07     14    
## 10         3        NA        NA      21    0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\textbf{Challenge}

Use \texttt{select(where())} to find only non-numeric columns

\begin{itemize}
\tightlist
\item
  Select the columns that include ``sleep'' in their names
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"sleep"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 3
##    sleep_total sleep_rem sleep_cycle
##          <dbl>     <dbl>       <dbl>
##  1        12.1      NA        NA    
##  2        17         1.8      NA    
##  3        14.4       2.4      NA    
##  4        14.9       2.3       0.133
##  5         4         0.7       0.667
##  6        14.4       2.2       0.767
##  7         8.7       1.4       0.383
##  8         7        NA        NA    
##  9        10.1       2.9       0.333
## 10         3        NA        NA    
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\item
  Select the columns that include either ``sleep'' or ``wt'' in their names
\item
  Base R way
\end{itemize}

\texttt{grepl} is one of base R's pattern-matching functions.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep[}\KeywordTok{grepl}\NormalTok{(}\StringTok{"sleep|wt"}\NormalTok{, }\KeywordTok{names}\NormalTok{(msleep))]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 5
##    sleep_total sleep_rem sleep_cycle  brainwt  bodywt
##          <dbl>     <dbl>       <dbl>    <dbl>   <dbl>
##  1        12.1      NA        NA     NA        50    
##  2        17         1.8      NA      0.0155    0.48 
##  3        14.4       2.4      NA     NA         1.35 
##  4        14.9       2.3       0.133  0.00029   0.019
##  5         4         0.7       0.667  0.423   600    
##  6        14.4       2.2       0.767 NA         3.85 
##  7         8.7       1.4       0.383 NA        20.5  
##  8         7        NA        NA     NA         0.045
##  9        10.1       2.9       0.333  0.07     14    
## 10         3        NA        NA      0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\textbf{Challenge}

Use \texttt{select(matches())} to find columns whose names include either ``sleep'' or ``wt''.

\begin{itemize}
\tightlist
\item
  Select the columns that start with ``b''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\KeywordTok{starts\_with}\NormalTok{(}\StringTok{"b"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##     brainwt  bodywt
##       <dbl>   <dbl>
##  1 NA        50    
##  2  0.0155    0.48 
##  3 NA         1.35 
##  4  0.00029   0.019
##  5  0.423   600    
##  6 NA         3.85 
##  7 NA        20.5  
##  8 NA         0.045
##  9  0.07     14    
## 10  0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select the columns that end with ``wt''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\KeywordTok{ends\_with}\NormalTok{(}\StringTok{"wt"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##     brainwt  bodywt
##       <dbl>   <dbl>
##  1 NA        50    
##  2  0.0155    0.48 
##  3 NA         1.35 
##  4  0.00029   0.019
##  5  0.423   600    
##  6 NA         3.85 
##  7 NA        20.5  
##  8 NA         0.045
##  9  0.07     14    
## 10  0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select the columns using both beginning and end string patterns
\end{itemize}

The key idea is that you can use Boolean operators (\texttt{!}, \texttt{\&}, \texttt{\textbar{}}) to combine different string pattern matching statements.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\KeywordTok{starts\_with}\NormalTok{(}\StringTok{"b"}\NormalTok{) }\OperatorTok{\&}\StringTok{ }\KeywordTok{ends\_with}\NormalTok{(}\StringTok{"wt"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##     brainwt  bodywt
##       <dbl>   <dbl>
##  1 NA        50    
##  2  0.0155    0.48 
##  3 NA         1.35 
##  4  0.00029   0.019
##  5  0.423   600    
##  6 NA         3.85 
##  7 NA        20.5  
##  8 NA         0.045
##  9  0.07     14    
## 10  0.0982   14.8  
## # ... with 73 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select the \texttt{order} column and move it before everything else
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# By specifying a column}
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(order, }\KeywordTok{everything}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 11
##    order  name  genus vore  conservation sleep_total sleep_rem sleep_cycle awake
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl> <dbl>
##  1 Carni~ Chee~ Acin~ carni lc                  12.1      NA        NA      11.9
##  2 Prima~ Owl ~ Aotus omni  <NA>                17         1.8      NA       7  
##  3 Roden~ Moun~ Aplo~ herbi nt                  14.4       2.4      NA       9.6
##  4 Soric~ Grea~ Blar~ omni  lc                  14.9       2.3       0.133   9.1
##  5 Artio~ Cow   Bos   herbi domesticated         4         0.7       0.667  20  
##  6 Pilosa Thre~ Brad~ herbi <NA>                14.4       2.2       0.767   9.6
##  7 Carni~ Nort~ Call~ carni vu                   8.7       1.4       0.383  15.3
##  8 Roden~ Vesp~ Calo~ <NA>  <NA>                 7        NA        NA      17  
##  9 Carni~ Dog   Canis carni domesticated        10.1       2.9       0.333  13.9
## 10 Artio~ Roe ~ Capr~ herbi lc                   3        NA        NA      21  
## # ... with 73 more rows, and 2 more variables: brainwt <dbl>, bodywt <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select variables from a character vector.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\KeywordTok{any\_of}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"name"}\NormalTok{, }\StringTok{"order"}\NormalTok{))) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{colnames}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "name"  "order"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Select the variables named in character + number pattern
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{msleep}\OperatorTok{$}\NormalTok{week8 \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{msleep}\OperatorTok{$}\NormalTok{week12 \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{msleep}\OperatorTok{$}\NormalTok{week\_extra \textless{}{-}}\StringTok{ }\DecValTok{0}

\NormalTok{msleep }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\KeywordTok{num\_range}\NormalTok{(}\StringTok{"week"}\NormalTok{, }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{12}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 83 x 2
##    week8 week12
##    <lgl> <lgl> 
##  1 NA    NA    
##  2 NA    NA    
##  3 NA    NA    
##  4 NA    NA    
##  5 NA    NA    
##  6 NA    NA    
##  7 NA    NA    
##  8 NA    NA    
##  9 NA    NA    
## 10 NA    NA    
## # ... with 73 more rows
\end{verbatim}

\textbf{Additional tips}

The \texttt{msleep} data have nicely cleaned column names, but real-world data are usually messier. The \texttt{janitor} package is useful for fixing this kind of problem.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{messy\_df \textless{}{-}}\StringTok{ }\NormalTok{tibble}\OperatorTok{::}\KeywordTok{tribble}\NormalTok{(}
  \OperatorTok{\textasciitilde{}}\StringTok{"ColNum1"}\NormalTok{, }\OperatorTok{\textasciitilde{}}\StringTok{"COLNUM2"}\NormalTok{, }\OperatorTok{\textasciitilde{}}\StringTok{"COL \& NUM3"}\NormalTok{,}
  \DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{3}
\NormalTok{)}

\NormalTok{messy\_df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 3
##   ColNum1 COLNUM2 `COL & NUM3`
##     <dbl>   <dbl>        <dbl>
## 1       1       2            3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(janitor)}

\NormalTok{janitor}\OperatorTok{::}\KeywordTok{clean\_names}\NormalTok{(messy\_df)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 3
##   col_num1 colnum2 col_num3
##      <dbl>   <dbl>    <dbl>
## 1        1       2        3
\end{verbatim}

\texttt{janitor::tabyl()} is helpful for cross-tabulation and is a nice alternative to the \texttt{table()} function.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Frequency table; The default output class is table}
\KeywordTok{table}\NormalTok{(gapminder}\OperatorTok{$}\NormalTok{country)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 
##              Afghanistan                  Albania                  Algeria 
##                       12                       12                       12 
##                   Angola                Argentina                Australia 
##                       12                       12                       12 
##                  Austria                  Bahrain               Bangladesh 
##                       12                       12                       12 
##                  Belgium                    Benin                  Bolivia 
##                       12                       12                       12 
##   Bosnia and Herzegovina                 Botswana                   Brazil 
##                       12                       12                       12 
##                 Bulgaria             Burkina Faso                  Burundi 
##                       12                       12                       12 
##                 Cambodia                 Cameroon                   Canada 
##                       12                       12                       12 
## Central African Republic                     Chad                    Chile 
##                       12                       12                       12 
##                    China                 Colombia                  Comoros 
##                       12                       12                       12 
##         Congo, Dem. Rep.              Congo, Rep.               Costa Rica 
##                       12                       12                       12 
##            Cote d'Ivoire                  Croatia                     Cuba 
##                       12                       12                       12 
##           Czech Republic                  Denmark                 Djibouti 
##                       12                       12                       12 
##       Dominican Republic                  Ecuador                    Egypt 
##                       12                       12                       12 
##              El Salvador        Equatorial Guinea                  Eritrea 
##                       12                       12                       12 
##                 Ethiopia                  Finland                   France 
##                       12                       12                       12 
##                    Gabon                   Gambia                  Germany 
##                       12                       12                       12 
##                    Ghana                   Greece                Guatemala 
##                       12                       12                       12 
##                   Guinea            Guinea-Bissau                    Haiti 
##                       12                       12                       12 
##                 Honduras         Hong Kong, China                  Hungary 
##                       12                       12                       12 
##                  Iceland                    India                Indonesia 
##                       12                       12                       12 
##                     Iran                     Iraq                  Ireland 
##                       12                       12                       12 
##                   Israel                    Italy                  Jamaica 
##                       12                       12                       12 
##                    Japan                   Jordan                    Kenya 
##                       12                       12                       12 
##         Korea, Dem. Rep.              Korea, Rep.                   Kuwait 
##                       12                       12                       12 
##                  Lebanon                  Lesotho                  Liberia 
##                       12                       12                       12 
##                    Libya               Madagascar                   Malawi 
##                       12                       12                       12 
##                 Malaysia                     Mali               Mauritania 
##                       12                       12                       12 
##                Mauritius                   Mexico                 Mongolia 
##                       12                       12                       12 
##               Montenegro                  Morocco               Mozambique 
##                       12                       12                       12 
##                  Myanmar                  Namibia                    Nepal 
##                       12                       12                       12 
##              Netherlands              New Zealand                Nicaragua 
##                       12                       12                       12 
##                    Niger                  Nigeria                   Norway 
##                       12                       12                       12 
##                     Oman                 Pakistan                   Panama 
##                       12                       12                       12 
##                 Paraguay                     Peru              Philippines 
##                       12                       12                       12 
##                   Poland                 Portugal              Puerto Rico 
##                       12                       12                       12 
##                  Reunion                  Romania                   Rwanda 
##                       12                       12                       12 
##    Sao Tome and Principe             Saudi Arabia                  Senegal 
##                       12                       12                       12 
##                   Serbia             Sierra Leone                Singapore 
##                       12                       12                       12 
##          Slovak Republic                 Slovenia                  Somalia 
##                       12                       12                       12 
##             South Africa                    Spain                Sri Lanka 
##                       12                       12                       12 
##                    Sudan                Swaziland                   Sweden 
##                       12                       12                       12 
##              Switzerland                    Syria                   Taiwan 
##                       12                       12                       12 
##                 Tanzania                 Thailand                     Togo 
##                       12                       12                       12 
##      Trinidad and Tobago                  Tunisia                   Turkey 
##                       12                       12                       12 
##                   Uganda           United Kingdom            United States 
##                       12                       12                       12 
##                  Uruguay                Venezuela                  Vietnam 
##                       12                       12                       12 
##       West Bank and Gaza              Yemen, Rep.                   Zambia 
##                       12                       12                       12 
##                 Zimbabwe 
##                       12
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Frequency table (unique value, n, percentage)}
\NormalTok{janitor}\OperatorTok{::}\KeywordTok{tabyl}\NormalTok{(gapminder}\OperatorTok{$}\NormalTok{country)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##         gapminder$country  n     percent
##               Afghanistan 12 0.007042254
##                   Albania 12 0.007042254
##                   Algeria 12 0.007042254
##                    Angola 12 0.007042254
##                 Argentina 12 0.007042254
##                 Australia 12 0.007042254
##                   Austria 12 0.007042254
##                   Bahrain 12 0.007042254
##                Bangladesh 12 0.007042254
##                   Belgium 12 0.007042254
##                     Benin 12 0.007042254
##                   Bolivia 12 0.007042254
##    Bosnia and Herzegovina 12 0.007042254
##                  Botswana 12 0.007042254
##                    Brazil 12 0.007042254
##                  Bulgaria 12 0.007042254
##              Burkina Faso 12 0.007042254
##                   Burundi 12 0.007042254
##                  Cambodia 12 0.007042254
##                  Cameroon 12 0.007042254
##                    Canada 12 0.007042254
##  Central African Republic 12 0.007042254
##                      Chad 12 0.007042254
##                     Chile 12 0.007042254
##                     China 12 0.007042254
##                  Colombia 12 0.007042254
##                   Comoros 12 0.007042254
##          Congo, Dem. Rep. 12 0.007042254
##               Congo, Rep. 12 0.007042254
##                Costa Rica 12 0.007042254
##             Cote d'Ivoire 12 0.007042254
##                   Croatia 12 0.007042254
##                      Cuba 12 0.007042254
##            Czech Republic 12 0.007042254
##                   Denmark 12 0.007042254
##                  Djibouti 12 0.007042254
##        Dominican Republic 12 0.007042254
##                   Ecuador 12 0.007042254
##                     Egypt 12 0.007042254
##               El Salvador 12 0.007042254
##         Equatorial Guinea 12 0.007042254
##                   Eritrea 12 0.007042254
##                  Ethiopia 12 0.007042254
##                   Finland 12 0.007042254
##                    France 12 0.007042254
##                     Gabon 12 0.007042254
##                    Gambia 12 0.007042254
##                   Germany 12 0.007042254
##                     Ghana 12 0.007042254
##                    Greece 12 0.007042254
##                 Guatemala 12 0.007042254
##                    Guinea 12 0.007042254
##             Guinea-Bissau 12 0.007042254
##                     Haiti 12 0.007042254
##                  Honduras 12 0.007042254
##          Hong Kong, China 12 0.007042254
##                   Hungary 12 0.007042254
##                   Iceland 12 0.007042254
##                     India 12 0.007042254
##                 Indonesia 12 0.007042254
##                      Iran 12 0.007042254
##                      Iraq 12 0.007042254
##                   Ireland 12 0.007042254
##                    Israel 12 0.007042254
##                     Italy 12 0.007042254
##                   Jamaica 12 0.007042254
##                     Japan 12 0.007042254
##                    Jordan 12 0.007042254
##                     Kenya 12 0.007042254
##          Korea, Dem. Rep. 12 0.007042254
##               Korea, Rep. 12 0.007042254
##                    Kuwait 12 0.007042254
##                   Lebanon 12 0.007042254
##                   Lesotho 12 0.007042254
##                   Liberia 12 0.007042254
##                     Libya 12 0.007042254
##                Madagascar 12 0.007042254
##                    Malawi 12 0.007042254
##                  Malaysia 12 0.007042254
##                      Mali 12 0.007042254
##                Mauritania 12 0.007042254
##                 Mauritius 12 0.007042254
##                    Mexico 12 0.007042254
##                  Mongolia 12 0.007042254
##                Montenegro 12 0.007042254
##                   Morocco 12 0.007042254
##                Mozambique 12 0.007042254
##                   Myanmar 12 0.007042254
##                   Namibia 12 0.007042254
##                     Nepal 12 0.007042254
##               Netherlands 12 0.007042254
##               New Zealand 12 0.007042254
##                 Nicaragua 12 0.007042254
##                     Niger 12 0.007042254
##                   Nigeria 12 0.007042254
##                    Norway 12 0.007042254
##                      Oman 12 0.007042254
##                  Pakistan 12 0.007042254
##                    Panama 12 0.007042254
##                  Paraguay 12 0.007042254
##                      Peru 12 0.007042254
##               Philippines 12 0.007042254
##                    Poland 12 0.007042254
##                  Portugal 12 0.007042254
##               Puerto Rico 12 0.007042254
##                   Reunion 12 0.007042254
##                   Romania 12 0.007042254
##                    Rwanda 12 0.007042254
##     Sao Tome and Principe 12 0.007042254
##              Saudi Arabia 12 0.007042254
##                   Senegal 12 0.007042254
##                    Serbia 12 0.007042254
##              Sierra Leone 12 0.007042254
##                 Singapore 12 0.007042254
##           Slovak Republic 12 0.007042254
##                  Slovenia 12 0.007042254
##                   Somalia 12 0.007042254
##              South Africa 12 0.007042254
##                     Spain 12 0.007042254
##                 Sri Lanka 12 0.007042254
##                     Sudan 12 0.007042254
##                 Swaziland 12 0.007042254
##                    Sweden 12 0.007042254
##               Switzerland 12 0.007042254
##                     Syria 12 0.007042254
##                    Taiwan 12 0.007042254
##                  Tanzania 12 0.007042254
##                  Thailand 12 0.007042254
##                      Togo 12 0.007042254
##       Trinidad and Tobago 12 0.007042254
##                   Tunisia 12 0.007042254
##                    Turkey 12 0.007042254
##                    Uganda 12 0.007042254
##            United Kingdom 12 0.007042254
##             United States 12 0.007042254
##                   Uruguay 12 0.007042254
##                 Venezuela 12 0.007042254
##                   Vietnam 12 0.007042254
##        West Bank and Gaza 12 0.007042254
##               Yemen, Rep. 12 0.007042254
##                    Zambia 12 0.007042254
##                  Zimbabwe 12 0.007042254
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# If you want to add percentage ...}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tabyl}\NormalTok{(country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{adorn\_pct\_formatting}\NormalTok{(}\DataTypeTok{digits =} \DecValTok{0}\NormalTok{, }\DataTypeTok{affix\_sign =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                   country  n percent
##               Afghanistan 12      1%
##                   Albania 12      1%
##                   Algeria 12      1%
##                    Angola 12      1%
##                 Argentina 12      1%
##                 Australia 12      1%
##                   Austria 12      1%
##                   Bahrain 12      1%
##                Bangladesh 12      1%
##                   Belgium 12      1%
##                     Benin 12      1%
##                   Bolivia 12      1%
##    Bosnia and Herzegovina 12      1%
##                  Botswana 12      1%
##                    Brazil 12      1%
##                  Bulgaria 12      1%
##              Burkina Faso 12      1%
##                   Burundi 12      1%
##                  Cambodia 12      1%
##                  Cameroon 12      1%
##                    Canada 12      1%
##  Central African Republic 12      1%
##                      Chad 12      1%
##                     Chile 12      1%
##                     China 12      1%
##                  Colombia 12      1%
##                   Comoros 12      1%
##          Congo, Dem. Rep. 12      1%
##               Congo, Rep. 12      1%
##                Costa Rica 12      1%
##             Cote d'Ivoire 12      1%
##                   Croatia 12      1%
##                      Cuba 12      1%
##            Czech Republic 12      1%
##                   Denmark 12      1%
##                  Djibouti 12      1%
##        Dominican Republic 12      1%
##                   Ecuador 12      1%
##                     Egypt 12      1%
##               El Salvador 12      1%
##         Equatorial Guinea 12      1%
##                   Eritrea 12      1%
##                  Ethiopia 12      1%
##                   Finland 12      1%
##                    France 12      1%
##                     Gabon 12      1%
##                    Gambia 12      1%
##                   Germany 12      1%
##                     Ghana 12      1%
##                    Greece 12      1%
##                 Guatemala 12      1%
##                    Guinea 12      1%
##             Guinea-Bissau 12      1%
##                     Haiti 12      1%
##                  Honduras 12      1%
##          Hong Kong, China 12      1%
##                   Hungary 12      1%
##                   Iceland 12      1%
##                     India 12      1%
##                 Indonesia 12      1%
##                      Iran 12      1%
##                      Iraq 12      1%
##                   Ireland 12      1%
##                    Israel 12      1%
##                     Italy 12      1%
##                   Jamaica 12      1%
##                     Japan 12      1%
##                    Jordan 12      1%
##                     Kenya 12      1%
##          Korea, Dem. Rep. 12      1%
##               Korea, Rep. 12      1%
##                    Kuwait 12      1%
##                   Lebanon 12      1%
##                   Lesotho 12      1%
##                   Liberia 12      1%
##                     Libya 12      1%
##                Madagascar 12      1%
##                    Malawi 12      1%
##                  Malaysia 12      1%
##                      Mali 12      1%
##                Mauritania 12      1%
##                 Mauritius 12      1%
##                    Mexico 12      1%
##                  Mongolia 12      1%
##                Montenegro 12      1%
##                   Morocco 12      1%
##                Mozambique 12      1%
##                   Myanmar 12      1%
##                   Namibia 12      1%
##                     Nepal 12      1%
##               Netherlands 12      1%
##               New Zealand 12      1%
##                 Nicaragua 12      1%
##                     Niger 12      1%
##                   Nigeria 12      1%
##                    Norway 12      1%
##                      Oman 12      1%
##                  Pakistan 12      1%
##                    Panama 12      1%
##                  Paraguay 12      1%
##                      Peru 12      1%
##               Philippines 12      1%
##                    Poland 12      1%
##                  Portugal 12      1%
##               Puerto Rico 12      1%
##                   Reunion 12      1%
##                   Romania 12      1%
##                    Rwanda 12      1%
##     Sao Tome and Principe 12      1%
##              Saudi Arabia 12      1%
##                   Senegal 12      1%
##                    Serbia 12      1%
##              Sierra Leone 12      1%
##                 Singapore 12      1%
##           Slovak Republic 12      1%
##                  Slovenia 12      1%
##                   Somalia 12      1%
##              South Africa 12      1%
##                     Spain 12      1%
##                 Sri Lanka 12      1%
##                     Sudan 12      1%
##                 Swaziland 12      1%
##                    Sweden 12      1%
##               Switzerland 12      1%
##                     Syria 12      1%
##                    Taiwan 12      1%
##                  Tanzania 12      1%
##                  Thailand 12      1%
##                      Togo 12      1%
##       Trinidad and Tobago 12      1%
##                   Tunisia 12      1%
##                    Turkey 12      1%
##                    Uganda 12      1%
##            United Kingdom 12      1%
##             United States 12      1%
##                   Uruguay 12      1%
##                 Venezuela 12      1%
##                   Vietnam 12      1%
##        West Bank and Gaza 12      1%
##               Yemen, Rep. 12      1%
##                    Zambia 12      1%
##                  Zimbabwe 12      1%
\end{verbatim}

\hypertarget{create-variables}{%
\subsection{Create variables}\label{create-variables}}

\hypertarget{change-values-using-conditions}{%
\subsubsection{Change values using conditions}\label{change-values-using-conditions}}

You can think of \texttt{case\_when()} (multiple conditions) as an extended version of \texttt{ifelse()} (binary conditions).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars \textless{}{-}}\StringTok{ }\NormalTok{mtcars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{cyl\_dummy =} \KeywordTok{case\_when}\NormalTok{(}
\NormalTok{    cyl }\OperatorTok{\textgreater{}}\StringTok{ }\KeywordTok{median}\NormalTok{(cyl) }\OperatorTok{\textasciitilde{}}\StringTok{ "High"}\NormalTok{, }\CommentTok{\# if condition}
\NormalTok{    cyl }\OperatorTok{\textless{}}\StringTok{ }\KeywordTok{median}\NormalTok{(cyl) }\OperatorTok{\textasciitilde{}}\StringTok{ "Low"}\NormalTok{, }\CommentTok{\# else if condition}
    \OtherTok{TRUE} \OperatorTok{\textasciitilde{}}\StringTok{ "Median"}
\NormalTok{  )) }\CommentTok{\# else condition}

\NormalTok{mtcars }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{pull}\NormalTok{(cyl\_dummy)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "Median" "Median" "Low"    "Median" "High"   "Median" "High"   "Low"   
##  [9] "Low"    "Median" "Median" "High"   "High"   "High"   "High"   "High"  
## [17] "High"   "Low"    "Low"    "Low"    "Low"    "High"   "High"   "High"  
## [25] "High"   "Low"    "Low"    "Low"    "High"   "Median" "High"   "Low"
\end{verbatim}

\hypertarget{change-values-manually}{%
\subsubsection{Change values manually}\label{change-values-manually}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{cyl\_dummy =} \KeywordTok{recode}\NormalTok{(cyl\_dummy, }\CommentTok{\# Target column}
    \StringTok{"High"}\NormalTok{ =}\StringTok{ "2"}\NormalTok{, }\CommentTok{\# Old {-} New}
    \StringTok{"Low"}\NormalTok{ =}\StringTok{ "0"}\NormalTok{,}
    \StringTok{"Median"}\NormalTok{ =}\StringTok{ "1"}
\NormalTok{  )) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull}\NormalTok{(cyl\_dummy)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "1" "1" "0" "1" "2" "1" "2" "0" "0" "1" "1" "2" "2" "2" "2" "2" "2" "0" "0"
## [20] "0" "0" "2" "2" "2" "2" "0" "0" "0" "2" "1" "2" "0"
\end{verbatim}

\hypertarget{counting}{%
\subsection{Counting}\label{counting}}

\begin{itemize}
\tightlist
\item
  How many observations (country-year rows) are in each continent?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Let's arrange the result.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Just add a new argument \textasciigrave{}sort = TRUE\textasciigrave{}}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(continent, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Asia        396
## 3 Europe      360
## 4 Americas    300
## 5 Oceania      24
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Same as above; How nice!}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(n))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent     n
##   <fct>     <int>
## 1 Africa      624
## 2 Asia        396
## 3 Europe      360
## 4 Americas    300
## 5 Oceania      24
\end{verbatim}

\textbf{Challenge}

Count the number of observations per \texttt{continent} and \texttt{year} and arrange them in descending order.

Let's take a deeper look at how things work under the hood.

\begin{itemize}
\item
  \texttt{tally()} works similarly to \texttt{nrow()}: it calculates the total number of cases in a data frame.
\item
  \texttt{count()} = \texttt{group\_by()} + \texttt{tally()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tally}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 1
##       n
##   <int>
## 1  1704
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{add\_tally()} = \texttt{mutate(n\ =\ n())}
\end{itemize}

\textbf{Challenge}

What does \texttt{n} in the example below represent?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_tally}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1,704 x 3
##    continent country         n
##    <fct>     <fct>       <int>
##  1 Asia      Afghanistan  1704
##  2 Asia      Afghanistan  1704
##  3 Asia      Afghanistan  1704
##  4 Asia      Afghanistan  1704
##  5 Asia      Afghanistan  1704
##  6 Asia      Afghanistan  1704
##  7 Asia      Afghanistan  1704
##  8 Asia      Afghanistan  1704
##  9 Asia      Afghanistan  1704
## 10 Asia      Afghanistan  1704
## # ... with 1,694 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{add\_count()}
\end{itemize}

Add count as a column.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add count as a column}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_count}\NormalTok{(year)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1,704 x 7
## # Groups:   continent [5]
##    country     continent  year lifeExp      pop gdpPercap     n
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl> <int>
##  1 Afghanistan Asia       1952    28.8  8425333      779.    33
##  2 Afghanistan Asia       1957    30.3  9240934      821.    33
##  3 Afghanistan Asia       1962    32.0 10267083      853.    33
##  4 Afghanistan Asia       1967    34.0 11537966      836.    33
##  5 Afghanistan Asia       1972    36.1 13079460      740.    33
##  6 Afghanistan Asia       1977    38.4 14880372      786.    33
##  7 Afghanistan Asia       1982    39.9 12881816      978.    33
##  8 Afghanistan Asia       1987    40.8 13867957      852.    33
##  9 Afghanistan Asia       1992    41.7 16317921      649.    33
## 10 Afghanistan Asia       1997    41.8 22227415      635.    33
## # ... with 1,694 more rows
\end{verbatim}

\textbf{Challenge}

Do cases 1 and 2 in the code chunks below produce the same outputs? If so, why?

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Case 1}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 60 x 3
## # Groups:   continent, year [60]
##    continent  year     n
##    <fct>     <int> <int>
##  1 Africa     1952    52
##  2 Africa     1957    52
##  3 Africa     1962    52
##  4 Africa     1967    52
##  5 Africa     1972    52
##  6 Africa     1977    52
##  7 Africa     1982    52
##  8 Africa     1987    52
##  9 Africa     1992    52
## 10 Africa     1997    52
## # ... with 50 more rows
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Case 2}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(year)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 60 x 3
## # Groups:   continent [5]
##    continent  year     n
##    <fct>     <int> <int>
##  1 Africa     1952    52
##  2 Africa     1957    52
##  3 Africa     1962    52
##  4 Africa     1967    52
##  5 Africa     1972    52
##  6 Africa     1977    52
##  7 Africa     1982    52
##  8 Africa     1987    52
##  9 Africa     1992    52
## 10 Africa     1997    52
## # ... with 50 more rows
\end{verbatim}

\texttt{count()} is a simple function, but it is still helpful for learning an essential concept underlying complex data wrangling: the split-apply-combine strategy. For more information, read Wickham's (2011) article \href{http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.182.5667\&rep=rep1\&type=pdf}{``The Split-Apply-Combine Strategy for Data Analysis''}, published in the \emph{Journal of Statistical Software} (especially pages 7--8). \href{https://github.com/hadley/plyr}{\texttt{plyr}} was the (now retired) package that demonstrated this idea, which has since evolved in two directions: \href{https://dplyr.tidyverse.org/}{\texttt{dplyr}} (for data frames) and \href{https://purrr.tidyverse.org/}{\texttt{purrr}} (for lists).

\hypertarget{summarizing}{%
\subsection{Summarizing}\label{summarizing}}

\hypertarget{basic}{%
\subsubsection{Basic}\label{basic}}

\begin{itemize}
\tightlist
\item
  Create a summary
\item
  Think of \texttt{summarise()} as an extended version of \texttt{count()}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{n =} \KeywordTok{n}\NormalTok{(),}
    \DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{sd\_gdp =} \KeywordTok{sd}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##   continent     n mean_gdp sd_gdp
##   <fct>     <int>    <dbl>  <dbl>
## 1 Africa      624    2194.  2828.
## 2 Americas    300    7136.  6397.
## 3 Asia        396    7902. 14045.
## 4 Europe      360   14469.  9355.
## 5 Oceania      24   18622.  6359.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tablea \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{n =} \KeywordTok{n}\NormalTok{(),}
    \DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{sd\_gdp =} \KeywordTok{sd}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Produce publishable tables
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  kableExtra,}
\NormalTok{  flextable}
\NormalTok{)}

\CommentTok{\# For HTML and LaTeX}
\NormalTok{tablea }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\NormalTok{kableExtra}\OperatorTok{::}\KeywordTok{kable}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{tabular}{l|r|r|r}
\hline
continent & n & mean\_gdp & sd\_gdp\\
\hline
Africa & 624 & 2193.755 & 2827.930\\
\hline
Americas & 300 & 7136.110 & 6396.764\\
\hline
Asia & 396 & 7902.150 & 14045.373\\
\hline
Europe & 360 & 14469.476 & 9355.213\\
\hline
Oceania & 24 & 18621.609 & 6358.983\\
\hline
\end{tabular}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# For HTML and MS Office suite}
\NormalTok{tablea }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\NormalTok{flextable}\OperatorTok{::}\KeywordTok{flextable}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\providecommand{\docline}[3]{\noalign{\global\setlength{\arrayrulewidth}{#1}}\arrayrulecolor[HTML]{#2}\cline{#3}}

\setlength{\tabcolsep}{2pt}

\renewcommand*{\arraystretch}{1.5}

\begin{longtable}[c]{|p{0.75in}|p{0.75in}|p{0.75in}|p{0.75in}}


\hhline{>{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}-}

\multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedright}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{continent}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{n}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{mean\_gdp}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}!{\color[HTML]{000000}\vrule width 0pt}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{sd\_gdp}}}} \\

\hhline{>{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}-}

\endfirsthead

\hhline{>{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}-}

\multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedright}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{continent}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{n}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{mean\_gdp}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}!{\color[HTML]{000000}\vrule width 0pt}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{sd\_gdp}}}} \\

\hhline{>{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}-}\endhead


\multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedright}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{Africa}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{624}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{2,193.755}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}!{\color[HTML]{000000}\vrule width 0pt}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{2,827.930}}}} \\


\multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedright}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{Americas}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{300}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{7,136.110}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}!{\color[HTML]{000000}\vrule width 0pt}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{6,396.764}}}} \\


\multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedright}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{Asia}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{396}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{7,902.150}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}!{\color[HTML]{000000}\vrule width 0pt}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{14,045.373}}}} \\


\multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedright}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{Europe}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{360}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{14,469.476}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}!{\color[HTML]{000000}\vrule width 0pt}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{9,355.213}}}} \\


\multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedright}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{Oceania}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{24}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{18,621.609}}}} & \multicolumn{1}{!{\color[HTML]{000000}\vrule width 0pt}>{\raggedleft}p{\dimexpr 0.75in+0\tabcolsep+0\arrayrulewidth}!{\color[HTML]{000000}\vrule width 0pt}}{\fontsize{11}{11}\selectfont{\textcolor[HTML]{000000}{\global\setmainfont{DejaVu Sans}{6,358.983}}}} \\

\hhline{>{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}->{\arrayrulecolor[HTML]{666666}\global\arrayrulewidth=2pt}-}


\end{longtable}

\hypertarget{scoped-summaries}{%
\subsubsection{Scoped summaries}\label{scoped-summaries}}

\begin{itemize}
\item
  Old way
\item
  \texttt{summarise\_all()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a wide{-}shaped data example}
\NormalTok{wide\_gapminder \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_wider}\NormalTok{(}
    \DataTypeTok{names\_from =}\NormalTok{ country,}
    \DataTypeTok{values\_from =}\NormalTok{ gdpPercap}
\NormalTok{  )}

\CommentTok{\# Apply summarise\_all}
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_all}\NormalTok{(mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Herzeg~` Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>                <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.                3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>,
## #   `Slovak Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{summarise\_if()}: using a logical condition
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_if}\NormalTok{(is.double, mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 31
##   lifeExp Albania Austria Belgium `Bosnia and Herzegovina` Bulgaria Croatia
##     <dbl>   <dbl>   <dbl>   <dbl>                    <dbl>    <dbl>   <dbl>
## 1    71.9   3255.  20412.  19901.                    3485.    6384.   9332.
## # ... with 24 more variables: `Czech Republic` <dbl>, Denmark <dbl>,
## #   Finland <dbl>, France <dbl>, Germany <dbl>, Greece <dbl>, Hungary <dbl>,
## #   Iceland <dbl>, Ireland <dbl>, Italy <dbl>, Montenegro <dbl>,
## #   Netherlands <dbl>, Norway <dbl>, Poland <dbl>, Portugal <dbl>,
## #   Romania <dbl>, Serbia <dbl>, `Slovak Republic` <dbl>, Slovenia <dbl>,
## #   Spain <dbl>, Sweden <dbl>, Switzerland <dbl>, Turkey <dbl>,
## #   `United Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\item
  \texttt{summarise\_at()}
\item
  \texttt{vars()\ =\ select()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_at}\NormalTok{(}\KeywordTok{vars}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{)),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Herzeg~` Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>                <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.                3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>,
## #   `Slovak Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise\_at}\NormalTok{(}\KeywordTok{vars}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"life"}\NormalTok{)),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 1
##   lifeExp
##     <dbl>
## 1    71.9
\end{verbatim}

\textbf{Additional tips}

\begin{figure}
\centering
\includegraphics{https://github.com/rstudio/concept-maps/raw/master/en/regular-expressions.svg}
\caption{Concept map for regular expressions. By Monica Alonso, Greg Wilson.}
\end{figure}

\begin{itemize}
\item
  New way
\item
  \texttt{summarise()} + \texttt{across()}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://github.com/rstudio/concept-maps/raw/master/en/across.svg}
\caption{Concept map for across. By Emma Vestesson}
\end{figure}

\begin{itemize}
\item
  If you find using \texttt{summarise\_all()}, \texttt{summarise\_if()} and \texttt{summarise\_at()} confusing, here's a solution: use \texttt{summarise()} with \texttt{across()}.
\item
  \texttt{summarise\_all()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(Albania}\OperatorTok{:}\StringTok{\textasciigrave{}}\DataTypeTok{United Kingdom}\StringTok{\textasciigrave{}}\NormalTok{, mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Herzeg~` Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>                <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.                3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>,
## #   `Slovak Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{), mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Herzeg~` Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>                <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.                3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>,
## #   `Slovak Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{summarise\_if()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(is.double, mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Predicate functions must be wrapped in `where()`.
## 
##   # Bad
##   data %>% select(is.double)
## 
##   # Good
##   data %>% select(where(is.double))
## 
## i Please update your code.
## This message is displayed once per session.
\end{verbatim}

\begin{verbatim}
## # A tibble: 1 x 31
##   lifeExp Albania Austria Belgium `Bosnia and Herzegovina` Bulgaria Croatia
##     <dbl>   <dbl>   <dbl>   <dbl>                    <dbl>    <dbl>   <dbl>
## 1    71.9   3255.  20412.  19901.                    3485.    6384.   9332.
## # ... with 24 more variables: `Czech Republic` <dbl>, Denmark <dbl>,
## #   Finland <dbl>, France <dbl>, Germany <dbl>, Greece <dbl>, Hungary <dbl>,
## #   Iceland <dbl>, Ireland <dbl>, Italy <dbl>, Montenegro <dbl>,
## #   Netherlands <dbl>, Norway <dbl>, Poland <dbl>, Portugal <dbl>,
## #   Romania <dbl>, Serbia <dbl>, `Slovak Republic` <dbl>, Slovenia <dbl>,
## #   Spain <dbl>, Sweden <dbl>, Switzerland <dbl>, Turkey <dbl>,
## #   `United Kingdom` <dbl>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{summarise\_at()}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  ))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 30
##   Albania Austria Belgium `Bosnia and Herzeg~` Bulgaria Croatia `Czech Republic`
##     <dbl>   <dbl>   <dbl>                <dbl>    <dbl>   <dbl>            <dbl>
## 1   3255.  20412.  19901.                3485.    6384.   9332.           13920.
## # ... with 23 more variables: Denmark <dbl>, Finland <dbl>, France <dbl>,
## #   Germany <dbl>, Greece <dbl>, Hungary <dbl>, Iceland <dbl>, Ireland <dbl>,
## #   Italy <dbl>, Montenegro <dbl>, Netherlands <dbl>, Norway <dbl>,
## #   Poland <dbl>, Portugal <dbl>, Romania <dbl>, Serbia <dbl>,
## #   `Slovak Republic` <dbl>, Slovenia <dbl>, Spain <dbl>, Sweden <dbl>,
## #   Switzerland <dbl>, Turkey <dbl>, `United Kingdom` <dbl>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"life"}\NormalTok{),}
\NormalTok{    mean,}
    \DataTypeTok{na.rm =} \OtherTok{TRUE}
\NormalTok{  ))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 1
##   lifeExp
##     <dbl>
## 1    71.9
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{wide\_gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"A"}\NormalTok{, }\DataTypeTok{ignore.case =} \OtherTok{FALSE}\NormalTok{)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 360 x 2
##    Albania Austria
##      <dbl>   <dbl>
##  1   1601.      NA
##  2   1942.      NA
##  3   2313.      NA
##  4   2760.      NA
##  5   3313.      NA
##  6   3533.      NA
##  7   3631.      NA
##  8   3739.      NA
##  9   2497.      NA
## 10   3193.      NA
## # ... with 350 more rows
\end{verbatim}

Note that this workshop does not cover creating and manipulating variables using \texttt{mutate()} because many techniques you learned from playing with \texttt{summarise()} can be directly applied to \texttt{mutate()}.

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Summarize the average GDP of countries whose names start with the letter ``A.''
\item
  Turn the summary dataframe into a publishable table using either \texttt{kableExtra} or \texttt{flextable} package.
\end{enumerate}

\hypertarget{tabulation-tbd}{%
\subsubsection{Tabulation (TBD)}\label{tabulation-tbd}}

\hypertarget{grouping}{%
\subsection{Grouping}\label{grouping}}

\hypertarget{grouped-summaries}{%
\subsubsection{Grouped summaries}\label{grouped-summaries}}

\begin{itemize}
\item
  Calculate the mean of \texttt{gdpPercap}.
\item
  Some functions are designed to work together. For instance, the \texttt{group\_by()} function defines the strata you will use for summary statistics. Then, use \texttt{summarise()} to obtain summary statistics.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent mean_gdp
##   <fct>        <dbl>
## 1 Africa       2194.
## 2 Americas     7136.
## 3 Asia         7902.
## 4 Europe      14469.
## 5 Oceania     18622.
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Calculate multiple summary statistics.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{mean\_gdp =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{count =} \KeywordTok{n}\NormalTok{()}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent mean_gdp count
##   <fct>        <dbl> <int>
## 1 Africa       2194.   624
## 2 Americas     7136.   300
## 3 Asia         7902.   396
## 4 Europe      14469.   360
## 5 Oceania     18622.    24
\end{verbatim}

\textbf{Optional}

\begin{itemize}
\tightlist
\item
  Other summary statistics
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Measures of location and spread: \texttt{median(x)} (location); \texttt{sd(x)}, \texttt{IQR(x)}, \texttt{mad(x)} (the median absolute deviation)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The Interquartile Range = The Difference Between 75th and 25th Percentiles}

\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{IQR\_gdp =} \KeywordTok{IQR}\NormalTok{(gdpPercap))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent IQR_gdp
##   <fct>       <dbl>
## 1 Africa      1616.
## 2 Americas    4402.
## 3 Asia        7492.
## 4 Europe     13248.
## 5 Oceania     8072.
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Measures of rank: \texttt{min(x)}, \texttt{quantile(x,\ 0.25)}, \texttt{max(x)}
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\#}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{min\_gdp =} \KeywordTok{min}\NormalTok{(gdpPercap),}
    \DataTypeTok{max\_gdp =} \KeywordTok{max}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent min_gdp max_gdp
##   <fct>       <dbl>   <dbl>
## 1 Africa       241.  21951.
## 2 Americas    1202.  42952.
## 3 Asia         331  113523.
## 4 Europe       974.  49357.
## 5 Oceania    10040.  34435.
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Measures of position: \texttt{first(x)}, \texttt{last(x)}, \texttt{nth(x,\ 2)}
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{first\_gdp =} \KeywordTok{first}\NormalTok{(gdpPercap),}
    \DataTypeTok{last\_gdp =} \KeywordTok{last}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent first_gdp last_gdp
##   <fct>         <dbl>    <dbl>
## 1 Africa        2449.     470.
## 2 Americas      5911.   11416.
## 3 Asia           779.    2281.
## 4 Europe        1601.   33203.
## 5 Oceania      10040.   25185.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(gdpPercap) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Adding arrange}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}
    \DataTypeTok{first\_gdp =} \KeywordTok{first}\NormalTok{(gdpPercap),}
    \DataTypeTok{last\_gdp =} \KeywordTok{last}\NormalTok{(gdpPercap)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 3
##   continent first_gdp last_gdp
##   <fct>         <dbl>    <dbl>
## 1 Africa         241.   21951.
## 2 Americas      1202.   42952.
## 3 Asia           331   113523.
## 4 Europe         974.   49357.
## 5 Oceania      10040.   34435.
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Measures of counts: \texttt{n()} (all rows), \texttt{sum(!is.na(x))} (only non-missing rows), \texttt{n\_distinct(x)} (the number of distinct values)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{ns =} \KeywordTok{n}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent    ns
##   <fct>     <int>
## 1 Africa      624
## 2 Americas    300
## 3 Asia        396
## 4 Europe      360
## 5 Oceania      24
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{4}
\tightlist
\item
  Counts and proportions of logical values: \texttt{sum(condition\ about\ x)} (the number of TRUEs in x), \texttt{mean(condition\ about\ x)} (the proportion of TRUEs in x)
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{rich\_countries =} \KeywordTok{mean}\NormalTok{(gdpPercap }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{20000}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   continent rich_countries
##   <fct>              <dbl>
## 1 Africa           0.00481
## 2 Americas         0.05   
## 3 Asia             0.111  
## 4 Europe           0.261  
## 5 Oceania          0.333
\end{verbatim}

\textbf{Additional tips}

Also, check out window functions such as \texttt{cumsum()} and \texttt{lag()}. Window functions are a variant of aggregate functions: they take a vector as input and return a vector of the same length as output.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{vec \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{)}

\CommentTok{\# Typical aggregate function}
\KeywordTok{sum}\NormalTok{(vec) }\CommentTok{\# The output length is one}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 55
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Window function}
\KeywordTok{cumsum}\NormalTok{(vec) }\CommentTok{\# The output length is ten}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1]  1  3  6 10 15 21 28 36 45 55
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Let\textquotesingle{}s compare them side{-}by{-}side}
\KeywordTok{compare}\NormalTok{(}
  \KeywordTok{sum}\NormalTok{(vec),}
  \KeywordTok{cumsum}\NormalTok{(vec)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `old`: 55                         
## `new`:  1 3 6 10 15 21 28 36 45 55
\end{verbatim}

\hypertarget{joining}{%
\subsection{Joining}\label{joining}}

Relational data = multiple tables of data

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/245292d1ea724f6c3fd8a92063dcd7bfb9758d02/5751b/diagrams/relational-nycflights.png}
\caption{Relational data example}
\end{figure}

\textbf{Key ideas}

\begin{itemize}
\tightlist
\item
  A \textbf{primary key} ``uniquely identifies an observation in its table''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Example}
\NormalTok{planes}\OperatorTok{$}\NormalTok{tailnum }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{head}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "N10156" "N102UW" "N103US" "N104UW" "N10575" "N105UW"
\end{verbatim}

Verify primary key

\texttt{tailnum} should be unique.

\textbf{Challenge}

What do you expect the outcome to be?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{planes }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(tailnum) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(n }\OperatorTok{\textgreater{}}\StringTok{ }\DecValTok{1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 0 x 2
## # ... with 2 variables: tailnum <chr>, n <int>
\end{verbatim}

\textbf{Optional}

If a dataframe doesn't have a primary key, you can add one called a \textbf{surrogate} key.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Toy example}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\DecValTok{4}\OperatorTok{:}\DecValTok{6}\NormalTok{)}
\NormalTok{)}

\CommentTok{\# Add a row\_index column}
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{df }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{rowid\_to\_column}\NormalTok{(}\StringTok{"ID"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  A \textbf{foreign} key ``uniquely identifies an observation in another table.''
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights}\OperatorTok{$}\NormalTok{tailnum }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{head}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "N14228" "N24211" "N619AA" "N804JB" "N668DN" "N39463"
\end{verbatim}

For joining, don't be distracted by other details and focus on KEYS!

\hypertarget{mutating-joins}{%
\subsubsection{Mutating joins}\label{mutating-joins}}

\begin{quote}
``Add new variables to one data frame from matching observations in another.''
\end{quote}

Using a simple toy example is great because it is easy to see how things work in such a narrow context.

\begin{itemize}
\tightlist
\item
  Toy example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Table 1}
\NormalTok{x \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{key =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{),}
  \DataTypeTok{val\_x =} \KeywordTok{c}\NormalTok{(}\StringTok{"x1"}\NormalTok{, }\StringTok{"x2"}\NormalTok{, }\StringTok{"x3"}\NormalTok{, }\StringTok{"x4"}\NormalTok{)}
\NormalTok{)}

\CommentTok{\# Table 2}
\NormalTok{y \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{key =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{5}\NormalTok{),}
  \DataTypeTok{val\_y =} \KeywordTok{c}\NormalTok{(}\StringTok{"y1"}\NormalTok{, }\StringTok{"y2"}\NormalTok{, }\StringTok{"y3"}\NormalTok{, }\StringTok{"y4"}\NormalTok{, }\StringTok{"y5"}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Inner Join
\end{itemize}

\texttt{inner\_join()} keeps the matched values in both tables. If the left table is a subset of the right table, then \texttt{left\_join()} is the same as \texttt{inner\_join()}.

\textbf{Challenge}

What are the shared keys going to be?

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{inner\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 4 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4
\end{verbatim}

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/aeab386461820b029b7e7606ccff1286f623bae1/ef0d4/diagrams/join-venn.png}
\caption{Mutating joins}
\end{figure}

\begin{itemize}
\tightlist
\item
  Left Join
\end{itemize}

\texttt{left\_join()}, \texttt{right\_join()} and \texttt{full\_join()} are outer join functions. Unlike \texttt{inner\_join()}, outer join functions keep observations that appear in at least one of the tables.

\texttt{left\_join()} keeps all observations in the left table and only the matched observations in the right table.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{left\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 4 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Right Join
\end{itemize}

\texttt{right\_join()} does the opposite.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{right\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4   
## 5     5 <NA>  y5
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Full Join
\end{itemize}

\texttt{full\_join()} keeps the observations from both tables. If an observation has no match, NAs are recorded for the variables that come from the other table.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{full\_join}\NormalTok{(x, y)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "key"
\end{verbatim}

\begin{verbatim}
## # A tibble: 5 x 3
##     key val_x val_y
##   <int> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    y3   
## 4     4 x4    y4   
## 5     5 <NA>  y5
\end{verbatim}

\hypertarget{filtering-joins}{%
\subsubsection{Filtering joins}\label{filtering-joins}}

\begin{quote}
Filter observations from one data frame based on whether they match an observation in the other table.
\end{quote}

\begin{itemize}
\tightlist
\item
  Semi Join
\end{itemize}

In SQL, this type of query is also called a subquery.

\begin{itemize}
\tightlist
\item
  Filtering without joining
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create the list of the top 10 destinations}
\NormalTok{top\_dest \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(dest, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Selecting by n
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Filter}
\NormalTok{filtered \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(dest }\OperatorTok{\%in\%}\StringTok{ }\NormalTok{top\_dest}\OperatorTok{$}\NormalTok{dest)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Using semi join: only keep (INCLUDE) the rows that were matched between the two tables
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{joined \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{semi\_join}\NormalTok{(top\_dest)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "dest"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{head}\NormalTok{(filtered }\OperatorTok{==}\StringTok{ }\NormalTok{joined)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##      year month  day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## [1,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [2,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [3,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [4,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [5,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
## [6,] TRUE  TRUE TRUE     TRUE           TRUE      TRUE     TRUE           TRUE
##      arr_delay carrier flight tailnum origin dest air_time distance hour minute
## [1,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [2,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [3,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [4,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [5,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
## [6,]      TRUE    TRUE   TRUE    TRUE   TRUE TRUE     TRUE     TRUE TRUE   TRUE
##      time_hour
## [1,]      TRUE
## [2,]      TRUE
## [3,]      TRUE
## [4,]      TRUE
## [5,]      TRUE
## [6,]      TRUE
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Anti Join
\end{itemize}

\texttt{anti\_join()} does the opposite: it excludes the rows that were matched between the two tables. This is a great technique for filtering stopwords when you do computational text analysis.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{anti\_join}\NormalTok{(planes, }\DataTypeTok{by =} \StringTok{"tailnum"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(tailnum, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 722 x 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # ... with 712 more rows
\end{verbatim}

\hypertarget{modeling-broom}{%
\section{Modeling (broom)}\label{modeling-broom}}

\hypertarget{nesting}{%
\subsection{Nesting}\label{nesting}}

\hypertarget{nest}{%
\subsubsection{nest}\label{nest}}

The following example comes from \href{https://r4ds.had.co.nz/many-models.html}{R for Data Science} by Garrett Grolemund and Hadley Wickham.

\begin{itemize}
\tightlist
\item
  How can you run multiple models simultaneously? Using a nested data frame.
\end{itemize}

\begin{itemize}
\item
  \textbf{Grouped data: each row = an observation}
\item
  \textbf{Nested data: each row = a group}
\end{itemize}

\textbf{Challenge}

Why did we use \texttt{country} and \texttt{continent} for nesting variables in the following example?

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nested \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nest}\NormalTok{()}

\KeywordTok{head}\NormalTok{(nested)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 3
## # Groups:   country, continent [6]
##   country     continent data             
##   <fct>       <fct>     <list>           
## 1 Afghanistan Asia      <tibble [12 x 4]>
## 2 Albania     Europe    <tibble [12 x 4]>
## 3 Algeria     Africa    <tibble [12 x 4]>
## 4 Angola      Africa    <tibble [12 x 4]>
## 5 Argentina   Americas  <tibble [12 x 4]>
## 6 Australia   Oceania   <tibble [12 x 4]>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nested}\OperatorTok{$}\NormalTok{data }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{pluck}\NormalTok{(}\DecValTok{1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 12 x 4
##     year lifeExp      pop gdpPercap
##    <int>   <dbl>    <int>     <dbl>
##  1  1952    28.8  8425333      779.
##  2  1957    30.3  9240934      821.
##  3  1962    32.0 10267083      853.
##  4  1967    34.0 11537966      836.
##  5  1972    36.1 13079460      740.
##  6  1977    38.4 14880372      786.
##  7  1982    39.9 12881816      978.
##  8  1987    40.8 13867957      852.
##  9  1992    41.7 16317921      649.
## 10  1997    41.8 22227415      635.
## 11  2002    42.1 25268405      727.
## 12  2007    43.8 31889923      975.
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Custom function
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{lm\_model \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(df) \{}
  \KeywordTok{lm}\NormalTok{(lifeExp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{year, }\DataTypeTok{data =}\NormalTok{ df)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Apply function to the nested data
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Apply lm\_model to the nested data}

\NormalTok{nested \textless{}{-}}\StringTok{ }\NormalTok{nested }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{models =} \KeywordTok{map}\NormalTok{(data, lm\_model)) }\CommentTok{\# Add the list object as a new column}

\KeywordTok{head}\NormalTok{(nested)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 6 x 4
## # Groups:   country, continent [6]
##   country     continent data              models
##   <fct>       <fct>     <list>            <list>
## 1 Afghanistan Asia      <tibble [12 x 4]> <lm>  
## 2 Albania     Europe    <tibble [12 x 4]> <lm>  
## 3 Algeria     Africa    <tibble [12 x 4]> <lm>  
## 4 Angola      Africa    <tibble [12 x 4]> <lm>  
## 5 Argentina   Americas  <tibble [12 x 4]> <lm>  
## 6 Australia   Oceania   <tibble [12 x 4]> <lm>
\end{verbatim}

S3 is part of R's object-oriented systems. If you need further information, check out \href{http://adv-r.had.co.nz/S3.html}{this section} in Hadley's Advanced R.

\hypertarget{unnest}{%
\subsubsection{unnest}\label{unnest}}

\begin{itemize}
\tightlist
\item
  glance()
\end{itemize}

The \texttt{glance()} function from the \texttt{broom} package inspects the quality of a statistical model.

\textbf{Additional tips}

\begin{itemize}
\tightlist
\item
  \texttt{broom::glance(model)}: for evaluating model quality and/or complexity
\item
  \texttt{broom::tidy(model)}: for extracting each coefficient in the model (the estimates + their variability)
\item
  \texttt{broom::augment(model,\ data)}: for getting extra values (residuals, and influence statistics). A convenient tool if you want to plot fitted values and raw data together.
\end{itemize}

Video: \href{https://www.youtube.com/watch?v=7VGPUBWGv6g\&ab_channel=Work-Bench}{Broom: Converting Statistical Models to Tidy Data Frames by David Robinson}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{glanced \textless{}{-}}\StringTok{ }\NormalTok{nested }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{glance =} \KeywordTok{map}\NormalTok{(models, broom}\OperatorTok{::}\NormalTok{glance))}

\CommentTok{\# Pluck the first item on the list}
\NormalTok{glanced}\OperatorTok{$}\NormalTok{glance }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{pluck}\NormalTok{(}\DecValTok{1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic      p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>        <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.948         0.942  1.22      181. 0.0000000984     1  -18.3  42.7  44.1
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Pull p.value}
\NormalTok{glanced}\OperatorTok{$}\NormalTok{glance }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pluck}\NormalTok{(}\DecValTok{1}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull}\NormalTok{(p.value)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##        value 
## 9.835213e-08
\end{verbatim}

\texttt{unnest()} unpacks the list objects stored in the \texttt{glance} column.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{glanced }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest}\NormalTok{(glance) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{arrange}\NormalTok{(r.squared)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 142 x 16
## # Groups:   country, continent [142]
##    country     continent data     models r.squared adj.r.squared sigma statistic
##    <fct>       <fct>     <list>   <list>     <dbl>         <dbl> <dbl>     <dbl>
##  1 Rwanda      Africa    <tibble> <lm>      0.0172      -0.0811   6.56     0.175
##  2 Botswana    Africa    <tibble> <lm>      0.0340      -0.0626   6.11     0.352
##  3 Zimbabwe    Africa    <tibble> <lm>      0.0562      -0.0381   7.21     0.596
##  4 Zambia      Africa    <tibble> <lm>      0.0598      -0.0342   4.53     0.636
##  5 Swaziland   Africa    <tibble> <lm>      0.0682      -0.0250   6.64     0.732
##  6 Lesotho     Africa    <tibble> <lm>      0.0849      -0.00666  5.93     0.927
##  7 Cote d'Ivo~ Africa    <tibble> <lm>      0.283        0.212    3.93     3.95 
##  8 South Afri~ Africa    <tibble> <lm>      0.312        0.244    4.74     4.54 
##  9 Uganda      Africa    <tibble> <lm>      0.342        0.276    3.19     5.20 
## 10 Congo, Dem~ Africa    <tibble> <lm>      0.348        0.283    2.43     5.34 
## # ... with 132 more rows, and 8 more variables: p.value <dbl>, df <dbl>,
## #   logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>,
## #   nobs <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{glanced }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest}\NormalTok{(glance) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(continent, r.squared)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_jitter}\NormalTok{(}\DataTypeTok{width =} \FloatTok{0.5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-167-1.pdf}

\begin{itemize}
\tightlist
\item
  tidy()
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{nested \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nest}\NormalTok{()}

\NormalTok{nested \textless{}{-}}\StringTok{ }\NormalTok{nested }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{models =} \KeywordTok{map}\NormalTok{(data, }\OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{lm}\NormalTok{(lifeExp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{year }\OperatorTok{+}\StringTok{ }\NormalTok{country, }\DataTypeTok{data =}\NormalTok{ .)))}

\NormalTok{tidied \textless{}{-}}\StringTok{ }\NormalTok{nested }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{tidied =} \KeywordTok{map}\NormalTok{(models, broom}\OperatorTok{::}\NormalTok{tidy))}

\NormalTok{model\_out \textless{}{-}}\StringTok{ }\NormalTok{tidied }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest}\NormalTok{(tidied) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{term =} \KeywordTok{str\_replace}\NormalTok{(term, }\StringTok{"country"}\NormalTok{, }\StringTok{""}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{select}\NormalTok{(continent, term, estimate, p.value) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{p\_threshold =} \KeywordTok{ifelse}\NormalTok{(p.value }\OperatorTok{\textless{}}\StringTok{ }\FloatTok{0.05}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{))}

\NormalTok{model\_out }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(p\_threshold }\OperatorTok{==}\StringTok{ }\DecValTok{1}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull}\NormalTok{(term) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unique}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   [1] "(Intercept)"              "year"                    
##   [3] "Bahrain"                  "Bangladesh"              
##   [5] "Cambodia"                 "China"                   
##   [7] "Hong Kong, China"         "India"                   
##   [9] "Indonesia"                "Iran"                    
##  [11] "Iraq"                     "Israel"                  
##  [13] "Japan"                    "Jordan"                  
##  [15] "Korea, Dem. Rep."         "Korea, Rep."             
##  [17] "Kuwait"                   "Lebanon"                 
##  [19] "Malaysia"                 "Mongolia"                
##  [21] "Myanmar"                  "Nepal"                   
##  [23] "Oman"                     "Pakistan"                
##  [25] "Philippines"              "Saudi Arabia"            
##  [27] "Singapore"                "Sri Lanka"               
##  [29] "Syria"                    "Taiwan"                  
##  [31] "Thailand"                 "Vietnam"                 
##  [33] "West Bank and Gaza"       "Yemen, Rep."             
##  [35] "Austria"                  "Belgium"                 
##  [37] "Croatia"                  "Czech Republic"          
##  [39] "Denmark"                  "Finland"                 
##  [41] "France"                   "Germany"                 
##  [43] "Greece"                   "Iceland"                 
##  [45] "Ireland"                  "Italy"                   
##  [47] "Montenegro"               "Netherlands"             
##  [49] "Norway"                   "Poland"                  
##  [51] "Portugal"                 "Slovak Republic"         
##  [53] "Slovenia"                 "Spain"                   
##  [55] "Sweden"                   "Switzerland"             
##  [57] "Turkey"                   "United Kingdom"          
##  [59] "Angola"                   "Benin"                   
##  [61] "Botswana"                 "Burkina Faso"            
##  [63] "Burundi"                  "Cameroon"                
##  [65] "Central African Republic" "Chad"                    
##  [67] "Comoros"                  "Congo, Dem. Rep."        
##  [69] "Congo, Rep."              "Cote d'Ivoire"           
##  [71] "Djibouti"                 "Equatorial Guinea"       
##  [73] "Eritrea"                  "Ethiopia"                
##  [75] "Gabon"                    "Gambia"                  
##  [77] "Ghana"                    "Guinea"                  
##  [79] "Guinea-Bissau"            "Kenya"                   
##  [81] "Lesotho"                  "Liberia"                 
##  [83] "Madagascar"               "Malawi"                  
##  [85] "Mali"                     "Mauritania"              
##  [87] "Mauritius"                "Mozambique"              
##  [89] "Namibia"                  "Niger"                   
##  [91] "Nigeria"                  "Reunion"                 
##  [93] "Rwanda"                   "Senegal"                 
##  [95] "Sierra Leone"             "Somalia"                 
##  [97] "South Africa"             "Sudan"                   
##  [99] "Swaziland"                "Tanzania"                
## [101] "Togo"                     "Uganda"                  
## [103] "Zambia"                   "Zimbabwe"                
## [105] "Bolivia"                  "Brazil"                  
## [107] "Canada"                   "Colombia"                
## [109] "Dominican Republic"       "Ecuador"                 
## [111] "El Salvador"              "Guatemala"               
## [113] "Haiti"                    "Honduras"                
## [115] "Mexico"                   "Nicaragua"               
## [117] "Paraguay"                 "Peru"                    
## [119] "Puerto Rico"              "Trinidad and Tobago"     
## [121] "United States"            "Venezuela"               
## [123] "New Zealand"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model\_out }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(p\_threshold }\OperatorTok{==}\StringTok{ }\DecValTok{0}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull}\NormalTok{(term) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unique}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "Bosnia and Herzegovina" "Bulgaria"               "Hungary"               
##  [4] "Romania"                "Serbia"                 "Egypt"                 
##  [7] "Libya"                  "Morocco"                "Sao Tome and Principe" 
## [10] "Tunisia"                "Chile"                  "Costa Rica"            
## [13] "Cuba"                   "Jamaica"                "Panama"                
## [16] "Uruguay"
\end{verbatim}

\hypertarget{mapping}{%
\subsection{Mapping}\label{mapping}}

We got a taste of how the \texttt{map()} function works. Let's dig into it in more depth, as this family of functions is useful. See Rebecca Barter's excellent tutorial on the \texttt{purrr} package for more information. In her words, this is ``the tidyverse's answer to apply functions for iteration''. The \texttt{map()} function can take a vector (of any type), a list, or a data frame as input.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{multiply \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x) \{}
\NormalTok{  x }\OperatorTok{*}\StringTok{ }\NormalTok{x}
\NormalTok{\}}

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}
  \DataTypeTok{first\_obs =} \KeywordTok{rnorm}\NormalTok{(}\DecValTok{7}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DataTypeTok{sd =} \DecValTok{1}\NormalTok{),}
  \DataTypeTok{second\_obs =} \KeywordTok{rnorm}\NormalTok{(}\DecValTok{7}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DataTypeTok{sd =} \DecValTok{2}\NormalTok{)}
\NormalTok{) }\CommentTok{\# normal distribution}
\end{Highlighting}
\end{Shaded}

\textbf{Challenge}

Try \texttt{map\_df(.x\ =\ df,\ .f\ =\ multiply)} and tell me what the difference is between the output you got and what you saw earlier.

If you want to know more about the power and joy of functional programming in R (e.g., \texttt{purrr::map()}), then please take the \href{https://github.com/dlab-berkeley/R-functional-programming}{``How to Automate Repeated Things in R''} workshop.

\hypertarget{hypothesis-testing}{%
\subsection{Hypothesis testing}\label{hypothesis-testing}}

Statistical inference: does the effect/difference in observed data occur by chance?

Null hypothesis: everything was random.

Alternative hypothesis: everything was not random. Note that this does not mean that a particular factor influenced the outcome of interest. Statistical inference \(\neq\) Causal inference (causes and effects).

\(Y = X_{1} + X_{2} + X_{3} + \epsilon\)

\href{https://github.com/tidymodels/infer}{\texttt{infer}} is for tidyverse-friendly statistical inference.

\textbf{Workflow}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \texttt{specify()} specify a formula
\item
  \texttt{hypothesize()} declare the null hypothesis
\item
  \texttt{generate()} generate data based on the null hypothesis
\item
  \texttt{calculate()} calculate a distribution of statistics from the generated data to form the null distribution
\end{enumerate}

\begin{figure}
\centering
\includegraphics{https://raw.githubusercontent.com/tidymodels/infer/master/figs/ht-diagram.png}
\caption{From infer package}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{log\_pop =} \KeywordTok{log}\NormalTok{(pop))}

\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ log\_pop, }\DataTypeTok{y =}\NormalTok{ lifeExp), }\DataTypeTok{data =}\NormalTok{ gapminder) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"lm"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-170-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Calculate the observed statistic: Observed slopes}
\NormalTok{observed\_slopes \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# specify(formula = lifeExp \textasciitilde{} log\_pop) \%\textgreater{}\%}
\StringTok{  }\KeywordTok{specify}\NormalTok{(}\DataTypeTok{formula =}\NormalTok{ lifeExp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{log\_pop) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{calculate}\NormalTok{(}\DataTypeTok{stat =} \StringTok{"slope"}\NormalTok{)}

\CommentTok{\# Generate the null distribution: Null slopes}
\NormalTok{null\_slopes \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Specify a formula}
\StringTok{  }\KeywordTok{specify}\NormalTok{(}\DataTypeTok{formula =}\NormalTok{ lifeExp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{log\_pop) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Hypothesize (point estimation)}
\StringTok{  }\KeywordTok{hypothesize}\NormalTok{(}\DataTypeTok{null =} \StringTok{"point"}\NormalTok{, }\DataTypeTok{mu =} \DecValTok{0}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Generate sampling distributions (bootstrapping)}
\StringTok{  }\KeywordTok{generate}\NormalTok{(}\DataTypeTok{reps =} \DecValTok{1000}\NormalTok{, }\DataTypeTok{type =} \StringTok{"bootstrap"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Calculate statistics}
\StringTok{  }\KeywordTok{calculate}\NormalTok{(}\DataTypeTok{stat =} \StringTok{"slope"}\NormalTok{)}

\CommentTok{\# Return data}
\NormalTok{null\_slopes }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# p{-}value is just the probability that observed pattern could arise if the null hypothesis was true}
\StringTok{  }\CommentTok{\# In social science convention, if the p{-}value is below 0.05 (note: this is totally arbitrary), then the observed distribution is statistically significant.}
\StringTok{  }\KeywordTok{get\_p\_value}\NormalTok{(}
    \DataTypeTok{obs\_stat =}\NormalTok{ observed\_slopes,}
    \DataTypeTok{direction =} \StringTok{"both"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.972
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Visualize output}
\KeywordTok{visualize}\NormalTok{(null\_slopes) }\OperatorTok{+}
\StringTok{  }\KeywordTok{shade\_p\_value}\NormalTok{(}
    \DataTypeTok{obs\_stat =}\NormalTok{ observed\_slopes,}
    \DataTypeTok{direction =} \StringTok{"both"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-170-2.pdf}

\hypertarget{visualizing-ggplot2}{%
\section{Visualizing (ggplot2)}\label{visualizing-ggplot2}}

\begin{itemize}
\item
  The following material is adapted from Kieran Healy's excellent book (2019) on \href{https://socviz.co/}{data visualization} and Hadley Wickham's equally excellent book on \href{https://ggplot2-book.org/}{ggplot2}. For more theoretical discussions, I recommend reading \href{https://link.springer.com/book/10.1007\%2F0-387-28695-0}{The Grammar of Graphics} by Leland Wilkinson.
\item
  Why should we care about data visualization? More precisely, why should we learn the grammar of statistical graphics?
\item
  Sometimes, pictures are better tools than words in 1) exploring, 2) understanding, and 3) explaining data.
\end{itemize}

\hypertarget{motivation-2}{%
\subsection{Motivation}\label{motivation-2}}

\href{https://en.wikipedia.org/wiki/Frank_Anscombe}{Anscombe}'s quartet comprises four datasets, which are very alike in terms of their descriptive statistics but quite different when presented graphically.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Set theme}
\KeywordTok{theme\_set}\NormalTok{(}\KeywordTok{theme\_minimal}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Data}
\NormalTok{anscombe}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    x1 x2 x3 x4    y1   y2    y3    y4
## 1  10 10 10  8  8.04 9.14  7.46  6.58
## 2   8  8  8  8  6.95 8.14  6.77  5.76
## 3  13 13 13  8  7.58 8.74 12.74  7.71
## 4   9  9  9  8  8.81 8.77  7.11  8.84
## 5  11 11 11  8  8.33 9.26  7.81  8.47
## 6  14 14 14  8  9.96 8.10  8.84  7.04
## 7   6  6  6  8  7.24 6.13  6.08  5.25
## 8   4  4  4 19  4.26 3.10  5.39 12.50
## 9  12 12 12  8 10.84 9.13  8.15  5.56
## 10  7  7  7  8  4.82 7.26  6.42  7.91
## 11  5  5  5  8  5.68 4.74  5.73  6.89
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Correlation}
\KeywordTok{cor}\NormalTok{(anscombe)[}\KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{4}\NormalTok{), }\KeywordTok{c}\NormalTok{(}\DecValTok{5}\OperatorTok{:}\DecValTok{8}\NormalTok{)]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##            y1         y2         y3         y4
## x1  0.8164205  0.8162365  0.8162867 -0.3140467
## x2  0.8164205  0.8162365  0.8162867 -0.3140467
## x3  0.8164205  0.8162365  0.8162867 -0.3140467
## x4 -0.5290927 -0.7184365 -0.3446610  0.8165214
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# gather and select}
\NormalTok{anscombe\_processed \textless{}{-}}\StringTok{ }\NormalTok{anscombe }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(x\_name, x\_value, x1}\OperatorTok{:}\NormalTok{x4) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather}\NormalTok{(y\_name, y\_value, y1}\OperatorTok{:}\NormalTok{y4)}

\CommentTok{\# plot}
\NormalTok{anscombe\_processed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ x\_value, }\DataTypeTok{y =}\NormalTok{ y\_value)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =}\NormalTok{ lm, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(x\_name }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{y\_name) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"X values"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Y values"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Anscombe\textquotesingle{}s quartet"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-174-1.pdf}

\hypertarget{the-grammar-of-graphics}{%
\subsection{The grammar of graphics}\label{the-grammar-of-graphics}}

\begin{itemize}
\item
  the grammar of graphics

  \begin{itemize}
  \tightlist
  \item
    data
  \item
    aesthetic attributes (color, shape, size)
  \item
    geometric objects (points, lines, bars)
  \item
    stats (summary stats)
  \item
    scales (map values in the data space)
  \item
    coord (data coordinates)
  \item
    facet (facetting specifications)
  \end{itemize}
\end{itemize}

No worries about new terms. We're going to learn them by actually plotting.

\begin{itemize}
\item
  Workflow:

  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \tightlist
  \item
    Tidy data
  \item
    Mapping
  \item
    Geom
  \item
    Co-ordinates and scales
  \item
    Labels and guides
  \item
    Themes
  \item
    Save files
  \end{enumerate}
\end{itemize}

\hypertarget{mapping-and-geom}{%
\subsection{Mapping and geom}\label{mapping-and-geom}}

\begin{itemize}
\item
  \texttt{aes} (aesthetic mappings or aesthetics) tells which variables (x, y) in your data should be represented by which visual elements (color, shape, size) in the plot.
\item
  \texttt{geom\_} specifies the type of plot you are going to use.
\end{itemize}

\hypertarget{basic-aes-x-y}{%
\subsection{Basic aes (x, y)}\label{basic-aes-x-y}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp)}
\NormalTok{) }\CommentTok{\# ggplot or R in general takes positional arguments too. So, you don\textquotesingle{}t need to name data, mapping each time you use ggplot2.}

\NormalTok{p}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-175-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-175-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_smooth}\NormalTok{() }\CommentTok{\# geom\_smooth has calculated a smoothed line;}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-175-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# the shaded area is the standard error for the line}
\end{Highlighting}
\end{Shaded}

\hypertarget{univariate-distribution}{%
\subsection{Univariate distribution}\label{univariate-distribution}}

\begin{itemize}
\tightlist
\item
  \texttt{geom\_histogram()}: For the probability distribution of a continuous variable. Bins divide the entire range of values into a series of intervals (see \href{https://en.wikipedia.org/wiki/Histogram}{the Wiki entry}).
\item
  \texttt{geom\_density()}: Also for the probability distribution of a continuous variable. It calculates a \href{https://en.wikipedia.org/wiki/Kernel_density_estimation}{kernel density estimate} of the underlying distribution.
\end{itemize}

\hypertarget{histogram}{%
\subsubsection{Histogram}\label{histogram}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{data}\NormalTok{(midwest) }\CommentTok{\# load midwest dataset}

\NormalTok{midwest}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 437 x 28
##      PID county  state  area poptotal popdensity popwhite popblack popamerindian
##    <int> <chr>   <chr> <dbl>    <int>      <dbl>    <int>    <int>         <int>
##  1   561 ADAMS   IL    0.052    66090      1271.    63917     1702            98
##  2   562 ALEXAN~ IL    0.014    10626       759      7054     3496            19
##  3   563 BOND    IL    0.022    14991       681.    14477      429            35
##  4   564 BOONE   IL    0.017    30806      1812.    29344      127            46
##  5   565 BROWN   IL    0.018     5836       324.     5264      547            14
##  6   566 BUREAU  IL    0.05     35688       714.    35157       50            65
##  7   567 CALHOUN IL    0.017     5322       313.     5298        1             8
##  8   568 CARROLL IL    0.027    16805       622.    16519      111            30
##  9   569 CASS    IL    0.024    13437       560.    13384       16             8
## 10   570 CHAMPA~ IL    0.058   173025      2983.   146506    16559           331
## # ... with 427 more rows, and 19 more variables: popasian <int>,
## #   popother <int>, percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
## #   percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
## #   percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
## #   percpovertyknown <dbl>, percbelowpoverty <dbl>, percchildbelowpovert <dbl>,
## #   percadultpoverty <dbl>, percelderlypoverty <dbl>, inmetro <int>,
## #   category <chr>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\CommentTok{\# not working.}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_histogram}\NormalTok{() }\CommentTok{\# stat\_bin argument picks up 30 bins (or "bucket") by default.}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-178-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_histogram}\NormalTok{(}\DataTypeTok{bins =} \DecValTok{10}\NormalTok{) }\CommentTok{\# only 10 bins.}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-178-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =} \KeywordTok{subset}\NormalTok{(midwest, state }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"OH"}\NormalTok{, }\StringTok{"IN"}\NormalTok{)),}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ percollege, }\DataTypeTok{fill =}\NormalTok{ state)}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_histogram}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.7}\NormalTok{, }\DataTypeTok{bins =} \DecValTok{20}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-178-3.pdf}

\hypertarget{density}{%
\subsubsection{Density}\label{density}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{midwest }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ area, }\DataTypeTok{fill =}\NormalTok{ state, }\DataTypeTok{color =}\NormalTok{ state)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_density}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-179-1.pdf}

\hypertarget{advanced-aes-size-color}{%
\subsection{Advanced aes (size, color)}\label{advanced-aes-size-color}}

\begin{itemize}
\item
  There's also a \texttt{fill} argument (mostly used in \texttt{geom\_bar()}). The color \texttt{aes} affects the appearance of lines and points, whereas fill is for the filled areas of bars, polygons, and in some cases, the interior of a smoother's standard error ribbon.
\item
  The property size/color/fill represents\ldots{}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{size =}\NormalTok{ pop}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-180-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{size =}\NormalTok{ pop,}
    \DataTypeTok{color =}\NormalTok{ continent}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-181-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# try red instead of "red"}
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{size =}\NormalTok{ pop,}
    \DataTypeTok{color =} \StringTok{"red"}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-182-1.pdf}

Aesthetics can also be mapped per geom.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-183-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# alpha controls transparency}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{color =} \StringTok{"red"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{size =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-183-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# alpha controls transparency}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{color =} \StringTok{"red"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{, }\DataTypeTok{size =} \DecValTok{2}\NormalTok{, }\DataTypeTok{method =} \StringTok{"lm"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-183-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{color =}\NormalTok{ continent}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-184-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}
  \DataTypeTok{data =}\NormalTok{ gapminder,}
  \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =}\NormalTok{ gdpPercap, }\DataTypeTok{y =}\NormalTok{ lifeExp,}
    \DataTypeTok{color =}\NormalTok{ continent,}
    \DataTypeTok{fill =}\NormalTok{ continent}
\NormalTok{  )}
\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-184-2.pdf}

\hypertarget{co-ordinates-and-scales}{%
\subsection{Co-ordinates and scales}\label{co-ordinates-and-scales}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\CommentTok{\# coord\_type}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-185-1.pdf}

The data is heavily bunched up against the left side.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\CommentTok{\# without scaling}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-186-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\CommentTok{\# scales the axis of a plot to a log 10 basis}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-186-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"lm"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-186-3.pdf}

\hypertarget{labels-and-guides}{%
\subsection{Labels and guides}\label{labels-and-guides}}

The \texttt{scales} package has some useful premade formatting functions. You can either load \texttt{scales} or just grab the function you need from the package using \texttt{scales::}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-187-1.pdf}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{5}
\tightlist
\item
  Themes
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_economist}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-188-1.pdf}

\hypertarget{ggsave}{%
\subsection{ggsave}\label{ggsave}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{figure\_example \textless{}{-}}\StringTok{ }\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"gam"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"log GDP"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life Expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"A Gapminder Plot"}\NormalTok{,}
    \DataTypeTok{subtitle =} \StringTok{"Data points are country{-}years"}\NormalTok{,}
    \DataTypeTok{caption =} \StringTok{"Source: Gapminder"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_economist}\NormalTok{()}

\KeywordTok{ggsave}\NormalTok{(figure\_example, }\KeywordTok{here}\NormalTok{(}\StringTok{"outputs"}\NormalTok{, }\StringTok{"figure\_example.png"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\hypertarget{many-plots}{%
\subsection{Many plots}\label{many-plots}}

Basic ideas:

\begin{itemize}
\tightlist
\item
  Grouping: tell \texttt{ggplot2} about the structure of your data
\item
  Facetting: break up your data into pieces for a plot
\end{itemize}

\hypertarget{grouping-1}{%
\subsubsection{Grouping}\label{grouping-1}}

\begin{itemize}
\tightlist
\item
  Can you guess what's wrong?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(gapminder, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ gdpPercap))}

\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-190-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-190-2.pdf}

\texttt{geom\_line} joins up all the lines for each particular year in the order they appear in the dataset. \texttt{ggplot2} does not know the yearly observations in your data are grouped by country.

You need grouping when the grouping information you need to tell \texttt{ggplot2} about is not built into the mapped variables (like continent).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1,704 x 7
##    country     continent  year lifeExp      pop gdpPercap log_pop
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>   <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.    15.9
##  2 Afghanistan Asia       1957    30.3  9240934      821.    16.0
##  3 Afghanistan Asia       1962    32.0 10267083      853.    16.1
##  4 Afghanistan Asia       1967    34.0 11537966      836.    16.3
##  5 Afghanistan Asia       1972    36.1 13079460      740.    16.4
##  6 Afghanistan Asia       1977    38.4 14880372      786.    16.5
##  7 Afghanistan Asia       1982    39.9 12881816      978.    16.4
##  8 Afghanistan Asia       1987    40.8 13867957      852.    16.4
##  9 Afghanistan Asia       1992    41.7 16317921      649.    16.6
## 10 Afghanistan Asia       1997    41.8 22227415      635.    16.9
## # ... with 1,694 more rows
\end{verbatim}

\hypertarget{facetting}{%
\subsubsection{Facetting}\label{facetting}}

Facetting is a way to make small multiples.

\begin{itemize}
\item
  \texttt{facet\_wrap}: based on a single categorical variable like \texttt{facet\_wrap(\textasciitilde{}single\_categorical\_variable)}. Your panels will be laid out in order and then wrapped into a grid.
\item
  \texttt{facet\_grid}: when you want to cross-classify some data by two categorical variables like \texttt{facet\_grid(one\_cat\_variable\ \textasciitilde{}\ two\_cat\_variable)}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(gapminder, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ gdpPercap))}

\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country)) }\CommentTok{\# group by, \# The outlier is Kuwait.}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-192-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country)) }\OperatorTok{+}\StringTok{ }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent) }\CommentTok{\# facetting}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-192-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country), }\DataTypeTok{color =} \StringTok{"gray70"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{size =} \FloatTok{1.1}\NormalTok{, }\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_y\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent, }\DataTypeTok{ncol =} \DecValTok{5}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# for single categorical variable; for multiple categorical variables use facet\_grid()}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"GDP per capita"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"GDP per capita on Five continents"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{axis.text.x =} \KeywordTok{element\_text}\NormalTok{(}\DataTypeTok{angle =} \DecValTok{90}\NormalTok{, }\DataTypeTok{hjust =} \DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-192-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_line}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{group =}\NormalTok{ country), }\DataTypeTok{color =} \StringTok{"gray70"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{size =} \FloatTok{1.1}\NormalTok{, }\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\DataTypeTok{se =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_y\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{dollar) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent) }\OperatorTok{+}\StringTok{ }\CommentTok{\# for single categorical variable; for multiple categorical variables use facet\_grid()}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"GDP per capita"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"GDP per capita on Five continents"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{axis.text.x =} \KeywordTok{element\_text}\NormalTok{(}\DataTypeTok{angle =} \DecValTok{90}\NormalTok{, }\DataTypeTok{hjust =} \DecValTok{1}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-193-1.pdf}

\hypertarget{transforming}{%
\subsection{Transforming}\label{transforming}}

\begin{itemize}
\tightlist
\item
  Transforming: perform some calculations on or summarize your data before producing the plot
\end{itemize}

\hypertarget{use-pipes-to-summarize-data}{%
\subsubsection{Use pipes to summarize data}\label{use-pipes-to-summarize-data}}

Also, we experiment with bar charts here. By default, \texttt{geom\_bar} \href{https://www.rdocumentation.org/packages/ggplot2/versions/1.0.1/topics/geom_bar}{uses} \texttt{stat\ =\ "count"}, which makes the height of each bar equal to the number of cases in each group. If you have a y column, then you should use the \texttt{stat\ =\ "identity"} argument. Alternatively, you can use \texttt{geom\_col()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder\_formatted \textless{}{-}}\StringTok{ }\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =}\NormalTok{ gapminder\_formatted, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean, }\DataTypeTok{color =}\NormalTok{ continent)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy on Five continents"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-194-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean, }\DataTypeTok{color =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-194-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# geom point}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{country)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-195-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# geom bar}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_bar}\NormalTok{(}\DataTypeTok{stat =} \StringTok{"identity"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{country)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-195-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# no facet}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ year, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean, }\DataTypeTok{fill =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_bar}\NormalTok{(}\DataTypeTok{stat =} \StringTok{"identity"}\NormalTok{) }\OperatorTok{+}\StringTok{ }\CommentTok{\# even if you not stack, still the plot looks messy or you can use geom\_col()}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Year"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-195-3.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ country, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_boxplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Country"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-196-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# reorder (ascending)}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{reorder}\NormalTok{(country, lifeExp\_mean), }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_boxplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Country"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-197-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# reorder (descending)}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Europe"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(country, year) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{reorder}\NormalTok{(country, }\OperatorTok{{-}}\NormalTok{lifeExp\_mean), }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_boxplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Country"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Life expectancy"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Life expectancy in Europe"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-197-2.pdf}

\hypertarget{plotting-text}{%
\subsubsection{Plotting text}\label{plotting-text}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Asia"} \OperatorTok{|}\StringTok{ }\NormalTok{continent }\OperatorTok{==}\StringTok{ "Americas"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdp\_mean, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_text}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{label =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-198-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# with label}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Asia"} \OperatorTok{|}\StringTok{ }\NormalTok{continent }\OperatorTok{==}\StringTok{ "Americas"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdp\_mean, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_label}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{label =}\NormalTok{ country)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-199-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# no overlaps}
\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(continent }\OperatorTok{==}\StringTok{ "Asia"} \OperatorTok{|}\StringTok{ }\NormalTok{continent }\OperatorTok{==}\StringTok{ "Americas"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(continent, country) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarize}\NormalTok{(}
    \DataTypeTok{gdp\_mean =} \KeywordTok{mean}\NormalTok{(gdpPercap),}
    \DataTypeTok{lifeExp\_mean =} \KeywordTok{mean}\NormalTok{(lifeExp)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ gdp\_mean, }\DataTypeTok{y =}\NormalTok{ lifeExp\_mean)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_text\_repel}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{label =}\NormalTok{ country)) }\OperatorTok{+}\StringTok{ }\CommentTok{\# there\textquotesingle{}s also geom\_label\_repel}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_grid}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{continent)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
\end{verbatim}

\begin{verbatim}
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
\end{verbatim}

\begin{verbatim}
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-200-1.pdf}

\hypertarget{ploting-models}{%
\subsection{Plotting models}\label{ploting-models}}

In plotting models, we extensively use David Robinson's \href{https://cran.r-project.org/web/packages/broom/vignettes/broom.html}{broom package} in R. The idea is to transform model outputs (i.e., predictions and estimations) into tidy objects so that we can easily combine, separate, and visualize these elements.

\hypertarget{plotting-several-fits-at-the-same-time}{%
\subsubsection{Plotting several fits at the same time}\label{plotting-several-fits-at-the-same-time}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{model\_colors \textless{}{-}}\StringTok{ }\NormalTok{RColorBrewer}\OperatorTok{::}\KeywordTok{brewer.pal}\NormalTok{(}\DecValTok{3}\NormalTok{, }\StringTok{"Set1"}\NormalTok{) }\CommentTok{\# select three qualitatively different colors from a larger palette.}

\NormalTok{gapminder }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{log}\NormalTok{(gdpPercap), }\DataTypeTok{y =}\NormalTok{ lifeExp)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"lm"}\NormalTok{, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{color =} \StringTok{"OLS"}\NormalTok{, }\DataTypeTok{fill =} \StringTok{"OLS"}\NormalTok{)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}
    \DataTypeTok{method =} \StringTok{"lm"}\NormalTok{, }\DataTypeTok{formula =}\NormalTok{ y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{splines}\OperatorTok{::}\KeywordTok{bs}\NormalTok{(x, }\DataTypeTok{df =} \DecValTok{3}\NormalTok{),}
    \KeywordTok{aes}\NormalTok{(}\DataTypeTok{color =} \StringTok{"Cubic Spline"}\NormalTok{, }\DataTypeTok{fill =} \StringTok{"Cubic Spline"}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{"loess"}\NormalTok{, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{color =} \StringTok{"LOESS"}\NormalTok{, }\DataTypeTok{fill =} \StringTok{"LOESS"}\NormalTok{)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{legend.position =} \StringTok{"top"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_color\_manual}\NormalTok{(}\DataTypeTok{name =} \StringTok{"Models"}\NormalTok{, }\DataTypeTok{values =}\NormalTok{ model\_colors) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_manual}\NormalTok{(}\DataTypeTok{name =} \StringTok{"Models"}\NormalTok{, }\DataTypeTok{values =}\NormalTok{ model\_colors)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
\end{verbatim}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-201-1.pdf}

\hypertarget{extracting-model-outcomes}{%
\subsubsection{Extracting model outcomes}\label{extracting-model-outcomes}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# regression model}
\NormalTok{out \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(}
  \DataTypeTok{formula =}\NormalTok{ lifeExp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{gdpPercap }\OperatorTok{+}\StringTok{ }\NormalTok{pop }\OperatorTok{+}\StringTok{ }\NormalTok{continent,}
  \DataTypeTok{data =}\NormalTok{ gapminder}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\texttt{tidy()} is a generic function in the \texttt{broom} package. It ``constructs a dataframe that summarizes the model's statistical findings''. As the description states, \texttt{tidy()} can be used for various models. For instance, \texttt{tidy()} can extract the following information from a regression model.

\begin{itemize}
\tightlist
\item
  \texttt{term}: a term being estimated
\item
  \texttt{p.value}
\item
  \texttt{statistic}: a test statistic used to compute p-value
\item
  \texttt{estimate}
\item
  \texttt{conf.low}: the low end of a confidence interval
\item
  \texttt{conf.high}: the high end of a confidence interval
\item
  \texttt{df}: degrees of freedom
\end{itemize}

\textbf{Challenge}

Try \texttt{glance(out)}; what did you get from this command? If you're curious, you can try \texttt{?glance}.

The following plots show how to display your degree of confidence in the estimates.

\hypertarget{coefficients}{%
\paragraph{Coefficients}\label{coefficients}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# estimates}
\NormalTok{out\_comp \textless{}{-}}\StringTok{ }\KeywordTok{tidy}\NormalTok{(out)}

\NormalTok{p \textless{}{-}}\StringTok{ }\NormalTok{out\_comp }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ term, }\DataTypeTok{y =}\NormalTok{ estimate))}

\NormalTok{p }\OperatorTok{+}\StringTok{ }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-203-1.pdf}

\hypertarget{confidence-intervals}{%
\paragraph{Confidence intervals}\label{confidence-intervals}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# plus confidence intervals}
\NormalTok{out\_conf \textless{}{-}}\StringTok{ }\KeywordTok{tidy}\NormalTok{(out, }\DataTypeTok{conf.int =} \OtherTok{TRUE}\NormalTok{)}

\CommentTok{\# plotting coefficients using ggplot2 (pointrange)}
\NormalTok{out\_conf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{reorder}\NormalTok{(term, estimate), }\DataTypeTok{y =}\NormalTok{ estimate, }\DataTypeTok{ymin =}\NormalTok{ conf.low, }\DataTypeTok{ymax =}\NormalTok{ conf.high)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_pointrange}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{""}\NormalTok{, }\DataTypeTok{y =} \StringTok{"OLS Estimate"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-204-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# another way to do it (errorbar)}
\NormalTok{out\_conf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ estimate, }\DataTypeTok{y =} \KeywordTok{reorder}\NormalTok{(term, estimate))) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_errorbarh}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{xmin =}\NormalTok{ conf.low, }\DataTypeTok{xmax =}\NormalTok{ conf.high)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{y =} \StringTok{""}\NormalTok{, }\DataTypeTok{x =} \StringTok{"OLS Estimate"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme\_bw}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{03_tidy_data_files/figure-latex/unnamed-chunk-204-2.pdf}

You can also calculate marginal effects using the \href{https://vincentarelbundock.github.io/marginaleffects/}{\texttt{marginaleffects}} package.

\hypertarget{functional_programming}{%
\chapter{Automating repeated things}\label{functional_programming}}

\hypertarget{the-big-picture-5}{%
\section{The Big Picture}\label{the-big-picture-5}}

\begin{quote}
Anything that can be automated should be automated. Do as little as possible by hand. Do as much as possible with functions.

--- Hadley Wickham
\end{quote}

This chapter helps you to step up your R skills with functional programming. The \texttt{purrr} package provides easy-to-use tools to automate repeated things in your entire R workflow (e.g., wrangling, modeling, and visualization). The result is cleaner, faster, more readable, and extendable code.

\includegraphics{https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSmywiiOutD0NPieYCKxaD2wN9Fbt2I3iS87A\&usqp=CAU}

\hypertarget{objectives}{%
\section{Objectives}\label{objectives}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{-1}
\tightlist
\item
  How to use control flow in R using \texttt{if\_}, \texttt{for\ loop}, and \texttt{apply}
\item
  How to use \texttt{map()} to automate workflow in a cleaner, faster, and more extendable way
\item
  How to use \texttt{map2()} and \texttt{pmap()} to avoid writing nested loops
\item
  How to use \texttt{map()} and \texttt{glue()} to automate creating multiple plots
\item
  How to use \texttt{reduce()} to automate joining multiple dataframes
\item
  How to use \texttt{slowly()} and \texttt{future\_} to make the automation process either slower or faster
\item
  How to use \texttt{safely()} and \texttt{possibly()} to make error handling easier
\item
  How to develop your data products (e.g., R packages, Shiny apps)
\end{enumerate}

\hypertarget{setup-3}{%
\section{Setup}\label{setup-3}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install packages}
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) \{}
  \KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\NormalTok{\}}

\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  tidyverse, }\CommentTok{\# tidyverse pkgs including purrr}
\NormalTok{  bench, }\CommentTok{\# performance test }
\NormalTok{  tictoc, }\CommentTok{\# performance test}
\NormalTok{  broom, }\CommentTok{\# tidy modeling}
\NormalTok{  glue, }\CommentTok{\# paste string and objects}
\NormalTok{  furrr, }\CommentTok{\# parallel processing}
\NormalTok{  rvest, }\CommentTok{\# web scraping}
\NormalTok{  devtools, }\CommentTok{\# dev tools }
\NormalTok{  usethis, }\CommentTok{\# workflow     }
\NormalTok{  roxygen2, }\CommentTok{\# documentation }
\NormalTok{  testthat, }\CommentTok{\# testing }
\NormalTok{  patchwork) }\CommentTok{\# arranging ggplots }
\end{Highlighting}
\end{Shaded}

\hypertarget{flow}{%
\section{Flow control}\label{flow}}

\begin{itemize}
\item
  Control structures = putting logic in code to control flow (e.g., \texttt{if}, \texttt{else}, \texttt{for}, \texttt{while}, \texttt{repeat}, \texttt{break}, \texttt{next})
\item
  Almost all the conditional operators used in Python also work in R. The basic loop setup is also very similar, with some small syntax adjustments.
\item
  \texttt{if()} is a function whose arguments must be specified inside parentheses.
\item
  \texttt{else}, however, is a reserved operator that takes no arguments. Note that there is no \texttt{elif} option --- one simply writes \texttt{else\ if()}.
\item
  Whereas operations to be executed after conditional evaluations in Python come after a \texttt{:}, R operations must only be enclosed in curly brackets: \texttt{\{\}}. Furthermore, there is no requirement for indentation.
\end{itemize}

\hypertarget{if-one-condition}{%
\subsection{if (one condition)}\label{if-one-condition}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{5}

\ControlFlowTok{if}\NormalTok{ (x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{ }\CommentTok{\# Condition }
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{) }\CommentTok{\# Do something }
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{{-}5}

\ControlFlowTok{if}\NormalTok{ (x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "x is negative"
\end{verbatim}

\hypertarget{if-else-two-conditions}{%
\subsection{if + else (two conditions)}\label{if-else-two-conditions}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{5}

\ControlFlowTok{if}\NormalTok{ (x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{)}
\NormalTok{\} }\ControlFlowTok{else}\NormalTok{\{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is positive"}\NormalTok{)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "x is positive"
\end{verbatim}

\hypertarget{if-else-if-else-three-conditions}{%
\subsection{if + else if + else (three conditions)}\label{if-else-if-else-three-conditions}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\DecValTok{0}

\ControlFlowTok{if}\NormalTok{ (x }\OperatorTok{\textless{}}\StringTok{ }\DecValTok{0}\NormalTok{) \{ }\CommentTok{\# Condition }
  \KeywordTok{print}\NormalTok{(}\StringTok{"x is negative"}\NormalTok{) }\CommentTok{\# Do something }

\NormalTok{  \} }\ControlFlowTok{else} \ControlFlowTok{if}\NormalTok{ (x }\OperatorTok{==}\StringTok{ }\DecValTok{0}\NormalTok{) \{ }
  
    \KeywordTok{print}\NormalTok{(}\StringTok{"x is zero"}\NormalTok{) }\CommentTok{\# Do something else }

\NormalTok{    \} }\ControlFlowTok{else}\NormalTok{ \{}\KeywordTok{print}\NormalTok{(}\StringTok{"x is positive"}\NormalTok{) }\CommentTok{\# Do something else }

\NormalTok{      \}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "x is zero"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  In general, it's not a good idea to write nested code (lots of \texttt{else\ if()} or \texttt{ifelse()}). It is not easy to read, debug, modularize, and extend.
\item
  Instead, write functions and, if necessary, use \texttt{if()} only. We'll come back to this later.
\end{itemize}

\hypertarget{functions}{%
\subsection{Functions}\label{functions}}

While functions are defined in Python using the \texttt{def} reserved operator, R sees functions as just another type of named object. Thus, they require explicit assignment to an object. This is done using the function \texttt{function()}, which creates a function taking the arguments specified in parentheses.

function = input + computation (begin -\textgreater{} end) + output

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{simple.function \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
  \KeywordTok{print}\NormalTok{(x }\OperatorTok{+}\StringTok{ }\DecValTok{1}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{less.simple.function \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x, y)\{}
  \KeywordTok{print}\NormalTok{(x }\OperatorTok{{-}}\StringTok{ }\NormalTok{y }\OperatorTok{+}\StringTok{ }\DecValTok{1}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{less.simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] -7
\end{verbatim}

Concerning returning function output, most of the same rules apply as in Python. Be sure to remember that \texttt{return()} will only process a single object, so multiple items must usually be returned as a list. Note that the order of the statements inside a function matters, too: nothing placed after \texttt{return()} is executed.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dumbfun \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
  \KeywordTok{return}\NormalTok{(x)}
  \KeywordTok{print}\NormalTok{(}\StringTok{"This will never print :("}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \StringTok{"something"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "something"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dumbfun \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
  \KeywordTok{print}\NormalTok{(}\StringTok{"Why did I print?"}\NormalTok{)}
  \KeywordTok{return}\NormalTok{(x)}
\NormalTok{\}}

\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \StringTok{"something"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Why did I print?"
\end{verbatim}

\begin{verbatim}
## [1] "something"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dumbfun \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x,y)\{}
\NormalTok{  thing1 \textless{}{-}}\StringTok{ }\NormalTok{x}
\NormalTok{  thing2 \textless{}{-}}\StringTok{ }\NormalTok{y}
  \KeywordTok{return}\NormalTok{(}\KeywordTok{list}\NormalTok{(thing1, thing2))}
\NormalTok{\}}

\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \StringTok{"some text"}\NormalTok{, }\DataTypeTok{y =} \StringTok{"some data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "some text"
## 
## [[2]]
## [1] "some data"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dumbfun}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{c}\NormalTok{(}\DecValTok{5}\NormalTok{,}\DecValTok{10}\NormalTok{,}\DecValTok{15}\NormalTok{), }\DataTypeTok{y =} \StringTok{"some data"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1]  5 10 15
## 
## [[2]]
## [1] "some data"
\end{verbatim}

R functions also allow you to set default argument values:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{less.simple.function \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x, }\DataTypeTok{y =} \DecValTok{0}\NormalTok{)\{}
  \KeywordTok{print}\NormalTok{(x }\OperatorTok{{-}}\StringTok{ }\NormalTok{y }\OperatorTok{+}\StringTok{ }\DecValTok{1}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{less.simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 3
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{less.simple.function}\NormalTok{(}\DataTypeTok{x =} \DecValTok{2}\NormalTok{, }\DataTypeTok{y =} \DecValTok{10}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] -7
\end{verbatim}

Concerning specifying arguments, one can either use argument \textbf{position} specifications (i.e., the order) or argument \textbf{name} specifications. The latter is strongly preferred, as it is easy to accidentally pass a value to the wrong argument when relying on position alone.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{send \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(message, recipient, }\DataTypeTok{cc=}\OtherTok{NULL}\NormalTok{, }\DataTypeTok{bcc=}\OtherTok{NULL}\NormalTok{)\{}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(message, recipient, }\DataTypeTok{sep =} \StringTok{", "}\NormalTok{))}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"CC:"}\NormalTok{, cc, }\DataTypeTok{sep =} \StringTok{" "}\NormalTok{))}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"BCC:"}\NormalTok{, bcc, }\DataTypeTok{sep =} \StringTok{" "}\NormalTok{))}
\NormalTok{\}}

\KeywordTok{send}\NormalTok{(}\DataTypeTok{message =} \StringTok{"Hello"}\NormalTok{, }\DataTypeTok{recipient =} \StringTok{"World"}\NormalTok{, }\DataTypeTok{cc =} \StringTok{"Sun"}\NormalTok{, }\DataTypeTok{bcc =} \StringTok{"Jane"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World"
## [1] "CC: Sun"
## [1] "BCC: Jane"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{send}\NormalTok{(}\StringTok{"Hello"}\NormalTok{, }\StringTok{"World"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World"
## [1] "CC: Sun"
## [1] "BCC: Jane"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{send}\NormalTok{(}\StringTok{"Hello"}\NormalTok{, }\StringTok{"Sun"}\NormalTok{, }\StringTok{"Jane"}\NormalTok{, }\StringTok{"World"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, Sun"
## [1] "CC: Jane"
## [1] "BCC: World"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{send}\NormalTok{(}\DataTypeTok{message =} \StringTok{"Hello"}\NormalTok{, }\DataTypeTok{cc =} \StringTok{"Sun"}\NormalTok{, }\DataTypeTok{bcc =} \KeywordTok{c}\NormalTok{(}\StringTok{"Jane"}\NormalTok{, }\StringTok{"Rochelle"}\NormalTok{), }\DataTypeTok{recipient =} \StringTok{"World"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Hello, World"
## [1] "CC: Sun"
## [1] "BCC: Jane"     "BCC: Rochelle"
\end{verbatim}

Also, note that functions don't have what CS people call side effects. Functions only define local variables; they don't change objects stored in the global environment. (Consider the difference between \texttt{\textless{}-} and \texttt{=} for assignments.) That's why you can use functions for reusable tasks: they do not interfere with other essential things in your system.

See \href{https://darrenjw.wordpress.com/2011/11/23/lexical-scope-and-function-closures-in-r/}{the following example} from Wilkinson.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{a =}\StringTok{ }\DecValTok{1} 
\NormalTok{b =}\StringTok{ }\DecValTok{2}

\NormalTok{f \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)}
\NormalTok{\{}
\NormalTok{  a}\OperatorTok{*}\NormalTok{x }\OperatorTok{+}\StringTok{ }\NormalTok{b}
\NormalTok{\}}

\KeywordTok{f}\NormalTok{(}\DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{g \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)}
\NormalTok{\{}
\NormalTok{  a =}\StringTok{ }\DecValTok{2}
\NormalTok{  b =}\StringTok{ }\DecValTok{1}
  \KeywordTok{f}\NormalTok{(x)}
\NormalTok{\}}

\KeywordTok{g}\NormalTok{(}\DecValTok{2}\NormalTok{) }\CommentTok{\# a still equals 1 }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 4
\end{verbatim}

\textbf{Additional tips}

\begin{itemize}
\tightlist
\item
  Nonstandard evaluation
\end{itemize}

Nonstandard evaluation is an advanced subject. If you feel overwhelmed, you are more than welcome to skip this. But if you are serious about R programming, this is something you want to check out. For a deeper understanding of this issue, I recommend reading \href{https://renkun.me/2014/12/03/tips-on-non-standard-evaluation-in-r/}{Ren Kun's very informative blog post} carefully.

This part draws on one of \href{https://dplyr.tidyverse.org/articles/programming.html}{the dplyr package articles}.

In the tidyverse, calling a variable with or without quotation marks (string or not) makes little difference because tidy evaluation is a form of non-standard evaluation.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Using \textasciigrave{}mpg\textasciigrave{} instead of \textasciigrave{}mtcars$mpg\textasciigrave{} is called data masking.}

\NormalTok{mtcars }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{select}\NormalTok{(mpg)}

\NormalTok{mtcars }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{select}\NormalTok{(}\StringTok{"mpg"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Data and env-variables

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# df = environment variable }
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{data.frame}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{5}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{c}\NormalTok{(}\DecValTok{6}\OperatorTok{:}\DecValTok{10}\NormalTok{)}
\NormalTok{  )}

\CommentTok{\# x, y = data variables }
\NormalTok{df}\OperatorTok{$}\NormalTok{x}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] 1 2 3 4 5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df}\OperatorTok{$}\NormalTok{y}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1]  6  7  8  9 10
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Problem
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ }\OtherTok{NULL} 

\NormalTok{var\_summary \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(env\_var, data\_var)\{}
 
\NormalTok{   env\_var }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(data\_var))}

\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

You may expect that the output is mean = 3 \ldots{} but

It's because the \texttt{mean()} function doesn't take \texttt{df\$x} for \texttt{data\_var} but \texttt{x} (the object in the global environment, which is \texttt{NULL} here). So you need to link \texttt{x} with the environment variable.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{var\_summary}\NormalTok{(df, x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in mean.default(data_var): argument is not numeric or logical: returning
## NA
\end{verbatim}

\begin{verbatim}
##   mean
## 1   NA
\end{verbatim}

This is how you can fix this.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Solution}
\NormalTok{vs\_fix \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(env\_var, data\_var)\{}
 
\NormalTok{   env\_var }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(\{\{data\_var\}\}))}

\NormalTok{\}}

\CommentTok{\# You can also do this. }
\NormalTok{vs\_fix\_enhanced \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(env\_var, data\_var)\{}
 
\NormalTok{   env\_var }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{summarise}\NormalTok{(}\StringTok{"mean\_\{\{data\_var\}\}"} \OperatorTok{:}\ErrorTok{=}\StringTok{ }\KeywordTok{mean}\NormalTok{(\{\{data\_var\}\})) }\CommentTok{\# If you use the glue package, this syntax is very intuitive.}

\NormalTok{\}}

\KeywordTok{vs\_fix\_enhanced}\NormalTok{(df, x)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   mean_x
## 1      3
\end{verbatim}

If you have a character vector input \ldots{}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{mtcars\_count \textless{}{-}}\StringTok{ }\NormalTok{mtcars }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{names}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map}\NormalTok{(}\OperatorTok{\textasciitilde{}}\KeywordTok{count}\NormalTok{(mtcars, .data[[.x]])) }\CommentTok{\# We\textquotesingle{}re going to learn about map in the rest of this session.}

\NormalTok{mtcars\_count[[}\DecValTok{1}\NormalTok{]]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##     mpg n
## 1  10.4 2
## 2  13.3 1
## 3  14.3 1
## 4  14.7 1
## 5  15.0 1
## 6  15.2 2
## 7  15.5 1
## 8  15.8 1
## 9  16.4 1
## 10 17.3 1
## 11 17.8 1
## 12 18.1 1
## 13 18.7 1
## 14 19.2 2
## 15 19.7 1
## 16 21.0 2
## 17 21.4 2
## 18 21.5 1
## 19 22.8 2
## 20 24.4 1
## 21 26.0 1
## 22 27.3 1
## 23 30.4 2
## 24 32.4 1
## 25 33.9 1
\end{verbatim}

\hypertarget{for-loop}{%
\subsection{for loop}\label{for-loop}}

\begin{figure}
\centering
\includegraphics{https://teachtogether.tech/en/figures/for-loop.svg}
\caption{Concept map for a for loop. Source: \url{https://teachtogether.tech/en/index.html\#s:memory-concept-maps}}
\end{figure}

Loops in R also work the same way as in Python, with just a few adjustments. First, recall that index positions in R start at 1. Second, \texttt{while()} and \texttt{for()} are functions rather than reserved operators, meaning they must take arguments in parentheses. Third, just like \texttt{else}, the \texttt{in} operator \emph{is} reserved and takes no arguments in parentheses. Fourth, the conditional execution must appear between curly brackets. Finally, indentation is meaningless, but each new operation must appear on a new line.

\begin{itemize}
\tightlist
\item
  \texttt{while()}: when we have no idea how many times the loop needs to be executed.
\item
  \texttt{for()}: when we know how many times the loop needs to be executed. This is likely to be the loop you will use most frequently.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fruits \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"apples"}\NormalTok{, }\StringTok{"oranges"}\NormalTok{, }\StringTok{"pears"}\NormalTok{, }\StringTok{"bananas"}\NormalTok{)}

\CommentTok{\# a while loop}
\NormalTok{i \textless{}{-}}\StringTok{ }\DecValTok{1}
\ControlFlowTok{while}\NormalTok{ (i }\OperatorTok{\textless{}=}\StringTok{ }\KeywordTok{length}\NormalTok{(fruits)) \{}
  \KeywordTok{print}\NormalTok{(fruits[i])}
\NormalTok{  i \textless{}{-}}\StringTok{ }\NormalTok{i }\OperatorTok{+}\StringTok{ }\DecValTok{1}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "apples"
## [1] "oranges"
## [1] "pears"
## [1] "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# a for loop}
\ControlFlowTok{for}\NormalTok{ (i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\KeywordTok{length}\NormalTok{(fruits)) \{}
  \KeywordTok{print}\NormalTok{(fruits[i])}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "apples"
## [1] "oranges"
## [1] "pears"
## [1] "bananas"
\end{verbatim}

\hypertarget{apply-family}{%
\subsection{apply family}\label{apply-family}}

While and for loops in R can be very slow. For this reason, R has many built-in iteration methods to speed up execution times. In many cases, packages will have ``behind-the-scenes'' ways to avoid \texttt{for} loops, but what if you need to write your own function?

A common method of getting around for loops is the \textbf{apply} family of functions. These take a data structure and a function and apply the function to every element of the object.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{fruit \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"apple"}\NormalTok{, }\StringTok{"orange"}\NormalTok{, }\StringTok{"pear"}\NormalTok{, }\StringTok{"banana"}\NormalTok{)}

\CommentTok{\# make function that takes in only one element}
\NormalTok{make.plural \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
\NormalTok{   plural \textless{}{-}}\StringTok{ }\KeywordTok{paste}\NormalTok{(x, }\StringTok{\textquotesingle{}s\textquotesingle{}}\NormalTok{, }\DataTypeTok{sep =} \StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{) }\CommentTok{\# sep is for collapse, so collapse \textquotesingle{}\textquotesingle{}}
   \KeywordTok{return}\NormalTok{(plural)}
\NormalTok{\}}

\KeywordTok{make.plural}\NormalTok{(}\StringTok{\textquotesingle{}apple\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "apples"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \texttt{apply()} : loop over the margins (1 = row, 2 = column) of an array
\item
  \texttt{lapply()} : loop over a list then returns a list
\item
  \texttt{sapply()} : loop over a list then returns a named vector
\item
  \texttt{tapply()}: loop over subsets of a vector
\item
  \texttt{mapply()}: multivariate version of \texttt{lapply()}. Use this if you have a function that takes in 2 or more arguments.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# apply that function to every element}
\KeywordTok{lapply}\NormalTok{(fruit, make.plural) }\CommentTok{\# returns a list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "apples"
## 
## [[2]]
## [1] "oranges"
## 
## [[3]]
## [1] "pears"
## 
## [[4]]
## [1] "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{sapply}\NormalTok{(fruit, make.plural) }\CommentTok{\# returns a named vector}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##     apple    orange      pear    banana 
##  "apples" "oranges"   "pears" "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{library}\NormalTok{(purrr) }\CommentTok{\# load package}
\KeywordTok{map}\NormalTok{(fruit, make.plural) }\CommentTok{\# type consistent}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "apples"
## 
## [[2]]
## [1] "oranges"
## 
## [[3]]
## [1] "pears"
## 
## [[4]]
## [1] "bananas"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Why sapply is bad }

\KeywordTok{sapply}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{100}\NormalTok{, paste) }\CommentTok{\# return character }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
##  [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
##  [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
##  [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
##  [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
##  [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
##  [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
##  [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
##  [97] "97"  "98"  "99"  "100"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{sapply}\NormalTok{(}\KeywordTok{integer}\NormalTok{(), paste) }\CommentTok{\# return list!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## list()
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{library}\NormalTok{(purrr)}
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{100}\NormalTok{, paste) }\CommentTok{\# return list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "1"
## 
## [[2]]
## [1] "2"
## 
## [[3]]
## [1] "3"
## 
## [[4]]
## [1] "4"
## 
## [[5]]
## [1] "5"
## 
## [[6]]
## [1] "6"
## 
## [[7]]
## [1] "7"
## 
## [[8]]
## [1] "8"
## 
## [[9]]
## [1] "9"
## 
## [[10]]
## [1] "10"
## 
## [[11]]
## [1] "11"
## 
## [[12]]
## [1] "12"
## 
## [[13]]
## [1] "13"
## 
## [[14]]
## [1] "14"
## 
## [[15]]
## [1] "15"
## 
## [[16]]
## [1] "16"
## 
## [[17]]
## [1] "17"
## 
## [[18]]
## [1] "18"
## 
## [[19]]
## [1] "19"
## 
## [[20]]
## [1] "20"
## 
## [[21]]
## [1] "21"
## 
## [[22]]
## [1] "22"
## 
## [[23]]
## [1] "23"
## 
## [[24]]
## [1] "24"
## 
## [[25]]
## [1] "25"
## 
## [[26]]
## [1] "26"
## 
## [[27]]
## [1] "27"
## 
## [[28]]
## [1] "28"
## 
## [[29]]
## [1] "29"
## 
## [[30]]
## [1] "30"
## 
## [[31]]
## [1] "31"
## 
## [[32]]
## [1] "32"
## 
## [[33]]
## [1] "33"
## 
## [[34]]
## [1] "34"
## 
## [[35]]
## [1] "35"
## 
## [[36]]
## [1] "36"
## 
## [[37]]
## [1] "37"
## 
## [[38]]
## [1] "38"
## 
## [[39]]
## [1] "39"
## 
## [[40]]
## [1] "40"
## 
## [[41]]
## [1] "41"
## 
## [[42]]
## [1] "42"
## 
## [[43]]
## [1] "43"
## 
## [[44]]
## [1] "44"
## 
## [[45]]
## [1] "45"
## 
## [[46]]
## [1] "46"
## 
## [[47]]
## [1] "47"
## 
## [[48]]
## [1] "48"
## 
## [[49]]
## [1] "49"
## 
## [[50]]
## [1] "50"
## 
## [[51]]
## [1] "51"
## 
## [[52]]
## [1] "52"
## 
## [[53]]
## [1] "53"
## 
## [[54]]
## [1] "54"
## 
## [[55]]
## [1] "55"
## 
## [[56]]
## [1] "56"
## 
## [[57]]
## [1] "57"
## 
## [[58]]
## [1] "58"
## 
## [[59]]
## [1] "59"
## 
## [[60]]
## [1] "60"
## 
## [[61]]
## [1] "61"
## 
## [[62]]
## [1] "62"
## 
## [[63]]
## [1] "63"
## 
## [[64]]
## [1] "64"
## 
## [[65]]
## [1] "65"
## 
## [[66]]
## [1] "66"
## 
## [[67]]
## [1] "67"
## 
## [[68]]
## [1] "68"
## 
## [[69]]
## [1] "69"
## 
## [[70]]
## [1] "70"
## 
## [[71]]
## [1] "71"
## 
## [[72]]
## [1] "72"
## 
## [[73]]
## [1] "73"
## 
## [[74]]
## [1] "74"
## 
## [[75]]
## [1] "75"
## 
## [[76]]
## [1] "76"
## 
## [[77]]
## [1] "77"
## 
## [[78]]
## [1] "78"
## 
## [[79]]
## [1] "79"
## 
## [[80]]
## [1] "80"
## 
## [[81]]
## [1] "81"
## 
## [[82]]
## [1] "82"
## 
## [[83]]
## [1] "83"
## 
## [[84]]
## [1] "84"
## 
## [[85]]
## [1] "85"
## 
## [[86]]
## [1] "86"
## 
## [[87]]
## [1] "87"
## 
## [[88]]
## [1] "88"
## 
## [[89]]
## [1] "89"
## 
## [[90]]
## [1] "90"
## 
## [[91]]
## [1] "91"
## 
## [[92]]
## [1] "92"
## 
## [[93]]
## [1] "93"
## 
## [[94]]
## [1] "94"
## 
## [[95]]
## [1] "95"
## 
## [[96]]
## [1] "96"
## 
## [[97]]
## [1] "97"
## 
## [[98]]
## [1] "98"
## 
## [[99]]
## [1] "99"
## 
## [[100]]
## [1] "100"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(}\KeywordTok{integer}\NormalTok{(), paste) }\CommentTok{\# return list}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## list()
\end{verbatim}

\hypertarget{purrr}{%
\section{purrr}\label{purrr}}

\hypertarget{why-map}{%
\subsection{Why map?}\label{why-map}}

\hypertarget{objectives-1}{%
\subsubsection{Objectives}\label{objectives-1}}

\begin{itemize}
\tightlist
\item
  How to use \texttt{purrr} to automate workflow in a cleaner, faster, and more extendable way
\end{itemize}

\hypertarget{copy-and-paste-programming}{%
\subsubsection{Copy-and-paste programming}\label{copy-and-paste-programming}}

\begin{quote}
Copy-and-paste programming, sometimes referred to as just pasting, is the production of highly repetitive computer programming code, as produced by copy and paste operations. It is primarily a pejorative term; those who use the term are often implying a lack of programming competence. It may also be the result of technology limitations (e.g., an insufficiently expressive development environment) as subroutines or libraries would normally be used instead. However, there are occasions when copy-and-paste programming is considered acceptable or necessary, such as for boilerplate, loop unrolling (when not supported automatically by the compiler), or certain programming idioms, and it is supported by some source code editors in the form of snippets. - Wikipedia
\end{quote}

\begin{itemize}
\item
  The following exercise was inspired by \href{http://adv-r.had.co.nz/Functional-programming.html}{Wickham's example}.
\item
  Let's imagine \texttt{df} is a survey dataset.

  \begin{itemize}
  \item
    \texttt{a,\ b,\ c,\ d} = Survey questions
  \item
    \texttt{-99}: non-responses
  \item
    Your goal: replace \texttt{-99} with \texttt{NA}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Data}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility}

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \StringTok{"a"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \StringTok{"b"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \StringTok{"c"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \StringTok{"d"}\NormalTok{ =}\StringTok{ }\KeywordTok{sample}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\OperatorTok{{-}}\DecValTok{99}\NormalTok{, }\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{), }\DataTypeTok{size =} \DecValTok{5}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Copy and paste}
\NormalTok{df}\OperatorTok{$}\NormalTok{a[df}\OperatorTok{$}\NormalTok{a }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{b[df}\OperatorTok{$}\NormalTok{b }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{c[df}\OperatorTok{$}\NormalTok{c }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{df}\OperatorTok{$}\NormalTok{d[df}\OperatorTok{$}\NormalTok{d }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge}. Explain why this solution is not very efficient. (Hint: If \texttt{df\$a{[}df\$a\ ==\ -99{]}\ \textless{}-\ NA} has an error, how will you fix it? A solution is not scalable if it's not automatable.)
\end{itemize}

\hypertarget{using-a-function}{%
\subsubsection{Using a function}\label{using-a-function}}

\begin{itemize}
\item
  Let's recall what a function is in R: \texttt{input\ +\ computation\ +\ output}
\item
  If you write a function, you gain efficiency because you don't need to copy and paste the computation part.
\end{itemize}

\begin{verbatim}
function(input){

  computation

  return(output)
}
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Function}

\NormalTok{fix\_missing \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x) \{}
\NormalTok{  x[x }\OperatorTok{==}\StringTok{ }\DecValTok{{-}99}\NormalTok{] \textless{}{-}}\StringTok{ }\OtherTok{NA}
\NormalTok{  x}
\NormalTok{\}}

\CommentTok{\# Apply function to each column (vector)}

\NormalTok{df}\OperatorTok{$}\NormalTok{a \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{a)}
\NormalTok{df}\OperatorTok{$}\NormalTok{b \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{b)}
\NormalTok{df}\OperatorTok{$}\NormalTok{c \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{c)}
\NormalTok{df}\OperatorTok{$}\NormalTok{d \textless{}{-}}\StringTok{ }\KeywordTok{fix\_missing}\NormalTok{(df}\OperatorTok{$}\NormalTok{d)}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\begin{itemize}
\item
  \textbf{Challenge} Why is using a function more efficient than 100\% copying and pasting? Can you think of a way we can automate the process?
\item
  Many options for automation in R: \texttt{for\ loop}, \texttt{apply} family, etc.
\item
  Here's a tidy solution that comes from the \texttt{purrr} package.
\item
  The power and joy of a one-liner.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map\_df}\NormalTok{(df, fix\_missing)}

\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 4
##       a     b     c     d
##   <dbl> <dbl> <dbl> <dbl>
## 1     3     3     3     1
## 2     3     2     3     1
## 3     1    NA     1     2
## 4     1    NA     2     1
## 5    NA     1     1     3
\end{verbatim}

\texttt{map()} is a \href{https://en.wikipedia.org/wiki/Map_(higher-order_function)}{higher-order function} that applies a given function to each element of a list/vector.

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/f0494d020aa517ae7b1011cea4c4a9f21702df8b/2577b/diagrams/functionals/map.png}
\caption{This is how map() works. It's easier to understand with a picture.}
\end{figure}

\begin{itemize}
\item
  Input: Takes a vector/list.
\item
  Computation: Calls the function once for each element of the vector.
\item
  Output: Returns a list or whatever data format you prefer (e.g., the \texttt{\_df} helper returns a data frame).
\end{itemize}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge} If you run the code below, what will be the data type of the output?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(df, fix\_missing)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## $a
## [1]  3  3  1  1 NA
## 
## $b
## [1]  3  2 NA NA  1
## 
## $c
## [1] 3 3 1 2 1
## 
## $d
## [1] 1 1 2 1 3
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Why \texttt{map()} is a good alternative to \texttt{for\ loop}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Built{-}in data}
\KeywordTok{data}\NormalTok{(}\StringTok{"airquality"}\NormalTok{)}

\KeywordTok{tic}\NormalTok{()}

\CommentTok{\# Placeholder}
\NormalTok{out1 \textless{}{-}}\StringTok{ }\KeywordTok{vector}\NormalTok{(}\StringTok{"double"}\NormalTok{, }\KeywordTok{ncol}\NormalTok{(airquality))}

\CommentTok{\# Sequence variable}
\ControlFlowTok{for}\NormalTok{ (i }\ControlFlowTok{in} \KeywordTok{seq\_along}\NormalTok{(airquality)) \{ }

  \CommentTok{\# Assign an iteration result to each element of the placeholder list }
\NormalTok{  out1[[i]] \textless{}{-}}\StringTok{ }\KeywordTok{mean}\NormalTok{(airquality[[i]], }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{\}}

\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 0.008 sec elapsed
\end{verbatim}

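\texttt{map} can be faster here because the iteration over the items of the list/vector is handled by optimized internal code rather than by an explicit R-level loop (note that the items are still processed one after another, not in parallel). Also, using \texttt{map\_dbl} removes an extra step you would otherwise need to take: pre-allocating the output. Hint: \texttt{map\_dbl(x,\ mean,\ na.rm\ =\ TRUE)} = \texttt{vapply(x,\ mean,\ na.rm\ =\ TRUE,\ FUN.VALUE\ =\ double(1))}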

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tic}\NormalTok{()}
\NormalTok{out1 \textless{}{-}}\StringTok{ }\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{map\_dbl}\NormalTok{(mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 0.003 sec elapsed
\end{verbatim}

\begin{itemize}
\item
  In short, \texttt{map()} is more readable, faster, and easily extendable with other data science tasks (e.g., wrangling, modeling, and visualization) using \texttt{\%\textgreater{}\%}.
\item
  Final point: Why not the base R \texttt{apply} family?
\item
  Short answer: \texttt{purrr::map()} is simpler to write.
\end{itemize}

\textbf{Additional tips}

Performance testing (profiling) is an important part of programming. The \texttt{tictoc} package (\texttt{tic()} and \texttt{toc()}) measures the time needed to run a target expression once. If you want a more robust measure of timing as well as information on memory (\textbf{speed} and \textbf{space} both matter for performance testing), consider using the \href{https://github.com/r-lib/bench}{\texttt{bench} package}, which is designed for high-precision timing of R expressions.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{map\_mark \textless{}{-}}\StringTok{ }\NormalTok{bench}\OperatorTok{::}\KeywordTok{mark}\NormalTok{(}

\NormalTok{  out1 \textless{}{-}}\StringTok{ }\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{map\_dbl}\NormalTok{(mean, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}

\NormalTok{  )}

\NormalTok{map\_mark}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 6
##   expression                                              min   median `itr/sec`
##   <bch:expr>                                         <bch:tm> <bch:tm>     <dbl>
## 1 out1 <- airquality %>% map_dbl(mean, na.rm = TRUE)   61.4us   72.5us    13247.
## # ... with 2 more variables: mem_alloc <bch:byt>, `gc/sec` <dbl>
\end{verbatim}

\hypertarget{applications}{%
\subsubsection{Applications}\label{applications}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Many models
\end{enumerate}

\begin{itemize}
\tightlist
\item
  One popular application of \texttt{map()} is to run regression models (or whatever model you want to run) on list-columns. No more copying and pasting for running many regression models on subgroups!
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Have you ever tried this?}
\NormalTok{lm\_A \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_A"}\NormalTok{))}
\NormalTok{lm\_B \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_B"}\NormalTok{))}
\NormalTok{lm\_C \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_C"}\NormalTok{))}
\NormalTok{lm\_D \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_D"}\NormalTok{))}
\NormalTok{lm\_E \textless{}{-}}\StringTok{ }\KeywordTok{lm}\NormalTok{(y }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{x, }\KeywordTok{subset}\NormalTok{(data, subgroup }\OperatorTok{==}\StringTok{ "group\_E"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  For more information on this technique, read the Many Models subchapter of the \href{https://r4ds.had.co.nz/many-models.html\#creating-list-columns}{R for Data Science}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Function}
\NormalTok{lm\_model \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(df) \{}
  \KeywordTok{lm}\NormalTok{(Temp }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{Ozone, }\DataTypeTok{data =}\NormalTok{ df)}
\NormalTok{\}}

\CommentTok{\# Map}
\NormalTok{models \textless{}{-}}\StringTok{ }\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(Month) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nest}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Create list{-}columns}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{ols =} \KeywordTok{map}\NormalTok{(data, lm\_model)) }\CommentTok{\# Map}
\NormalTok{models}\OperatorTok{$}\NormalTok{ols[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## 
## Call:
## lm(formula = Temp ~ Ozone, data = df)
## 
## Coefficients:
## (Intercept)        Ozone  
##     62.8842       0.1629
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add tidying}
\NormalTok{tidy\_lm\_model \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{compose}\NormalTok{( }\CommentTok{\# compose multiple functions}
\NormalTok{  broom}\OperatorTok{::}\NormalTok{tidy, }\CommentTok{\# convert lm objects into tidy tibbles}
\NormalTok{  lm\_model}
\NormalTok{)}

\NormalTok{tidied\_models \textless{}{-}}\StringTok{ }\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(Month) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{nest}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Create list{-}columns}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{ols =} \KeywordTok{map}\NormalTok{(data, tidy\_lm\_model))}

\NormalTok{tidied\_models}\OperatorTok{$}\NormalTok{ols[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   62.9      1.61       39.2  2.88e-23
## 2 Ozone          0.163    0.0500      3.26 3.31e- 3
\end{verbatim}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Simulations
\end{enumerate}

A good friend of the \texttt{map()} function is the \texttt{rerun()} function. This combination is really useful for simulations. Consider the following example.

\begin{itemize}
\tightlist
\item
  Base R approach
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)}

\NormalTok{small\_n \textless{}{-}}\StringTok{ }\DecValTok{100}\NormalTok{ ; k \textless{}{-}}\StringTok{ }\DecValTok{1000}\NormalTok{ ; mu \textless{}{-}}\StringTok{ }\DecValTok{500}\NormalTok{ ; sigma \textless{}{-}}\StringTok{ }\DecValTok{20} 

\NormalTok{y\_list \textless{}{-}}\StringTok{ }\KeywordTok{rep}\NormalTok{(}\KeywordTok{list}\NormalTok{(}\OtherTok{NA}\NormalTok{), k)}

\ControlFlowTok{for}\NormalTok{ (i }\ControlFlowTok{in} \KeywordTok{seq}\NormalTok{(k)) \{}
        
\NormalTok{    y\_list[[i]] \textless{}{-}}\StringTok{ }\KeywordTok{rnorm}\NormalTok{(small\_n, mu, sigma)}
        
\NormalTok{\}}

\NormalTok{y\_means \textless{}{-}}\StringTok{ }\KeywordTok{unlist}\NormalTok{(}\KeywordTok{lapply}\NormalTok{(y\_list, mean))}

\KeywordTok{qplot}\NormalTok{(y\_means) }\OperatorTok{+}
\StringTok{   }\KeywordTok{geom\_vline}\NormalTok{(}\DataTypeTok{xintercept =} \DecValTok{500}\NormalTok{, }\DataTypeTok{linetype =} \StringTok{"dotted"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-32-1.pdf}

\begin{itemize}
\tightlist
\item
  rerun() + map()
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{small\_n \textless{}{-}}\StringTok{ }\DecValTok{100}\NormalTok{ ; k \textless{}{-}}\StringTok{ }\DecValTok{1000}\NormalTok{; mu \textless{}{-}}\StringTok{ }\DecValTok{500}\NormalTok{ ; sigma \textless{}{-}}\StringTok{ }\DecValTok{20} 

\NormalTok{y\_tidy \textless{}{-}}\StringTok{ }\KeywordTok{rerun}\NormalTok{(k, }\KeywordTok{rnorm}\NormalTok{(small\_n, mu, sigma)) }

\NormalTok{y\_means\_tidy \textless{}{-}}\StringTok{ }\KeywordTok{map\_dbl}\NormalTok{(y\_tidy, mean)}

\CommentTok{\# Visualize }
\NormalTok{(}\KeywordTok{qplot}\NormalTok{(y\_means) }\OperatorTok{+}
\StringTok{   }\KeywordTok{geom\_vline}\NormalTok{(}\DataTypeTok{xintercept =} \DecValTok{500}\NormalTok{, }\DataTypeTok{linetype =} \StringTok{"dotted"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{)) }\OperatorTok{+}
\NormalTok{(}\KeywordTok{qplot}\NormalTok{(y\_means\_tidy) }\OperatorTok{+}
\StringTok{   }\KeywordTok{geom\_vline}\NormalTok{(}\DataTypeTok{xintercept =} \DecValTok{500}\NormalTok{, }\DataTypeTok{linetype =} \StringTok{"dotted"}\NormalTok{, }\DataTypeTok{color =} \StringTok{"red"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-33-1.pdf}

\hypertarget{map2}{%
\section{Automate 2 or 2+ tasks}\label{map2}}

\hypertarget{objectives-2}{%
\subsection{Objectives}\label{objectives-2}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{map2()} and \texttt{pmap()} to avoid writing nested loops.
\end{itemize}

\hypertarget{problem}{%
\subsection{Problem}\label{problem}}

\begin{itemize}
\tightlist
\item
  Problem: How can you create something like the below?
\end{itemize}

{[}1{]} ``University = Berkeley \textbar{} Department = waterbenders''

{[}1{]} ``University = Berkeley \textbar{} Department = earthbenders''

{[}1{]} ``University = Berkeley \textbar{} Department = firebenders''

{[}1{]} ``University = Berkeley \textbar{} Department = airbenders''

{[}1{]} ``University = Stanford \textbar{} Department = waterbenders''

{[}1{]} ``University = Stanford \textbar{} Department = earthbenders''

{[}1{]} ``University = Stanford \textbar{} Department = firebenders''

{[}1{]} ``University = Stanford \textbar{} Department = airbenders''

\begin{itemize}
\tightlist
\item
  The most manual way: You can copy and paste eight times.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{paste}\NormalTok{(}\StringTok{"University = Berkeley | Department = CS"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University = Berkeley | Department = CS"
\end{verbatim}

\hypertarget{for-loop-1}{%
\subsection{For loop}\label{for-loop-1}}

\begin{itemize}
\item
  A slightly more efficient way: using a for loop.
\item
  Think about which part of the statement is constant and which part varies (= parameters).
\item
  Do we need a placeholder? No. We don't need a placeholder because we don't store the results of the iterations.
\item
  \textbf{Challenge}: How many parameters do you need to solve the problem below?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Outer loop}

\ControlFlowTok{for}\NormalTok{ (univ }\ControlFlowTok{in} \KeywordTok{c}\NormalTok{(}\StringTok{"Berkeley"}\NormalTok{, }\StringTok{"Stanford"}\NormalTok{)) \{}

  \CommentTok{\# Inner loop}

  \ControlFlowTok{for}\NormalTok{ (dept }\ControlFlowTok{in} \KeywordTok{c}\NormalTok{(}\StringTok{"waterbenders"}\NormalTok{, }\StringTok{"earthbenders"}\NormalTok{, }\StringTok{"firebenders"}\NormalTok{, }\StringTok{"airbenders"}\NormalTok{)) \{}
    \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"University = "}\NormalTok{, univ, }\StringTok{"|"}\NormalTok{, }\StringTok{"Department = "}\NormalTok{, dept))}
\NormalTok{  \}}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Berkeley | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Berkeley | Department =  airbenders"
## [1] "University =  Stanford | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Stanford | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  This is not bad, but \ldots{} \texttt{n} arguments \(\rightarrow\) \texttt{n-nested\ for\ loops}. As the scale of your problem grows, your code gets more complicated.
\end{itemize}

\begin{quote}
To become significantly more reliable, code must become more transparent. In particular, nested conditions and loops must be viewed with great suspicion. Complicated control flows confuse programmers. Messy code often hides bugs. --- \href{https://en.wikipedia.org/wiki/Bjarne_Stroustrup}{Bjarne Stroustrup}
\end{quote}

\hypertarget{map2-pmap}{%
\subsection{map2 \& pmap}\label{map2-pmap}}

\begin{itemize}
\item
  Step 1: Define inputs and a function.
\item
  \textbf{Challenge} Why are we using \texttt{rep()} to create input vectors? For instance, for \texttt{univ\_list} why not just use \texttt{c("Berkeley",\ "Stanford")}?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Inputs (remember the length of these inputs should be identical)}

\NormalTok{univ\_list \textless{}{-}}\StringTok{ }\KeywordTok{rep}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"Berkeley"}\NormalTok{, }\StringTok{"Stanford"}\NormalTok{), }\DecValTok{4}\NormalTok{)}
\NormalTok{dept\_list \textless{}{-}}\StringTok{ }\KeywordTok{rep}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"waterbenders"}\NormalTok{, }\StringTok{"earthbenders"}\NormalTok{, }\StringTok{"firebenders"}\NormalTok{, }\StringTok{"airbenders"}\NormalTok{), }\DecValTok{2}\NormalTok{)}

\CommentTok{\# Function}

\NormalTok{print\_lists \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(univ, dept) \{}
  \KeywordTok{print}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}
    \StringTok{"University = "}\NormalTok{, univ, }\StringTok{"|"}\NormalTok{,}
    \StringTok{"Department = "}\NormalTok{, dept}
\NormalTok{  ))}
\NormalTok{\}}

\CommentTok{\# Test}

\KeywordTok{print\_lists}\NormalTok{(univ\_list[}\DecValTok{1}\NormalTok{], dept\_list[}\DecValTok{1}\NormalTok{])}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Step 2: Using \texttt{map2()} or \texttt{pmap()}
\end{itemize}

\includegraphics{https://dcl-prog.stanford.edu/images/map2.png}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 2 arguments}
\NormalTok{map2\_output \textless{}{-}}\StringTok{ }\KeywordTok{map2}\NormalTok{(univ\_list, dept\_list, print\_lists)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
\end{verbatim}

\includegraphics{https://d33wubrfki0l68.cloudfront.net/e426c5755e2e65bdcc073d387775db79791f32fd/92902/diagrams/functionals/pmap.png}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 3+ arguments}
\NormalTok{pmap\_output \textless{}{-}}\StringTok{ }\KeywordTok{pmap}\NormalTok{(}\KeywordTok{list}\NormalTok{(univ\_list, dept\_list), print\_lists)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
## [1] "University =  Berkeley | Department =  waterbenders"
## [1] "University =  Stanford | Department =  earthbenders"
## [1] "University =  Berkeley | Department =  firebenders"
## [1] "University =  Stanford | Department =  airbenders"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge} Have you noticed that we used a slightly different input for \texttt{pmap()} compared to \texttt{map()} or \texttt{map2()}? What is the difference?
\end{itemize}

\hypertarget{glue}{%
\section{Automate plotting}\label{glue}}

\hypertarget{objective}{%
\subsection{Objective}\label{objective}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{map()} and \texttt{glue()} to automate creating multiple plots
\end{itemize}

\hypertarget{problem-1}{%
\subsection{Problem}\label{problem-1}}

\begin{itemize}
\tightlist
\item
  Making the following data visualization process more efficient.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{data}\NormalTok{(}\StringTok{"airquality"}\NormalTok{)}

\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Ozone, }\DataTypeTok{y =}\NormalTok{ Solar.R)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \StringTok{"Relationship between Ozone and Solar.R"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Solar.R"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 42 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-39-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Ozone, }\DataTypeTok{y =}\NormalTok{ Wind)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \StringTok{"Relationship between Ozone and Wind"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Wind"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-39-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Ozone, }\DataTypeTok{y =}\NormalTok{ Temp)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \StringTok{"Relationship between Ozone and Temp"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Temp"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-39-3.pdf}

\hypertarget{solution}{%
\subsection{Solution}\label{solution}}

\begin{itemize}
\item
  Learn how \texttt{glue()} works.
\item
  \texttt{glue()} combines strings and objects, and it is simpler and faster than \texttt{paste()} or \texttt{sprintf()}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{names \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Jae"}\NormalTok{, }\StringTok{"Aniket"}\NormalTok{, }\StringTok{"Avery"}\NormalTok{)}

\NormalTok{fields \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"Political Science"}\NormalTok{, }\StringTok{"Law"}\NormalTok{, }\StringTok{"Public Health"}\NormalTok{)}

\KeywordTok{glue}\NormalTok{(}\StringTok{"\{names\} studies \{fields\}."}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Jae studies Political Science.
## Aniket studies Law.
## Avery studies Public Health.
\end{verbatim}

So, our next step is to combine \texttt{glue()} and \texttt{map()}.

First, let's think about writing a function that includes \texttt{glue()}.

\textbf{Challenge}
How can you create the character vector of column names?
How can you make \texttt{ggplot()} take strings as x and y variable names? (Hint: Type \texttt{?aes\_string()})

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes\_string}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{names}\NormalTok{(airquality)[}\DecValTok{1}\NormalTok{], }\DataTypeTok{y =} \KeywordTok{names}\NormalTok{(airquality)[}\DecValTok{2}\NormalTok{])) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{title =} \KeywordTok{glue}\NormalTok{(}\StringTok{"Relationship between Ozone and \{names(airquality)[2]\}"}\NormalTok{),}
    \DataTypeTok{y =} \KeywordTok{glue}\NormalTok{(}\StringTok{"\{names(airquality)[2]\}"}\NormalTok{)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Removed 42 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-41-1.pdf}

\begin{itemize}
\item
  The next step is to write an automatic plotting function.

  \begin{itemize}
  \tightlist
  \item
    Note that, in the function, the argument \texttt{i} (abstract) replaces \texttt{2} (specific): this is abstraction.
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{create\_point\_plot \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(i) \{}
\NormalTok{  airquality }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes\_string}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{names}\NormalTok{(airquality)[}\DecValTok{1}\NormalTok{], }\DataTypeTok{y =} \KeywordTok{names}\NormalTok{(airquality)[i])) }\OperatorTok{+}
\StringTok{    }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{    }\KeywordTok{labs}\NormalTok{(}
      \DataTypeTok{title =} \KeywordTok{glue}\NormalTok{(}\StringTok{"Relationship between Ozone and \{names(airquality)[i]\}"}\NormalTok{),}
      \DataTypeTok{y =} \KeywordTok{glue}\NormalTok{(}\StringTok{"\{names(airquality)[i]\}"}\NormalTok{)}
\NormalTok{    )}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  The final step is to put the function in \texttt{map()}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(}\DecValTok{2}\OperatorTok{:}\KeywordTok{ncol}\NormalTok{(airquality), create\_point\_plot)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 42 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-43-1.pdf}

\begin{verbatim}
## 
## [[2]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-43-2.pdf}

\begin{verbatim}
## 
## [[3]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-43-3.pdf}

\begin{verbatim}
## 
## [[4]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-43-4.pdf}

\begin{verbatim}
## 
## [[5]]
\end{verbatim}

\begin{verbatim}
## Warning: Removed 37 rows containing missing values (geom_point).
\end{verbatim}

\includegraphics{04_functional_programming_files/figure-latex/unnamed-chunk-43-5.pdf}

\hypertarget{reduce}{%
\section{Automate joining}\label{reduce}}

\hypertarget{objective-1}{%
\subsection{Objective}\label{objective-1}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{reduce()} to automate row-binding multiple dataframes
\end{itemize}

\hypertarget{problem-2}{%
\subsection{Problem}\label{problem-2}}

\begin{itemize}
\tightlist
\item
  How can you make row-binding multiple dataframes more efficient?
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df1 \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{z =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}

\NormalTok{df2 \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{z =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}

\NormalTok{df3 \textless{}{-}}\StringTok{ }\KeywordTok{tibble}\NormalTok{(}
  \DataTypeTok{x =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{y =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{z =} \KeywordTok{sample}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\DataTypeTok{size =} \DecValTok{3}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{copy-and-paste}{%
\subsection{Copy and paste}\label{copy-and-paste}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{first\_bind \textless{}{-}}\StringTok{ }\KeywordTok{bind\_rows}\NormalTok{(df1, df2)}

\NormalTok{second\_bind \textless{}{-}}\StringTok{ }\KeywordTok{bind\_rows}\NormalTok{(first\_bind, df3)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge}
  Why is the above solution not efficient?
\end{itemize}

\hypertarget{reduce-1}{%
\subsection{reduce}\label{reduce-1}}

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/9c239e1227c69b7a2c9c2df234c21f3e1c74dd57/eec0e/diagrams/functionals/reduce.png}
\caption{How reduce() works.}
\end{figure}

\begin{itemize}
\item
  Input: Takes a vector of length n
\item
  Computation: Calls a function with a pair of values at a time
\item
  Output: Returns a vector of length 1
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{reduced \textless{}{-}}\StringTok{ }\KeywordTok{reduce}\NormalTok{(}\KeywordTok{list}\NormalTok{(df1, df2, df3), bind\_rows)}
\end{Highlighting}
\end{Shaded}

\hypertarget{speed}{%
\section{Make automation slower or faster}\label{speed}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install packages }
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(tidyverse, }\CommentTok{\# tidyverse pkgs including purrr}
\NormalTok{               tictoc, }\CommentTok{\# performance test }
\NormalTok{               furrr) }\CommentTok{\# parallel processing  reproducibility }
\end{Highlighting}
\end{Shaded}

\hypertarget{objectives-3}{%
\subsection{Objectives}\label{objectives-3}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{slowly()} and \texttt{future\_} to make the automation process either slower or faster
\end{itemize}

\hypertarget{how-to-make-automation-slower}{%
\subsection{How to Make Automation Slower}\label{how-to-make-automation-slower}}

When scraping 50 pages from a website, you don't want to overload the server. How can you avoid doing so?

\hypertarget{for-loop-2}{%
\subsubsection{For loop}\label{for-loop-2}}

\begin{Shaded}
\begin{Highlighting}[]
\ControlFlowTok{for}\NormalTok{ (i }\ControlFlowTok{in} \DecValTok{1}\OperatorTok{:}\DecValTok{50}\NormalTok{) \{}
  
  \KeywordTok{message}\NormalTok{(}\StringTok{"Scraping page "}\NormalTok{,i)}
  
  \ControlFlowTok{if}\NormalTok{ ((i }\OperatorTok{\%\%}\StringTok{ }\DecValTok{10}\NormalTok{) }\OperatorTok{==}\StringTok{ }\DecValTok{0}\NormalTok{) \{}
    
    \KeywordTok{message}\NormalTok{(}\StringTok{"Break time"}\NormalTok{)}
    
    \KeywordTok{Sys.sleep}\NormalTok{(}\DecValTok{1}\NormalTok{) }\CommentTok{\# 1 second }
\NormalTok{  \}}
  
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\hypertarget{map}{%
\subsubsection{Map}\label{map}}

\begin{itemize}
\tightlist
\item
  \texttt{walk()} works the same as \texttt{map()} but doesn't store its output.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{walk}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{50}\NormalTok{, }\ControlFlowTok{function}\NormalTok{(x)\{}\KeywordTok{message}\NormalTok{(}\StringTok{"Scraping page"}\NormalTok{, x)\})}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  If you're web scraping, one problem with this approach is it's too fast by human standards.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tic}\NormalTok{(}\StringTok{"Scraping pages"}\NormalTok{)}
\KeywordTok{walk}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\ControlFlowTok{function}\NormalTok{(x)\{}\KeywordTok{message}\NormalTok{(}\StringTok{"Scraping page"}\NormalTok{, x)\}) }\CommentTok{\# Anonymous function; I don\textquotesingle{}t name the function }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Scraping page1
\end{verbatim}

\begin{verbatim}
## Scraping page2
\end{verbatim}

\begin{verbatim}
## Scraping page3
\end{verbatim}

\begin{verbatim}
## Scraping page4
\end{verbatim}

\begin{verbatim}
## Scraping page5
\end{verbatim}

\begin{verbatim}
## Scraping page6
\end{verbatim}

\begin{verbatim}
## Scraping page7
\end{verbatim}

\begin{verbatim}
## Scraping page8
\end{verbatim}

\begin{verbatim}
## Scraping page9
\end{verbatim}

\begin{verbatim}
## Scraping page10
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{toc}\NormalTok{(}\DataTypeTok{log =} \OtherTok{TRUE}\NormalTok{) }\CommentTok{\# save toc }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Scraping pages: 0.006 sec elapsed
\end{verbatim}

\begin{itemize}
\tightlist
\item
  If you want to make the function run slowly \ldots{}
\end{itemize}

\begin{quote}
\texttt{slowly()} takes a function and modifies it to wait a given amount of time between each call. --- \texttt{purrr} package vignette
\end{quote}

\begin{itemize}
\tightlist
\item
  If a function is a verb, then a helper function is an adverb (modifying the behavior of the verb).
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 49.05 sec elapsed}

\KeywordTok{tic}\NormalTok{(}\StringTok{"scraping pages with deplay"}\NormalTok{, }\DataTypeTok{log =} \OtherTok{TRUE}\NormalTok{)}

\KeywordTok{walk}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{10}\NormalTok{, }\KeywordTok{slowly}\NormalTok{(}\ControlFlowTok{function}\NormalTok{(x)\{}\KeywordTok{message}\NormalTok{(}\StringTok{"Scraping page"}\NormalTok{, x)\},   }
                    \DataTypeTok{rate =} \KeywordTok{rate\_delay}\NormalTok{(}\DataTypeTok{pause =} \DecValTok{1}\NormalTok{))) }\CommentTok{\# pause = Delay between attempts in seconds}

\KeywordTok{toc}\NormalTok{(}\DataTypeTok{log =} \OtherTok{TRUE}\NormalTok{)}

\KeywordTok{tic.log}\NormalTok{(}\DataTypeTok{format =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{how-to-make-automation-faster}{%
\subsection{How to Make Automation Faster}\label{how-to-make-automation-faster}}

In a different situation, you want to make your function run faster. This is a common situation when you collect and analyze data at a large scale. You can solve this problem using parallel processing. A modern processor has multiple cores, and you can divide tasks among these cores. By default, R uses a single thread (i.e., only one core). You can change this default setting with the following code. For further information on parallel processing in R (there are many other options), read \href{https://yxue-me.com/post/2019-05-12-a-glossary-of-parallel-computing-packages-in-r-2019/}{this review}.

\begin{itemize}
\item
  Parallel processing setup

  \begin{itemize}
  \item
    Step1: Determine the number of max workers (\texttt{availableCores()})
  \item
    Step2: Determine the parallel processing mode (\texttt{plan()})
  \end{itemize}
\end{itemize}

We do \texttt{availableCores()\ -\ 1} to save some processing power for other programs.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Setup }
\NormalTok{n\_cores \textless{}{-}}\StringTok{ }\KeywordTok{availableCores}\NormalTok{() }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}
\NormalTok{n\_cores }\CommentTok{\# This number depends on your computer spec.}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## system 
##      7
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{plan}\NormalTok{(multiprocess, }\CommentTok{\# multicore, if supported, otherwise multisession}
     \DataTypeTok{workers =}\NormalTok{ n\_cores) }\CommentTok{\# the maximum number of workers}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Strategy 'multiprocess' is deprecated in future (>= 1.20.0). Instead,
## explicitly specify either 'multisession' or 'multicore'. In the current R
## session, 'multiprocess' equals 'multicore'.
\end{verbatim}

\textbf{What's the difference between multisession and multicore?}

I skip technical explanations and only focus on their usages.

\begin{itemize}
\tightlist
\item
  multisession: fast and relatively stable. It works across different OSs and also for RStudio.
\item
  multicore: faster but unstable. It doesn't work for Windows/RStudio.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{plan}\NormalTok{(sequential)}

\KeywordTok{tic}\NormalTok{(}\StringTok{"averaging 100000 without parallel processing"}\NormalTok{, }\DataTypeTok{log =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{map100000 \textless{}{-}}\StringTok{ }\KeywordTok{future\_map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{100000}\NormalTok{, mean)}
\KeywordTok{toc}\NormalTok{(}\DataTypeTok{log =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{plan}\NormalTok{(multiprocess, }\CommentTok{\# multicore, if supported, otherwise multisession}
     \DataTypeTok{workers =}\NormalTok{ n\_cores) }\CommentTok{\# the maximum number of workers}

\KeywordTok{tic}\NormalTok{(}\StringTok{"averaging 100000 with parallel processing"}\NormalTok{, }\DataTypeTok{log =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{map100000 \textless{}{-}}\StringTok{ }\KeywordTok{future\_map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{100000}\NormalTok{, mean)}
\KeywordTok{toc}\NormalTok{(}\DataTypeTok{log =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tic.log}\NormalTok{(}\DataTypeTok{format =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [1] "Scraping pages: 0.006 sec elapsed"
\end{verbatim}

Parallel processing does not always increase performance because of the overhead cost (e.g., time spent communicating data between processes). Use this technique either when the computation part is heavy or when you need to repeat the process many times.

\hypertarget{robustness}{%
\section{Make error handling easier}\label{robustness}}

\hypertarget{learning-objective}{%
\subsection{Learning objective}\label{learning-objective}}

\begin{itemize}
\tightlist
\item
  Learning how to use \texttt{safely()} and \texttt{possibly()} to make error handling easier
\end{itemize}

\subsection{Problem}

\begin{itemize}
\tightlist
\item
  \textbf{Challenge}
\item
  Explain why we can't run \texttt{map(url\_list,\ read\_html)}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{url\_list \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}
  \StringTok{"https://en.wikipedia.org/wiki/University\_of\_California,\_Berkeley"}\NormalTok{,}
  \StringTok{"https://en.wikipedia.org/wiki/Stanford\_University"}\NormalTok{,}
  \StringTok{"https://en.wikipedia.org/wiki/Carnegie\_Mellon\_University"}\NormalTok{,}
  \StringTok{"https://DLAB"}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_list, read\_html)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  This is a straightforward problem, so it's easy to tell where the problem is. How can you make your error more informative?
\end{itemize}

\hypertarget{solution-1}{%
\subsection{Solution}\label{solution-1}}

\hypertarget{try-catch}{%
\subsubsection{Try-catch}\label{try-catch}}

\begin{itemize}
\item
  There are three kinds of messages you will run into if your code has an error based on the following functions.

  \begin{itemize}
  \tightlist
  \item
    \texttt{stop()}: errors; Functions must stop.
  \item
    \texttt{warning()}: warnings; Functions may still work. Nonetheless, something is possibly messed up.
  \item
    \texttt{message()}: messages; Some actions happened.
  \end{itemize}
\item
  The basic logic of \texttt{tryCatch()}, R's basic error handling function, works like the following.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{tryCatch}\NormalTok{(}
\NormalTok{  \{}
    \KeywordTok{map}\NormalTok{(url\_list, read\_html)}
\NormalTok{  \},}
  \DataTypeTok{warning =} \ControlFlowTok{function}\NormalTok{(w) \{}
    \StringTok{"Warning"}
\NormalTok{  \},}
  \DataTypeTok{error =} \ControlFlowTok{function}\NormalTok{(e) \{}
    \StringTok{"Error"}
\NormalTok{  \},}
  \DataTypeTok{finally =}\NormalTok{ \{}
    \StringTok{"Message"}
\NormalTok{  \}}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "Error"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Here's the \texttt{purrr} version of the \texttt{try-catch} mechanism (it evaluates code and assigns exception handlers).
\end{itemize}

\hypertarget{safely}{%
\subsubsection{safely}\label{safely}}

\textbf{Outputs}

\begin{itemize}
\tightlist
\item
  result: result or \texttt{NULL}
\item
  error: \texttt{NULL} or \texttt{error}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_list, }\KeywordTok{safely}\NormalTok{(read\_html))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## [[1]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[1]]$error
## NULL
## 
## 
## [[2]]
## [[2]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[2]]$error
## NULL
## 
## 
## [[3]]
## [[3]]$result
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[3]]$error
## NULL
## 
## 
## [[4]]
## [[4]]$result
## NULL
## 
## [[4]]$error
## <simpleError in open.connection(x, "rb"): Could not resolve host: DLAB>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  The easier way to solve this problem is just to avoid the error.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_list, }\KeywordTok{safely}\NormalTok{(read\_html)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{map}\NormalTok{(}\StringTok{"result"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\CommentTok{\# = map(function(x) x[["result"]]) = map(\textasciitilde{}.x[["name"]])}
\StringTok{  }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{compact}\NormalTok{() }\CommentTok{\# Remove empty elements}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[2]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
## 
## [[3]]
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject  ...
\end{verbatim}

\hypertarget{possibly}{%
\subsubsection{possibly}\label{possibly}}

What if the best way to solve the problem is not to ignore the error \ldots{}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# If error occurred, "The URL is broken." will be stored in that element(s).}
\NormalTok{out \textless{}{-}}\StringTok{ }\KeywordTok{map}\NormalTok{(}
\NormalTok{  url\_list,}
  \KeywordTok{possibly}\NormalTok{(read\_html,}
    \DataTypeTok{otherwise =} \StringTok{"The URL is broken."}
\NormalTok{  )}
\NormalTok{)}

\CommentTok{\# Let\textquotesingle{}s find the broken URL.}
\NormalTok{url\_list[out[}\KeywordTok{seq}\NormalTok{(out)] }\OperatorTok{==}\StringTok{ "The URL is broken."}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "https://DLAB"
\end{verbatim}

\hypertarget{products}{%
\chapter{Developing data products}\label{products}}

\begin{quote}
A data product is the production output from a statistical analysis. - \href{https://sites.google.com/view/bcaffo/home}{Brian Caffo}
\end{quote}

\hypertarget{developing-r-packages}{%
\section{Developing R packages}\label{developing-r-packages}}

\hypertarget{the-big-picture-6}{%
\subsection{The Big Picture}\label{the-big-picture-6}}

Why develop R packages?

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Reuse your code
\item
  Automate your workflow
\item
  Help others (be part of an open-source development community)
\end{enumerate}

\hypertarget{workflow}{%
\subsection{Workflow}\label{workflow}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Create a GitHub repo
\item
  Clone the GitHub repo
\item
  Make the cloned repo R package project using \texttt{usethis::create\_package(here())}
\item
  Write code in \texttt{\textbackslash{}R}
\item
  Document code in \texttt{\textbackslash{}man} (automated by \texttt{roxygen2} package)
\end{enumerate}

\begin{itemize}
\tightlist
\item
  \texttt{devtools::document()}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{5}
\tightlist
\item
  Check dependencies in \texttt{NAMESPACE}
\end{enumerate}

\begin{itemize}
\tightlist
\item
  \texttt{devtools::update()} updates the documentation (if you made changes)
\item
  \texttt{devtools::check()} to see whether your package is ready to be submitted to CRAN
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{6}
\tightlist
\item
  Build a package (for more information, read \href{http://r-pkgs.had.co.nz/package.html}{this section} in Hadley's R package development book)
\end{enumerate}

\begin{itemize}
\tightlist
\item
  \texttt{devtools::build()}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{7}
\tightlist
\item
  (Optional) Test (\texttt{devtools::test()}), teach in \texttt{\textbackslash{}vignettes}, and add data in \texttt{\textbackslash{}data}
\item
  Distribute the package either via CRAN or GitHub (don't forget to make sure your repo is public.)
\end{enumerate}

\includegraphics{http://r-pkgs.had.co.nz/diagrams/package-files.png}

It's time to learn five R code states: source, bundled, binary, installed, and in-memory.

If you're using an R package, you're only concerned with the last two states: \texttt{install.packages("pkg")} and \texttt{library(pkg)}. If you're developing an R package, you first write source code (\texttt{*.R}), bundle it (a compressed file like \texttt{*.tar.gz}; done by \texttt{devtools::build()}), and then make it binary (\texttt{devtools::build(binary\ =\ TRUE)}; this is how a package is stored in CRAN/GitHub, etc.).

\hypertarget{required-components}{%
\subsection{Required Components}\label{required-components}}

The 4 required components are necessary to build and distribute a minimally viable R package. The other steps are optional.

\begin{itemize}
\tightlist
\item
  Package

  \begin{itemize}
  \tightlist
  \item
    \texttt{\textbackslash{}R}: R functions
  \item
    \texttt{\textbackslash{}man}: function documentations
  \item
    DESCRIPTION: provides meta data about the package (e.g., author)
  \item
    LICENSE

    \begin{itemize}
    \tightlist
    \item
      GNU, MIT, etc.
    \end{itemize}
  \item
    NAMESPACE: package dependencies (to make your package self-contained)
  \item
    README (optional)
  \end{itemize}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Setup (\textbf{DESCRIPTION})
\end{enumerate}

I assume that you've already created and cloned a git repo. Move to your cloned repo file path in the file system.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This function creates DESCRIPTION file }
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{create\_package}\NormalTok{(}\KeywordTok{here}\NormalTok{())}

\CommentTok{\# License the package }
\CommentTok{\# You can use the MIT license by typing devtools::use\_mit\_license("author name"). The function produces MIT license{-}related files (LICENSE, LICENSE.md).}
\KeywordTok{use\_mit\_license}\NormalTok{(}\StringTok{"Jae Yeon Kim"}\NormalTok{)}

\CommentTok{\# Add news (optional) }
\CommentTok{\# Helps track changes }
\KeywordTok{use\_news\_md}\NormalTok{() }
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Write code (\textbf{R})
\end{enumerate}

If you want to turn your R Markdown file into an R script, use \texttt{knitr::purl(input\ =\ "x.Rmd",\ output\ =\ "x.R")}.
The \href{https://thinkr-open.github.io/fusen/}{fusen package} helps to develop an R package based on R Markdown files.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_r}\NormalTok{(}\StringTok{"rbind\_mutate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\textquotesingle{} Add two numbers}
\CommentTok{\#\textquotesingle{}}
\CommentTok{\#\textquotesingle{} @param x A number}
\CommentTok{\#\textquotesingle{} @param y A number}
\CommentTok{\#\textquotesingle{} @return The sum of x and y }
\CommentTok{\#\textquotesingle{} @export}

\NormalTok{add \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x, y)\{}
  
\NormalTok{  x }\OperatorTok{+}\StringTok{ }\NormalTok{y}
  
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

If you used a function from other packages, you need to reference it in the following way: \texttt{\#\textquotesingle{}\ @importFrom\ \textless{}package\textgreater{}\ \textless{}function\textgreater{}}

Many of us use the \texttt{\%\textgreater{}\%} operator in our code. If you want to add this to your documentation, do \texttt{usethis::use\_pipe()}.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Document (\textbf{man})
\end{enumerate}

This documentation is for the function manual.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Document }
\CommentTok{\# The function creates documentation related files (NAMESPACE, function\_name.rd)}
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{document}\NormalTok{()}

\CommentTok{\# Check; updates the documentation; builds and checks the package }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{check}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\tightlist
\item
  Organize (\textbf{NAMESPACE})
\end{enumerate}

This documentation is for \href{https://en.wikipedia.org/wiki/Namespace}{namespace}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_package}\NormalTok{(}\StringTok{"dplyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{optional-components}{%
\subsection{Optional Components}\label{optional-components}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Test (\textbf{test})
\end{enumerate}

Although I said optional, automated unit testing is not optional when you're writing a complex package. Testing will save you tons of time and energy.

\begin{itemize}
\tightlist
\item
  Setup
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_testthat}\NormalTok{()}

\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_test}\NormalTok{(}\StringTok{"rbind\_mutate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Testing
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Make changes }

\CommentTok{\# Load functions }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{load\_all}\NormalTok{()}

\CommentTok{\# Test }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{test}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Add data (\textbf{data})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x \textless{}{-}}\StringTok{ "Jae"}
\NormalTok{y \textless{}{-}}\StringTok{ "Sun"}
\NormalTok{z \textless{}{-}}\StringTok{ "Jane"}

\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_data}\NormalTok{(x, y, z, }\DataTypeTok{overwrite =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Teach (\textbf{vignettes})
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_vignette}\NormalTok{(}\StringTok{"rbind\_mutate"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{title}\OperatorTok{:}\StringTok{ "Vignette title"}
\NormalTok{author}\OperatorTok{:}\StringTok{ "Vignette author"}
\NormalTok{date}\OperatorTok{:}\StringTok{ "2022{-}01{-}30"}
\NormalTok{output}\OperatorTok{:}\StringTok{ }\NormalTok{rmarkdown}\OperatorTok{::}\NormalTok{html\_vignette}
\NormalTok{vignette}\OperatorTok{:}\StringTok{ }\NormalTok{blah blah}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  You can build a package website using \texttt{pkgdown}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# install.packages("pkgdown")}
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_pkgdown}\NormalTok{()}
\NormalTok{pkgdown}\OperatorTok{::}\KeywordTok{build\_site}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  A package site includes information on METADATA, Function references, Articles, News, etc.
\end{itemize}

\hypertarget{building-an-r-package}{%
\subsection{Building an R package}\label{building-an-r-package}}

\begin{itemize}
\tightlist
\item
  CMD (in the terminal)
\end{itemize}

You can run R commands in the terminal using R CMD.

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{R}\NormalTok{ CMD build mypkg }
\ExtensionTok{R}\NormalTok{ CMD INSTALL mypkg }
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  devtools
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Build }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{build}\NormalTok{()}

\CommentTok{\# Install }
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\hypertarget{distributing-an-r-package}{%
\subsection{Distributing an R package}\label{distributing-an-r-package}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Version update }
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_version}\NormalTok{()}

\CommentTok{\# Spell check}
\NormalTok{usethis}\OperatorTok{::}\KeywordTok{use\_spell\_check}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  \href{https://cran.r-project.org/}{CRAN (The Comprehensive R Archive Network)}
\end{enumerate}

\begin{itemize}
\tightlist
\item
  R package submission should comply with \href{https://cran.r-project.org/}{the CRAN Repository Policy}
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  GitHub
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Push everything to the Git repository (you can do it using command-line interface or RStudio).
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{git}\NormalTok{ add . }
\FunctionTok{git}\NormalTok{ commit {-}m }\StringTok{"first push"}
\FunctionTok{git}\NormalTok{ push }
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Don't forget that your repository should be \texttt{public}.
\item
  I highly recommend connecting GitHub with SSH. For more information, visit \href{https://docs.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh}{this link}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{git}\NormalTok{ remote set{-}url origin git@github.com:user/repo }
\end{Highlighting}
\end{Shaded}

\textbf{Additional tips}

Sometimes, you get the following error: ``Undefined global functions or variables''. If you experience this problem, save the following script as \texttt{globals.r}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{utils}\OperatorTok{::}\KeywordTok{globalVariables}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"\textless{}undefined variable name1\textgreater{}"}\NormalTok{, }\StringTok{"\textless{}undefined variable name2\textgreater{}"}\NormalTok{, }\StringTok{"\textless{}undefinedvariable name3"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "<undefined variable name1>" "<undefined variable name2>"
## [3] "<undefinedvariable name3"
\end{verbatim}

\hypertarget{developing-shiny-apps}{%
\section{Developing Shiny apps}\label{developing-shiny-apps}}

\href{https://shiny.rstudio.com/}{Shiny} is a ``framework for creating web applications using R code'' (\href{https://mastering-shiny.org/}{Wickham 2021}). You can create a \href{https://rstudio.github.io/shinydashboard/}{dashboard} or an \href{https://rviews.rstudio.com/2019/10/09/building-interactive-world-maps-in-shiny/}{interactive map} without knowing anything about HTML, CSS, or JavaScript. Developing a shiny app helps people with little technical expertise learn from your data analysis intuitively and interactively.

To learn more about Shiny applications, see \href{https://blog.rstudio.com/2020/07/13/winners-of-the-2nd-shiny-contest/}{the Winners of the 2nd Annual Shiny Contest} hosted by RStudio.

\hypertarget{workflow-1}{%
\subsection{Workflow}\label{workflow-1}}

The workflow follows what Hadley Wickham recommended in his book on mastering shiny.

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Install libraries
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"shiny"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Create app directory and file
\end{enumerate}

Add an \texttt{app.R} file.

The key objective here is defining your UI (User interface; how the app looks; front-end = INPUT) (defined in object \texttt{ui}) and server (how the app works; back-end = OUTPUT) (defined in object \texttt{server}). Shiny uses \textbf{reactive programming}. So if you change inputs on the user side, outputs will be automatically updated on the server end.

If you're creating a complex app, you can achieve the same goal with two files: \texttt{ui.R} and \texttt{server.R}.

\hypertarget{app.r}{%
\subsection{app.r}\label{app.r}}

\begin{itemize}
\tightlist
\item
  Front-end
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Load packages }
\CommentTok{\# Do not use install.packages(), pacman::p\_load(), or library() if you intend to deploy the app using shinyapps.io }

\KeywordTok{require}\NormalTok{(}\StringTok{"wordcloud2"}\NormalTok{)}
\KeywordTok{require}\NormalTok{(}\StringTok{"shiny"}\NormalTok{)}
\KeywordTok{require}\NormalTok{(}\StringTok{"shinydashboard"}\NormalTok{)}
\KeywordTok{require}\NormalTok{(}\StringTok{"colourpicker"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Load data }

\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{read.csv}\NormalTok{(}\KeywordTok{url}\NormalTok{(}\StringTok{"https://github.com/jaeyk/covid19antiasian/raw/master/processed\_data/hash\_counts.csv"}\NormalTok{))[,}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\texttt{fluidPage()}: provides the layout for the UI

\texttt{sliderInput()}: one of the input controls (e.g., \texttt{selectInput()}, \texttt{textInput()}, \texttt{numericInput()})

\texttt{wordcloud2Output()}: one of the output controls (e.g., \texttt{tableOutput()})

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Defines the user interface; how the app looks}

\NormalTok{ui \textless{}{-}}\StringTok{ }\KeywordTok{fluidPage}\NormalTok{(}
  
    \CommentTok{\# Application title }
    \KeywordTok{titlePanel}\NormalTok{(}\StringTok{"Word Cloud on the Hashtags of the Tweets related to COVID{-}19 \& Asian|Chinese|Wuhan"}\NormalTok{),}
  
    \KeywordTok{h4}\NormalTok{(tags}\OperatorTok{$}\KeywordTok{a}\NormalTok{(}\DataTypeTok{href =} \StringTok{"https://jaeyk.github.io/"}\NormalTok{, }\StringTok{"Developer: Jae Yeon Kim"}\NormalTok{)),}
            
    \KeywordTok{sidebarLayout}\NormalTok{(}
      
      \CommentTok{\# Sidebar with sliders }
      \KeywordTok{sidebarPanel}\NormalTok{(}
        \KeywordTok{sliderInput}\NormalTok{(}\StringTok{"size"}\NormalTok{, }\CommentTok{\# Input ID: input$size }
                    \StringTok{"Font size:"}\NormalTok{,}
                    \DataTypeTok{min =} \DecValTok{1}\NormalTok{, }\DataTypeTok{max =} \DecValTok{10}\NormalTok{,}
                    \DataTypeTok{value =} \DecValTok{2}\NormalTok{)}
\NormalTok{      ),}
    
    \KeywordTok{mainPanel}\NormalTok{(}
          
          \KeywordTok{wordcloud2Output}\NormalTok{(}\StringTok{"cloud"}\NormalTok{),}
        
\NormalTok{        )}
    
\NormalTok{    )}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Back-end
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{server \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(input, output, session) \{}
  
\NormalTok{  output}\OperatorTok{$}\NormalTok{cloud \textless{}{-}}\StringTok{ }\KeywordTok{renderWordcloud2}\NormalTok{(\{ }
    
    \KeywordTok{wordcloud2}\NormalTok{(df, }
               \DataTypeTok{size =}\NormalTok{ input}\OperatorTok{$}\NormalTok{size, }
               \DataTypeTok{color =} \StringTok{"random{-}dark"}\NormalTok{) }
    
\NormalTok{    \})}

\NormalTok{  \}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Build a shiny app
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{shinyApp}\NormalTok{(}\DataTypeTok{ui =}\NormalTok{ ui, }\DataTypeTok{server =}\NormalTok{ server)}
\end{Highlighting}
\end{Shaded}

\hypertarget{deployment}{%
\subsection{Deployment}\label{deployment}}

\begin{itemize}
\tightlist
\item
  Deploy to \href{https://www.shinyapps.io/?_ga=2.5503866.871102833.1602978469-100003412.1602392815}{the shinyapps.io cloud}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install packages }
\KeywordTok{install.packages}\NormalTok{(}\StringTok{"rsconnect"}\NormalTok{)}
\KeywordTok{library}\NormalTok{(rsconnect)}

\CommentTok{\# Setup }
\NormalTok{rsconnect}\OperatorTok{::}\KeywordTok{setAccountInfo}\NormalTok{(}\DataTypeTok{name =} \StringTok{"\textless{}Account name\textgreater{}"}\NormalTok{, }
                          \DataTypeTok{token =} \StringTok{"\textless{}Token\textgreater{}"}\NormalTok{,}
                          \DataTypeTok{secret =} \StringTok{"\textless{}Secret\textgreater{}"}\NormalTok{)}

\NormalTok{rsconnect}\OperatorTok{::}\KeywordTok{deployApp}\NormalTok{(}\DataTypeTok{appNames =} \StringTok{"\textless{}App name\textgreater{}"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{references-3}{%
\subsection{References}\label{references-3}}

\href{https://mastering-shiny.org/}{Mastering Shiny} by Hadley Wickham. For newbies.

\href{https://bookdown.org/yihui/rmarkdown/shiny-documents.html}{Shiny Documents} by Yihui Xie

\href{https://engineering-shiny.org/}{Engineering Production-Grade Shiny Apps} by Colin Fay, Sébastien Rochette, Vincent Guyader, Cervan Girard.

\href{https://stat545.com/shiny-tutorial.html}{Building Shiny Apps} by Dean Attali.

\hypertarget{other-useful-data-products}{%
\section{Other useful data products}\label{other-useful-data-products}}

\begin{itemize}
\tightlist
\item
  Automating data reports using rmarkdown (called \href{https://rmarkdown.rstudio.com/developer_parameterized_reports.html\%23parameter_types\%2F}{parameterized reports})
\item
  Automating R presentation using \href{http://slidify.org/index.html}{slidify}
\item
  Creating interactive web apps using \href{https://rstudio.github.io/leaflet/}{leaflet}
\end{itemize}

\hypertarget{semi_structured_data}{%
\chapter{Semi-structured data}\label{semi_structured_data}}

\hypertarget{setup-4}{%
\section{Setup}\label{setup-4}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install packages }
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(tidyverse, }\CommentTok{\# tidyverse pkgs including purrr}
\NormalTok{               furrr, }\CommentTok{\# parallel processing }
\NormalTok{               tictoc, }\CommentTok{\# performance test  }
\NormalTok{               tcltk, }\CommentTok{\# GUI for choosing a dir path }
\NormalTok{               tidyjson, }\CommentTok{\# tidying JSON files }
\NormalTok{               XML, }\CommentTok{\# parsing XML}
\NormalTok{               rvest, }\CommentTok{\# parsing HTML}
\NormalTok{               jsonlite, }\CommentTok{\# downloading JSON file from web}
\NormalTok{               glue, }\CommentTok{\# pasting string and objects}
\NormalTok{               xopen, }\CommentTok{\# open URLs in browser }
\NormalTok{               urltools, }\CommentTok{\# regex and url parsing }
\NormalTok{               here) }\CommentTok{\# computational reproducibility}

\CommentTok{\#\# Install the current development version from GitHub}
\NormalTok{devtools}\OperatorTok{::}\KeywordTok{install\_github}\NormalTok{(}\StringTok{"jaeyk/tidytweetjson"}\NormalTok{, }\DataTypeTok{dependencies =} \OtherTok{TRUE}\NormalTok{) ; }\KeywordTok{library}\NormalTok{(tidytweetjson)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Skipping install of 'tidytweetjson' from a github remote, the SHA1 (9a00ec8a) has not changed since last install.
##   Use `force = TRUE` to force installation
\end{verbatim}

\hypertarget{the-big-picture-7}{%
\section{The Big Picture}\label{the-big-picture-7}}

\begin{itemize}
\tightlist
\item
  Automating the process of turning semi-structured data (input) into structured data (output)
\end{itemize}

\hypertarget{what-is-semi-structured-data}{%
\section{What is semi-structured data?}\label{what-is-semi-structured-data}}

\begin{quote}
Semi-structured data is a form of structured data that does not obey the tabular structure of data models associated with relational databases or other forms of data tables, but nonetheless contains tags or other markers to separate semantic elements and enforce hierarchies of records and fields within the data. Therefore, it is also known as a self-describing structure. - \href{https://en.wikipedia.org/wiki/Semi-structured_data\#:~:text=Semi\%2Dstructured\%20data\%20is\%20a,and\%20fields\%20within\%20the\%20data.}{Wikipedia}
\end{quote}

\begin{itemize}
\tightlist
\item
  Examples: HTML (e.g., websites), XML (e.g., government data), JSON (e.g., social media API)
\end{itemize}

Below is what JSON (a tweet) looks like.

\begin{itemize}
\item
  A tree-like structure
\item
  Keys and values (key: value)
\end{itemize}

\begin{verbatim}
{
  "created_at": "Thu Apr 06 15:24:15 +0000 2017",
  "id_str": "850006245121695744",
  "text": "1/ Today we\u2019re sharing our vision for the future of the Twitter API platform!\nhttps://t.co/XweGngmxlP",
  "user": {
    "id": 2244994945,
    "name": "Twitter Dev",
    "screen_name": "TwitterDev",
    "location": "Internet",
    "url": "https:\/\/dev.twitter.com\/",
    "description": "Your official source for Twitter Platform news, updates & events. Need technical help? Visit https:\/\/twittercommunity.com\/ \u2328\ufe0f #TapIntoTwitter"
  }
}
\end{verbatim}

\begin{itemize}
\item
  Why should we care about semi-structured data?

  \begin{itemize}
  \tightlist
  \item
    Because this is what the data frontier looks like: \# of unstructured data \textgreater{} \# of semi-structured data \textgreater{} \# of structured data
  \item
    There are easy and fast ways to turn semi-structured data into structured data (ideally in a tidy format) using R, Python, and command-line tools. See my own examples (\href{https://github.com/jaeyk/tidyethnicnews}{tidyethnicnews} and \href{https://github.com/jaeyk/tidytweetjson}{tidytweetjson}).
  \end{itemize}
\end{itemize}

\hypertarget{workflow-2}{%
\section{Workflow}\label{workflow-2}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Import/connect to a semi-structured file using \texttt{rvest}, \texttt{jsonlite}, \texttt{xml2}, \texttt{pdftools}, \texttt{tidyjson}, etc.
\item
  Define target elements in a single file and extract them
\end{enumerate}

\begin{itemize}
\item
  \href{https://readr.tidyverse.org/}{\texttt{readr}} package provides \texttt{parse\_} functions that are useful for vector parsing.
\item
  \href{https://stringr.tidyverse.org/}{\texttt{stringr}} package for string manipulations (e.g., using regular expressions in a tidy way). Quite useful for parsing PDF files (see \href{https://themockup.blog/posts/2020-04-03-beer-and-pdftools-a-vignette/}{this example}).
\item
  \href{https://github.com/tidyverse/rvest}{\texttt{rvest}} package for parsing HTML (R equivalent to \texttt{beautiful\ soup} in Python)
\item
  \href{https://github.com/sailthru/tidyjson}{\texttt{tidyjson}} package for parsing JSON data
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\item
  Create a list of files (in this case URLs) to parse
\item
  Write a parsing function
\item
  Automate parsing process
\end{enumerate}

\hypertarget{htmlcss-web-scraping}{%
\section{HTML/CSS: web scraping}\label{htmlcss-web-scraping}}

Let's go back to the example we covered in the earlier chapter of the book.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{url\_list \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}
  \StringTok{"https://en.wikipedia.org/wiki/University\_of\_California,\_Berkeley"}\NormalTok{,}
  \StringTok{"https://en.wikipedia.org/wiki/Stanford\_University"}\NormalTok{,}
  \StringTok{"https://en.wikipedia.org/wiki/Carnegie\_Mellon\_University"}\NormalTok{,}
  \StringTok{"https://DLAB"}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Step 1: Inspection
\end{itemize}

Examine the Berkeley website so that we can identify a node that indicates the school's motto. Then, if you're using Chrome, select the element you are interested in, then \texttt{right\ click\ \textgreater{}\ inspect\ \textgreater{}\ copy\ full\ xpath.}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{url \textless{}{-}}\StringTok{ "https://en.wikipedia.org/wiki/University\_of\_California,\_Berkeley"}

\KeywordTok{download.file}\NormalTok{(url, }\DataTypeTok{destfile =} \StringTok{"scraped\_page.html"}\NormalTok{, }\DataTypeTok{quiet =} \OtherTok{TRUE}\NormalTok{)}

\NormalTok{target \textless{}{-}}\StringTok{ }\KeywordTok{read\_html}\NormalTok{(}\StringTok{"scraped\_page.html"}\NormalTok{)}

\CommentTok{\# If you want character vector output}
\NormalTok{target }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{html\_nodes}\NormalTok{(}\DataTypeTok{xpath =} \StringTok{"/html/body/div[3]/div[3]/div[5]/div[1]/table[1]"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{html\_text}\NormalTok{() }

\CommentTok{\# If you want table output }
\NormalTok{target }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{html\_nodes}\NormalTok{(}\DataTypeTok{xpath =} \StringTok{"/html/body/div[3]/div[3]/div[5]/div[1]/table[1]"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{html\_table}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Step 2: Write a function
\end{itemize}

I highly recommend making your function work slowly by wrapping it with \href{https://purrr.tidyverse.org/reference/insistently.html}{\texttt{slowly()}}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{get\_table\_from\_wiki \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(url)\{}
  
  \KeywordTok{download.file}\NormalTok{(url, }\DataTypeTok{destfile =} \StringTok{"scraped\_page.html"}\NormalTok{, }\DataTypeTok{quiet =} \OtherTok{TRUE}\NormalTok{)}

\NormalTok{  target \textless{}{-}}\StringTok{ }\KeywordTok{read\_html}\NormalTok{(}\StringTok{"scraped\_page.html"}\NormalTok{)}
  
\NormalTok{  table \textless{}{-}}\StringTok{ }\NormalTok{target }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{html\_nodes}\NormalTok{(}\DataTypeTok{xpath =} \StringTok{"/html/body/div[3]/div[3]/div[5]/div[1]/table[1]"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{html\_table}\NormalTok{() }
  
  \KeywordTok{return}\NormalTok{(table)}
\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Step 3: Test
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{get\_table\_from\_wiki}\NormalTok{(url\_list[[}\DecValTok{2}\NormalTok{]])}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Step 4: Automation
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_list, get\_table\_from\_wiki)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Step 5: Error handling
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map}\NormalTok{(url\_list, }\KeywordTok{safely}\NormalTok{(get\_table\_from\_wiki)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{map}\NormalTok{(}\StringTok{"result"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\CommentTok{\# = map(function(x) x[["result"]]) = map(\textasciitilde{}.x[["result"]])}
\StringTok{  }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{compact}\NormalTok{() }\CommentTok{\# Remove empty elements}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# If error occurred, "The URL is broken." will be stored in that element(s).}
\NormalTok{out \textless{}{-}}\StringTok{ }\KeywordTok{map}\NormalTok{(}
\NormalTok{  url\_list,}
  \KeywordTok{possibly}\NormalTok{(get\_table\_from\_wiki,}
    \DataTypeTok{otherwise =} \StringTok{"The URL is broken."}
\NormalTok{  )}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{xmljson-government-databasesocial-media-scraping}{%
\section{XML/JSON: government database/social media scraping}\label{xmljson-government-databasesocial-media-scraping}}

\hypertarget{governemnt-database-xml}{%
\subsection{Government database (XML)}\label{governemnt-database-xml}}

The following tax return data example comes from the U.S. Internal Revenue Service (IRS) Amazon database. \href{https://www.irs.gov/pub/irs-pdf/f990.pdf}{This PDF file} shows what the original document looks like.

\textbf{Workflow}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Get the XML link and parse it
\item
  Go to the root of the XML document
\item
  Identify a specific node you care about
\item
  Get values related to that node
\end{enumerate}

\begin{figure}
\centering
\includegraphics{https://www.w3schools.com/xml/nodetree.gif}
\caption{XML DOM (Document Object Model). Source: \url{https://www.w3schools.com}}
\end{figure}

Step 1: Get an XML document link

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xml\_link \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"http://s3.amazonaws.com/irs{-}form{-}990/201910919349301206\_public.xml"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

Step 2: Get the page and parse the XML document.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xml\_root \textless{}{-}}\StringTok{ }\NormalTok{xml\_link }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Get page and parse xml }
\StringTok{  }\KeywordTok{xmlTreeParse}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Get root}
\StringTok{  }\KeywordTok{xmlRoot}\NormalTok{()}

\CommentTok{\# Data output: list }
\KeywordTok{typeof}\NormalTok{(xml\_root) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "list"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Two elements. Our target is the second one.}
\KeywordTok{summary}\NormalTok{(xml\_root)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##              Length Class   Mode
## ReturnHeader 11     XMLNode list
## ReturnData    6     XMLNode list
\end{verbatim}

Step 3: Get nodes

We grab the mission statement of this org from its tax report (990). \texttt{//} is an \href{https://www.w3schools.com/xml/xpath_syntax.asp}{XPath syntax} that helps to ``select nodes in the document from the current node that matches the selection no matter where they are.''

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xml\_root }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{pluck}\NormalTok{(}\DecValTok{2}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Second element (Return Data)}
\StringTok{  }\KeywordTok{getNodeSet}\NormalTok{(}\StringTok{"//MissionDesc"}\NormalTok{) }\CommentTok{\# Mission statement }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
## <MissionDesc>DISTRIBUTION OF LITERATURE, MUSIC, AND OTHER RELATED RESOURCES WHICH COMPLIMENT LITERATURE; SUPPORT OF MINISTRIES.</MissionDesc>
\end{verbatim}

Step 4: Get values

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xml\_root }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{pluck}\NormalTok{(}\DecValTok{2}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Second element (Return Data)}
\StringTok{  }\KeywordTok{getNodeSet}\NormalTok{(}\StringTok{"//MissionDesc"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Mission statement }
\StringTok{  }\KeywordTok{xmlValue}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "DISTRIBUTION OF LITERATURE, MUSIC, AND OTHER RELATED RESOURCES WHICH COMPLIMENT LITERATURE; SUPPORT OF MINISTRIES."
\end{verbatim}

\hypertarget{social-media-api-json}{%
\subsection{Social media API (JSON)}\label{social-media-api-json}}

\hypertarget{objectives-4}{%
\subsubsection{Objectives}\label{objectives-4}}

\begin{itemize}
\tightlist
\item
  Learning what kind of social media data are accessible through application programming interfaces (APIs)
\end{itemize}

\textbf{Review question}

In the previous session, we learned the difference between semi-structured data and structured data. Can anyone tell us the difference between them?

\hypertarget{the-big-picture-for-digital-data-collection}{%
\subsubsection{The big picture for digital data collection}\label{the-big-picture-for-digital-data-collection}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Input: semi-structured data
\item
  Output: structured data
\item
  Process:

  \begin{itemize}
  \item
    Getting \textbf{target data} from a remote server

    \begin{itemize}
    \tightlist
    \item
      The target data is usually massive (\textgreater{}10 GB) by the traditional social science standard.
    \end{itemize}
  \item
    Parsing the target data on your laptop/database

    \begin{itemize}
    \tightlist
    \item
      Laptop (sample-parse): Downsample the large target data and parse it on your laptop. This is just one option to \href{https://rviews.rstudio.com/2019/07/17/3-big-data-strategies-for-r/}{deal with big data in R}. It's a simple strategy that doesn't require storing target data in your database.
    \end{itemize}
  \item
    Database (push-parse): Push the large target data to a database, then explore, select, and filter it. If you are interested in using this option, check out my \href{https://github.com/dlab-berkeley/sql-for-r-users}{SQL for R Users} workshop.
  \end{itemize}
\end{enumerate}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-07-01-3-big-data-paradigms-for-r_files/sample_model.png}
\caption{Sample-Parse. From RStudio.}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-07-01-3-big-data-paradigms-for-r_files/push_data.png}
\caption{Push-Parse. From RStudio.}
\end{figure}

\begin{itemize}
\item
  But what exactly is this target data?

  \begin{itemize}
  \item
    When you scrape websites, you mostly deal with HTML (defines a structure of a website), CSS (its style), and JavaScript (its dynamic interactions).
  \item
    When you access social media data through API, you deal with either XML or JSON (major formats for storing and transporting data; they are light and flexible).
  \item
    XML and JSON have tree-like (nested; a root and branches) structures and keys and values (or elements and attributes).
  \item
    If HTML, CSS, and JavaScript are storefronts, then XML and JSON are warehouses.
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://upload.wikimedia.org/wikipedia/commons/9/97/Automatisches_Kleinteilelager.jpg}
\caption{By Andreas Praefcke (Own work), via Wikimedia Commons}
\end{figure}

\hypertarget{opportunities-and-challenges-for-parsing-social-media-data}{%
\subsubsection{Opportunities and challenges for parsing social media data}\label{opportunities-and-challenges-for-parsing-social-media-data}}

This explanation draws on Pablo Barberá's \href{http://pablobarbera.com/social-media-workshop/social-media-slides.pdf}{LSE social media workshop slides}.

\textbf{Basic information}

\begin{itemize}
\item
  What is an API?: An interface (you can think of it as something akin to a restaurant menu. API parameters are API menu items.)

  \begin{itemize}
  \item
    \href{https://en.wikipedia.org/wiki/Representational_state_transfer}{REST} (Representational state transfer) API: static information (e.g., user profiles, list of followers and friends)
  \item
    \href{https://blog.axway.com/amplify/api-management/streaming-apis\#:~:text=Streaming\%20APIs\%20are\%20used\%20to,a\%20subset\%20of\%20Streaming\%20APIS.}{Streaming} API: dynamic information (e.g., new tweets)
  \end{itemize}
\end{itemize}

\textbf{Why should we care?}

\begin{itemize}
\item
  API is the new data frontier. \href{https://www.programmableweb.com/apis/directory}{ProgrammableWeb} shows that there are more than 24,046 APIs as of April 1, 2021.

  \begin{itemize}
  \item
    Big and streaming (real-time) data
  \item
    High-dimensional data (e.g., text, image, video, etc.)
  \item
    Lots of analytic opportunities (e.g., time-series, network, spatial analysis)
  \end{itemize}
\item
  Also, this type of data has many limitations (external validity, algorithmic bias, etc.).
\item
  Think about taking the API + approach (i.e., API not replacing but augmenting traditional data collection)
\end{itemize}

\textbf{How API works}

Request (you form a request URL) \textless-\textgreater{} Response (the API responds to your request by sending you data, usually in JSON format)

\includegraphics{https://mk0appinventiv4394ey.kinstacdn.com/wp-content/uploads/sites/1/2018/05/What-are-APIs-Learn-How-API-Works.jpg}

\textbf{API Statuses}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Twitter
\end{enumerate}

\begin{itemize}
\item
  Twitter API is still widely accessible (\href{https://developer.twitter.com/en/docs/twitter-api/early-access}{v2}).
\item
  In January 2021, Twitter introduced the \href{https://developer.twitter.com/en/solutions/academic-research}{academic Twitter API} that allows generous access to Twitter's historical data for academic researchers

  \begin{itemize}
  \item
    Many R packages exist for the Twitter API: \href{https://cran.r-project.org/web/packages/rtweet/rtweet.pdf}{rtweet} (REST + streaming), \href{https://github.com/pablobarbera/twitter_ideology/tree/master/pkg/tweetscores}{tweetscores} (REST), \href{https://github.com/pablobarbera/streamR}{streamR} (streaming)
  \item
    Some notable limitations. If Twitter users don't share their tweets' locations (e.g., GPS), you can't collect them.
  \end{itemize}
\end{itemize}

\begin{quote}
Twitter data is unique from data shared by most other social platforms because it reflects information that users \emph{choose} to share publicly. Our API platform provides broad access to public Twitter data that users have chosen to share with the world. - Twitter Help Center
\end{quote}

\begin{itemize}
\tightlist
\item
  What does this policy mean? If Twitter users don't share their tweets' locations (e.g., GPS), you can't collect them. However, you can get around this problem to identify a user's location based on their self-reported profile.
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Other APIs
\end{enumerate}

The following comments draw on Alexandra Siegel's talk on ``Collecting and Analyzing Social Media Data'' given at Montréal Methods Workshops.

\begin{itemize}
\item
  \href{https://developers.facebook.com/}{Facebook API} access has become constrained since the 2016 U.S. election.

  \begin{itemize}
  \item
    Exception: \href{https://socialscience.one/blog/unprecedented-facebook-urls-dataset-now-available-research-through-social-science-one}{Social Science One}.
  \item
    Also, check out \href{https://www.crowdtangle.com/}{Crowdtangle} for collecting public FB page data
  \item
    Using FB ads is still a popular method, especially among scholars studying developing countries.
  \end{itemize}
\item
  \href{https://developers.google.com/youtube/v3}{YouTube API}: generous access + (computer-generated) transcript in many languages

  \begin{itemize}
  \tightlist
  \item
    Documentation on \href{https://developers.google.com/youtube/v3/docs/captions}{captions} from YouTube
  \end{itemize}
\item
  \href{https://www.instagram.com/developer/}{Instagram API}: Data from public accounts are available.
\item
  \href{https://www.reddit.com/dev/api/}{Reddit API}: Well-annotated text data suitable for machine learning
\end{itemize}

\textbf{Upside}

\begin{itemize}
\tightlist
\item
  Legal and well-documented.
\end{itemize}

Web scraping (Wild Wild West) \textless\textgreater{} API (Big Gated Garden)

\begin{itemize}
\item
  You have legal but limited access to (growing) big data that can be divided into text, image, and video and transformed into cross-sectional (geocodes), longitudinal (timestamps), and historical event data (hashtags). See Zachary C. Steinert-Threlkeld's \href{https://github.com/ZacharyST/APSA2020_EventDataFromSocialMedia}{2020 APSA Short Course Generating Event Data From Social Media}.
\item
  Social media data are also well-organized, managed, and curated data. It's easy to navigate because XML and JSON have keys and values. If you find the keys, you will find the observations you are looking for.
\end{itemize}

\textbf{Downside}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Rate-limited.
\item
  If you want to access more and various data than those available, you need to pay for premium access.
\end{enumerate}

\hypertarget{next-steps}{%
\subsection{Next steps}\label{next-steps}}

We will learn how to access and collect data using the Twitter and New York Times APIs. We are going to learn this in two ways: (1) using plug-and-play packages (both using RStudio and the terminal) and (2) getting API data from scratch (\texttt{httr}, \texttt{jsonlite}).

First, sign up for the Twitter developer account before everything else. If you want to know how to sign up for a new Twitter developer account and access Twitter API, see Steinert-Threlkeld's \href{https://github.com/ZacharyST/APSA2020_EventDataFromSocialMedia/blob/master/Presentation/02_AccessTwitter.pdf}{APSA workshop slides}.

\hypertarget{rtweet}{%
\subsection{rtweet}\label{rtweet}}

The \texttt{rtweet} examples come from \href{https://cbail.github.io/SICSS_APIs_markdown.html}{Chris Bail's tutorial}.

\hypertarget{setup-5}{%
\subsubsection{Setup}\label{setup-5}}

The first thing you need to do is set up.

Assuming that you already signed up for a Twitter developer account

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{app\_name \textless{}{-}}\StringTok{ "YOUR APP NAME"}
\NormalTok{consumer\_key \textless{}{-}}\StringTok{ "YOUR CONSUMER KEY"}
\NormalTok{consumer\_secret \textless{}{-}}\StringTok{ "YOUR CONSUMER SECRET"}

\NormalTok{rtweet}\OperatorTok{::}\KeywordTok{create\_token}\NormalTok{(}\DataTypeTok{app =}\NormalTok{ app\_name, }
                     \DataTypeTok{consumer\_key =}\NormalTok{ consumer\_key, }
                     \DataTypeTok{consumer\_secret =}\NormalTok{ consumer\_secret)}
\end{Highlighting}
\end{Shaded}

\hypertarget{search-api}{%
\subsubsection{Search API}\label{search-api}}

Using the \textbf{search API}: this API returns a collection of Tweets mentioning a particular query.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Install and load rtweet }
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(pacman)) \{}\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)\}}

\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(rtweet)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The past 6{-}9 days }
\NormalTok{rt \textless{}{-}}\StringTok{ }\KeywordTok{search\_tweets}\NormalTok{(}\DataTypeTok{q =} \StringTok{"\#stopasianhate"}\NormalTok{, }\DataTypeTok{n =} \DecValTok{1000}\NormalTok{, }\DataTypeTok{include\_rts =} \OtherTok{FALSE}\NormalTok{)}

\CommentTok{\# The longer term }
\CommentTok{\# search\_fullarchive() premium service}

\KeywordTok{head}\NormalTok{(rt}\OperatorTok{$}\NormalTok{text)}
\end{Highlighting}
\end{Shaded}

Can you guess what would be the class type of rt?

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(rt)}
\end{Highlighting}
\end{Shaded}

What would be the number of rows?

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{nrow}\NormalTok{(rt)}
\end{Highlighting}
\end{Shaded}

\hypertarget{time-series-analysis}{%
\subsubsection{Time series analysis}\label{time-series-analysis}}

\begin{itemize}
\tightlist
\item
  Time series analysis
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(ggplot2, ggthemes, rtweet)}

\KeywordTok{ts\_plot}\NormalTok{(rt, }\StringTok{"3 hours"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\NormalTok{ggthemes}\OperatorTok{::}\KeywordTok{theme\_fivethirtyeight}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Frequency of Tweets about StopAsianHate from the Past Day"}\NormalTok{,}
       \DataTypeTok{subtitle =} \StringTok{"Tweet counts aggregated using three{-}hour intervals"}\NormalTok{,}
       \DataTypeTok{source =} \StringTok{"Twitter\textquotesingle{}s Search API via rtweet"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{geographical-analysis}{%
\subsubsection{Geographical analysis}\label{geographical-analysis}}

\begin{itemize}
\tightlist
\item
  Geographical analysis
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(maps)}

\NormalTok{geocoded \textless{}{-}}\StringTok{ }\KeywordTok{lat\_lng}\NormalTok{(rt)}

\NormalTok{maps}\OperatorTok{::}\KeywordTok{map}\NormalTok{(}\StringTok{"state"}\NormalTok{, }\DataTypeTok{lwd =} \FloatTok{.25}\NormalTok{) }\CommentTok{\# lwd = line width }
\KeywordTok{with}\NormalTok{(geocoded, }\KeywordTok{points}\NormalTok{(lng, lat))}
\end{Highlighting}
\end{Shaded}

\hypertarget{hydrating}{%
\subsection{Hydrating}\label{hydrating}}

\hypertarget{objectives-5}{%
\subsubsection{Objectives}\label{objectives-5}}

\begin{itemize}
\tightlist
\item
  Learning how hydrating works
\item
  Learning how to use \href{https://github.com/DocNow/twarc}{Twarc} to communicate with Twitter's API
\end{itemize}

\textbf{Review question}

What are the main two types of Twitter's API?

\hypertarget{hydrating-an-alternative-way-to-collect-historical-twitter-data}{%
\subsubsection{Hydrating: An Alternative Way to Collect Historical Twitter Data}\label{hydrating-an-alternative-way-to-collect-historical-twitter-data}}

\begin{itemize}
\item
  You can collect Twitter data using Twitter's API, or you can hydrate Tweet IDs collected by other researchers. This is an excellent resource to collect historical Twitter data.
\item
  \href{http://www.panacealab.org/covid19/}{Covid-19 Twitter chatter dataset for scientific use} by Panacealab
\item
  \href{https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5ZVMOR}{Women's March Dataset} by Littman and Park
\item
  Harvard Dataverse has many dehydrated Tweet IDs that could be of interest to social scientists.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://github.com/jaeyk/digital_data_collection_workshop/raw/master/misc/dehydrated_tweets.png}
\caption{Dehydrated Tweet IDs}
\end{figure}

\hypertarget{twarc-one-solution-to-almost-all-twitters-api-problems}{%
\subsubsection{Twarc: one solution to (almost) all Twitter's API problems}\label{twarc-one-solution-to-almost-all-twitters-api-problems}}

\begin{itemize}
\item
  Why Twarc?

  \begin{itemize}
  \item
    A command-line tool and Python library that works for almost every Twitter API-related problem.
  \item
    It's really well-documented, tested, and maintained.

    \begin{itemize}
    \tightlist
    \item
      \href{https://scholarslab.github.io/learn-twarc/06-twarc-command-basics}{Twarc documentation} covers basic commands.
    \item
      \href{https://twarc-cloud.readthedocs.io/_/downloads/en/stable/pdf/}{Twarc-cloud documentation} explains how to collect data from Twitter's API using Twarc running in \href{https://aws.amazon.com/}{Amazon Web Services} (AWS).
    \end{itemize}
  \item
    Twarc was developed as part of the \href{https://www.docnow.io/}{Documenting the Now} project, which the Mellon Foundation funded.
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://vignette.wikia.nocookie.net/lotr/images/8/8b/DOiAi2WUEAE3A1Y.0.jpg/revision/latest/scale-to-width-down/699?cb=20200305221819}
\caption{One ring that rules them all.}
\end{figure}

\begin{itemize}
\item
  There's no reason to be afraid of using a command-line tool and Python library, even though you primarily use R. It's easy to embed \href{https://bookdown.org/yihui/rmarkdown/language-engines.html\#python}{Python code} and \href{https://bookdown.org/yihui/rmarkdown/language-engines.html\#shell-scripts}{shell scripts} in R Markdown.
\item
  Even if you don't know how to write Python code or shell scripts, it's handy to learn how to integrate them into your R workflow.
\item
  I assume that you have already installed \href{https://www.python.org/download/releases/3.0/}{Python 3}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{pip3}\NormalTok{ install twarc}
\end{Highlighting}
\end{Shaded}

\hypertarget{applications-1}{%
\paragraph{Applications}\label{applications-1}}

The following examples are created by \href{http://digitalcollecting.lib.virginia.edu/toolkit/docs/social-media/twarc-commands/}{the University of Virginia library}.

\hypertarget{search}{%
\subparagraph{Search}\label{search}}

\begin{itemize}
\item
  Download pre-existing tweets (7-day window) matching certain conditions
\item
  In command-line, \texttt{\textgreater{}} = Create a file
\item
  I recommend running the following commands in the terminal because it's more stable than in R Markdown.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://github.com/jaeyk/digital_data_collection_workshop/raw/master/misc/terminal.png}
\caption{You can type commands in the Terminal in R Studio.}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Key word }
\ExtensionTok{twarc}\NormalTok{ search blacklivesmatter }\OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Hashtag }
\ExtensionTok{twarc}\NormalTok{ search }\StringTok{\textquotesingle{}\#blacklivesmatter\textquotesingle{}} \OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets\_hash.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Hashtag + Language }
\ExtensionTok{twarc}\NormalTok{ search }\StringTok{\textquotesingle{}\#blacklivesmatter\textquotesingle{}}\NormalTok{ {-}{-}lang en }\OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets\_hash.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  It is really important to \textbf{save these tweets into a \texttt{jsonl} format;} \texttt{jsonl} extension refers to JSON \textbf{Lines} files. This structure is useful for splitting JSON data into smaller chunks if it is too large.
\end{itemize}

\hypertarget{filter}{%
\subparagraph{Filter}\label{filter}}

\begin{itemize}
\tightlist
\item
  Download tweets meeting certain conditions as they happen.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Key word}
\ExtensionTok{twarc}\NormalTok{ filter blacklivesmatter }\OperatorTok{\textgreater{}}\NormalTok{ blm\_tweets.jsonl}
\end{Highlighting}
\end{Shaded}

\hypertarget{sample}{%
\subparagraph{Sample}\label{sample}}

\begin{itemize}
\tightlist
\item
  Use Twitter's random sample of recent tweets.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{twarc}\NormalTok{ sample }\OperatorTok{\textgreater{}}\NormalTok{ tweets.jsonl }
\end{Highlighting}
\end{Shaded}

\hypertarget{hydrate}{%
\subparagraph{Hydrate}\label{hydrate}}

\begin{itemize}
\tightlist
\item
  Tweet I.D.s -\textgreater{} Tweets
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{twarc}\NormalTok{ hydrate tweet\_ids.txt }\OperatorTok{\textgreater{}}\NormalTok{ tweets.jsonl }
\end{Highlighting}
\end{Shaded}

\hypertarget{dehydrate}{%
\subparagraph{Dehydrate}\label{dehydrate}}

\begin{itemize}
\tightlist
\item
  Hydrate \textless\textgreater{} Dehydrate
\item
  Tweets -\textgreater{} Tweet I.D.s
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\ExtensionTok{twarc}\NormalTok{ dehydrate tweets.jsonl }\OperatorTok{\textgreater{}}\NormalTok{ tweet\_ids.txt}
\end{Highlighting}
\end{Shaded}

\textbf{Challenge}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Collect tweets containing keywords of your choice using \texttt{twarc\ search} and save them as \texttt{tweets.jsonl}.
\item
  Using the \texttt{less} command in the terminal, inspect \texttt{twarc.log}.
\item
  Using the \texttt{less} command in the terminal, inspect \texttt{tweets.jsonl}.
\end{enumerate}

\hypertarget{parsing-json}{%
\subsection{Parsing JSON}\label{parsing-json}}

\hypertarget{objectives-6}{%
\subsubsection{Objectives}\label{objectives-6}}

\begin{itemize}
\tightlist
\item
  Learning the chunk and pull strategy
\item
  Learning how \texttt{tidyjson} works
\item
  Learning how to apply \texttt{tidyjson} to tweets
\end{itemize}

\hypertarget{chunk-and-pull}{%
\subsubsection{Chunk and Pull}\label{chunk-and-pull}}

\hypertarget{problem-3}{%
\paragraph{Problem}\label{problem-3}}

\begin{itemize}
\tightlist
\item
  What if the size of the Twitter data you downloaded is too big (e.g., \textgreater10 GB) to do complex wrangling in R?
\end{itemize}

\hypertarget{solution-2}{%
\paragraph{Solution}\label{solution-2}}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-07-01-3-big-data-paradigms-for-r_files/chunk_pull.png}
\caption{Chunk and Pull. From RStudio.}
\end{figure}

Step 1: Split the large JSON file into small chunks.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#Divide the JSON file by 100 lines (tweets)}

\CommentTok{\# Linux and Windows (in Bash)}
\NormalTok{$ }\FunctionTok{split}\NormalTok{ {-}100 search.jsonl}

\CommentTok{\# macOS}
\NormalTok{$ }\ExtensionTok{gsplit}\NormalTok{ {-}100 search.jsonl}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  After that, you will see several files appear in the directory. Each of these files should have 100 tweets or fewer. All of these file names \textbf{should start with ``x,'' as in ``xaa.''}
\end{itemize}

Step 2: Apply the parsing function to each chunk and pull all of these chunks together.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# You need to choose a Tweet JSON file}
\NormalTok{filepath \textless{}{-}}\StringTok{ }\KeywordTok{file.choose}\NormalTok{()}

\CommentTok{\# Assign the parsed result to the \textasciigrave{}df\textasciigrave{} object}
\CommentTok{\# 11.28 sec elapsed to parse 17,928 tweets }
\KeywordTok{tic}\NormalTok{()}
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{jsonl\_to\_df}\NormalTok{(filepath)}
\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Setup }
\NormalTok{n\_cores \textless{}{-}}\StringTok{ }\KeywordTok{availableCores}\NormalTok{() }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}

\NormalTok{n\_cores }\CommentTok{\# This number depends on your computer spec.}

\KeywordTok{plan}\NormalTok{(multiprocess, }\CommentTok{\# multicore, if supported, otherwise multisession}
     \DataTypeTok{workers =}\NormalTok{ n\_cores) }\CommentTok{\# the maximum number of workers}

\CommentTok{\# You need to designate a directory path where you saved the list of JSON files.}

\CommentTok{\# 9.385 sec elapsed to parse 17,928 tweets }

\NormalTok{dirpath \textless{}{-}}\StringTok{ }\NormalTok{tcltk}\OperatorTok{::}\KeywordTok{tk\_choose.dir}\NormalTok{()}

\KeywordTok{tic}\NormalTok{()}
\NormalTok{df\_all \textless{}{-}}\StringTok{ }\NormalTok{tidytweetjson}\OperatorTok{::}\KeywordTok{jsonl\_to\_df\_all}\NormalTok{(dirpath)}
\KeywordTok{toc}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\hypertarget{tidyjson}{%
\paragraph{tidyjson}\label{tidyjson}}

The \href{https://cran.r-project.org/web/packages/tidyjson/vignettes/introduction-to-tidyjson.html}{\texttt{tidyjson}} package helps apply the tidyverse framework to JSON data.

\begin{itemize}
\tightlist
\item
  toy example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# JSON collection; nested structure + keys and values }
\NormalTok{worldbank[}\DecValTok{1}\NormalTok{]}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "{\"_id\":{\"$oid\":\"52b213b38594d8a2be17c780\"},\"boardapprovaldate\":\"2013-11-12T00:00:00Z\",\"closingdate\":\"2018-07-07T00:00:00Z\",\"countryshortname\":\"Ethiopia\",\"majorsector_percent\":[{\"Name\":\"Education\",\"Percent\":46},{\"Name\":\"Education\",\"Percent\":26},{\"Name\":\"Public Administration, Law, and Justice\",\"Percent\":16},{\"Name\":\"Education\",\"Percent\":12}],\"project_name\":\"Ethiopia General Education Quality Improvement Project II\",\"regionname\":\"Africa\",\"totalamt\":130000000}"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Check out keys (objects)}
\NormalTok{worldbank }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{as.tbl\_json}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{gather\_object}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(document.id }\OperatorTok{==}\StringTok{ }\DecValTok{1}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tbl_json: 8 x 3 tibble with a "JSON" attribute
##   ..JSON                  document.id name               
##   <chr>                         <int> <chr>              
## 1 "{\"$oid\":\"52b213..."           1 _id                
## 2 "\"2013-11-12T00:..."             1 boardapprovaldate  
## 3 "\"2018-07-07T00:..."             1 closingdate        
## 4 "\"Ethiopia\""                    1 countryshortname   
## 5 "[{\"Name\":\"Educa..."           1 majorsector_percent
## 6 "\"Ethiopia Gener..."             1 project_name       
## 7 "\"Africa\""                      1 regionname         
## 8 "130000000"                       1 totalamt
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Get the values associated with the keys }
\NormalTok{worldbank }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{as.tbl\_json}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Turn JSON into tbl\_json object }
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"project\_name"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Enter the objects }
\StringTok{  }\KeywordTok{append\_values\_string}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Append the values }
\StringTok{  }\KeywordTok{as\_tibble}\NormalTok{() }\CommentTok{\# To reduce the size of the file }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 500 x 2
##    document.id string                                                           
##          <int> <chr>                                                            
##  1           1 Ethiopia General Education Quality Improvement Project II        
##  2           2 TN: DTF Social Protection Reforms Support                        
##  3           3 Tuvalu Aviation Investment Project - Additional Financing        
##  4           4 Gov't and Civil Society Organization Partnership                 
##  5           5 Second Private Sector Competitiveness and Economic Diversificati~
##  6           6 Additional Financing for Cash Transfers for Orphans and Vulnerab~
##  7           7 National Highways Interconnectivity Improvement Project          
##  8           8 China Renewable Energy Scale-Up Program Phase II                 
##  9           9 Rajasthan Road Sector Modernization Project                      
## 10          10 MA Accountability and Transparency DPL                           
## # ... with 490 more rows
\end{verbatim}

\begin{itemize}
\tightlist
\item
  The following example draws on my \href{https://github.com/jaeyk/tidytweetjson}{tidytweetjson} R package. The package applies \texttt{tidyjson} to Tweets.
\end{itemize}

\hypertarget{individual-file}{%
\subparagraph{Individual file}\label{individual-file}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{jsonl\_to\_df \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(file\_path)\{}

\CommentTok{\# Save file name }

\NormalTok{file\_name \textless{}{-}}\StringTok{ }\KeywordTok{strsplit}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ file\_path, }
                     \DataTypeTok{split =} \StringTok{"[/]"}\NormalTok{) }

\NormalTok{file\_name \textless{}{-}}\StringTok{ }\NormalTok{file\_name[[}\DecValTok{1}\NormalTok{]][}\KeywordTok{length}\NormalTok{(file\_name[[}\DecValTok{1}\NormalTok{]])]}

\CommentTok{\# Import a Tweet JSON file}

\NormalTok{listed \textless{}{-}}\StringTok{ }\KeywordTok{read\_json}\NormalTok{(file\_path, }\DataTypeTok{format =} \KeywordTok{c}\NormalTok{(}\StringTok{"jsonl"}\NormalTok{))}

\CommentTok{\# IDs of the tweets with country codes}

\NormalTok{ccodes \textless{}{-}}\StringTok{ }\NormalTok{listed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"place"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"country\_code"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{append\_values\_string}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{as\_tibble}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{rename}\NormalTok{(}\StringTok{"country\_code"}\NormalTok{ =}\StringTok{ "string"}\NormalTok{)}

\CommentTok{\# IDs of the tweets with location}

\NormalTok{locations \textless{}{-}}\StringTok{ }\NormalTok{listed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"user"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{enter\_object}\NormalTok{(}\StringTok{"location"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{append\_values\_string}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{as\_tibble}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{location =} \StringTok{"string"}\NormalTok{)}

\CommentTok{\# Extract other key elements from the JSON file}

\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{listed }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{spread\_values}\NormalTok{(}
    \DataTypeTok{id =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"id"}\NormalTok{),}
    \DataTypeTok{created\_at =} \KeywordTok{jstring}\NormalTok{(}\StringTok{"created\_at"}\NormalTok{),}
    \DataTypeTok{full\_text =} \KeywordTok{jstring}\NormalTok{(}\StringTok{"full\_text"}\NormalTok{),}
    \DataTypeTok{retweet\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"retweet\_count"}\NormalTok{),}
    \DataTypeTok{favorite\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"favorite\_count"}\NormalTok{),}
    \DataTypeTok{user.followers\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"user.followers\_count"}\NormalTok{),}
    \DataTypeTok{user.friends\_count =} \KeywordTok{jnumber}\NormalTok{(}\StringTok{"user.friends\_count"}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{      }\NormalTok{as\_tibble}

\KeywordTok{message}\NormalTok{(}\KeywordTok{paste}\NormalTok{(}\StringTok{"Parsing"}\NormalTok{, file\_name, }\StringTok{"done."}\NormalTok{))}

\CommentTok{\# Full join}
\NormalTok{outcome \textless{}{-}}\StringTok{ }\KeywordTok{full\_join}\NormalTok{(ccodes, df) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{full\_join}\NormalTok{(locations)}

\CommentTok{\# Or you can write this way: outcome \textless{}{-} reduce(list(df, ccodes, locations), full\_join)}

\CommentTok{\# Select}
\NormalTok{outcome }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{select}\NormalTok{(}\OperatorTok{{-}}\KeywordTok{c}\NormalTok{(}\StringTok{"document.id"}\NormalTok{))\}}
\end{Highlighting}
\end{Shaded}

\hypertarget{many-files}{%
\subparagraph{Many files}\label{many-files}}

\begin{itemize}
\tightlist
\item
  Set up parallel processing.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{n\_cores \textless{}{-}}\StringTok{ }\KeywordTok{availableCores}\NormalTok{() }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}

\NormalTok{n\_cores }\CommentTok{\# This number depends on your computer spec.}

\KeywordTok{plan}\NormalTok{(multiprocess, }\CommentTok{\# multicore, if supported, otherwise multisession}
     \DataTypeTok{workers =}\NormalTok{ n\_cores) }\CommentTok{\# the maximum number of workers}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Parsing in parallel.
\end{itemize}

\textbf{Review}

There are at least three ways to use a function with \texttt{purrr::map()}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{squared \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(x)\{}
\NormalTok{  x}\OperatorTok{*}\DecValTok{2} 
\NormalTok{\}}

\CommentTok{\# Named function }
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{, squared)}

\CommentTok{\# Anonymous function }
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{, }\ControlFlowTok{function}\NormalTok{(x)\{ x }\OperatorTok{*}\DecValTok{2}\NormalTok{ \})}

\CommentTok{\# Using formula; \textasciitilde{} = formula, .x = input }
\KeywordTok{map}\NormalTok{(}\DecValTok{1}\OperatorTok{:}\DecValTok{3}\NormalTok{,}\OperatorTok{\textasciitilde{}}\NormalTok{.x}\OperatorTok{*}\DecValTok{2}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a list of file paths }
\NormalTok{filename \textless{}{-}}\StringTok{ }\KeywordTok{list.files}\NormalTok{(dir\_path,}
          \DataTypeTok{pattern =} \StringTok{\textquotesingle{}\^{}x\textquotesingle{}}\NormalTok{,}
          \DataTypeTok{full.names =} \OtherTok{TRUE}\NormalTok{)}

\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{filename }\OperatorTok{\%\textgreater{}\%}

\CommentTok{\# Apply jsonl\_to\_df function to items on the list}
\KeywordTok{future\_map}\NormalTok{(}\OperatorTok{\textasciitilde{}}\KeywordTok{jsonl\_to\_df}\NormalTok{(.)) }\OperatorTok{\%\textgreater{}\%}

\CommentTok{\# Full join the list of dataframes}
\KeywordTok{reduce}\NormalTok{(full\_join,}
       \DataTypeTok{by =} \KeywordTok{c}\NormalTok{(}\StringTok{"id"}\NormalTok{,}
              \StringTok{"location"}\NormalTok{,}
              \StringTok{"country\_code"}\NormalTok{,}
              \StringTok{"created\_at"}\NormalTok{,}
              \StringTok{"full\_text"}\NormalTok{,}
              \StringTok{"retweet\_count"}\NormalTok{,}
              \StringTok{"favorite\_count"}\NormalTok{,}
              \StringTok{"user.followers\_count"}\NormalTok{,}
              \StringTok{"user.friends\_count"}\NormalTok{))}

\CommentTok{\# Output}
\NormalTok{df}
\end{Highlighting}
\end{Shaded}

\textbf{rtweet and twarc}

\begin{itemize}
\item
  The main difference is using RStudio vs.~the terminal.
\item
  The difference matters when your data size is large. For example, suppose the size of the Twitter data you downloaded is 10 GB. R/RStudio might have a hard time dealing with this size of data. Then, how can you wrangle this data size in a complex way using R?
\end{itemize}

\hypertarget{getting-api-data-from-scratch}{%
\subsection{Getting API data from scratch}\label{getting-api-data-from-scratch}}

Load packages. For the connection interface, I strongly recommend using \texttt{httr} rather than \texttt{RCurl}. The following code examples draw from my R interface for the New York Times API called \href{https://jaeyk.github.io/rnytapi/}{\texttt{rnytapi}}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(httr, jsonlite, purrr, glue)}
\end{Highlighting}
\end{Shaded}

\hypertarget{form-request}{%
\subsubsection{Form REQUEST}\label{form-request}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{get\_request \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(term, begin\_date, end\_date, key, }\DataTypeTok{page =} \DecValTok{1}\NormalTok{) \{}

\NormalTok{    out \textless{}{-}}\StringTok{ }\KeywordTok{GET}\NormalTok{(}\StringTok{"http://api.nytimes.com/svc/search/v2/articlesearch.json"}\NormalTok{,}
        \DataTypeTok{query =} \KeywordTok{list}\NormalTok{(}\StringTok{\textquotesingle{}q\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{term,}
                     \StringTok{\textquotesingle{}begin\_date\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{begin\_date,}
                     \StringTok{\textquotesingle{}end\_date\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{end\_date,}
                     \StringTok{\textquotesingle{}api{-}key\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{key,}
                     \StringTok{\textquotesingle{}page\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{page))}

    \KeywordTok{return}\NormalTok{(out)}

\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\hypertarget{extract-data}{%
\subsubsection{Extract data}\label{extract-data}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{get\_content \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(term, begin\_date, end\_date, key, }\DataTypeTok{page =} \DecValTok{1}\NormalTok{) \{}

    \KeywordTok{message}\NormalTok{(}\KeywordTok{glue}\NormalTok{(}\StringTok{"Scraping page \{page\}"}\NormalTok{))}

    \KeywordTok{fromJSON}\NormalTok{(}\KeywordTok{content}\NormalTok{(}\KeywordTok{get\_request}\NormalTok{(term, begin\_date, end\_date, key, page),}
                     \StringTok{"text"}\NormalTok{,}
                \DataTypeTok{encoding =} \StringTok{"UTF{-}8"}\NormalTok{),}
                \DataTypeTok{simplifyDataFrame =} \OtherTok{TRUE}\NormalTok{, }\DataTypeTok{flatten =} \OtherTok{TRUE}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.data.frame}\NormalTok{()}

\NormalTok{\}}
\end{Highlighting}
\end{Shaded}

\hypertarget{automating-iterations}{%
\subsubsection{Automating iterations}\label{automating-iterations}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{extract\_all \textless{}{-}}\StringTok{ }\ControlFlowTok{function}\NormalTok{(term, begin\_date, end\_date, key) \{}

\NormalTok{    request \textless{}{-}}\StringTok{ }\KeywordTok{GET}\NormalTok{(}\StringTok{"http://api.nytimes.com/svc/search/v2/articlesearch.json"}\NormalTok{,}
                   \DataTypeTok{query =} \KeywordTok{list}\NormalTok{(}\StringTok{\textquotesingle{}q\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{term,}
                                \StringTok{\textquotesingle{}begin\_date\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{begin\_date,}
                                \StringTok{\textquotesingle{}end\_date\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{end\_date,}
                                \StringTok{\textquotesingle{}api{-}key\textquotesingle{}}\NormalTok{ =}\StringTok{ }\NormalTok{key))}

\NormalTok{    max\_pages \textless{}{-}}\StringTok{ }\NormalTok{(}\KeywordTok{round}\NormalTok{(}\KeywordTok{content}\NormalTok{(request)}\OperatorTok{$}\NormalTok{response}\OperatorTok{$}\NormalTok{meta}\OperatorTok{$}\NormalTok{hits[}\DecValTok{1}\NormalTok{] }\OperatorTok{/}\StringTok{ }\DecValTok{10}\NormalTok{) }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}\NormalTok{)}

    \KeywordTok{message}\NormalTok{(}\KeywordTok{glue}\NormalTok{(}\StringTok{"The total number of pages is \{max\_pages\}"}\NormalTok{))}

\NormalTok{    iter \textless{}{-}}\StringTok{ }\DecValTok{0}\OperatorTok{:}\NormalTok{max\_pages}

\NormalTok{    arg\_list \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}\KeywordTok{rep}\NormalTok{(term, }\DataTypeTok{times =} \KeywordTok{length}\NormalTok{(iter)),}
                     \KeywordTok{rep}\NormalTok{(begin\_date, }\DataTypeTok{times =} \KeywordTok{length}\NormalTok{(iter)),}
                     \KeywordTok{rep}\NormalTok{(end\_date, }\DataTypeTok{times =} \KeywordTok{length}\NormalTok{(iter)),}
                     \KeywordTok{rep}\NormalTok{(key, }\DataTypeTok{times =} \KeywordTok{length}\NormalTok{(iter)),}
\NormalTok{                     iter}
\NormalTok{                     )}

\NormalTok{    out \textless{}{-}}\StringTok{ }\KeywordTok{pmap\_dfr}\NormalTok{(arg\_list, }\KeywordTok{slowly}\NormalTok{(get\_content,}
                                     \CommentTok{\# 6 seconds sleep is the default requirement.}
                                     \DataTypeTok{rate =} \KeywordTok{rate\_delay}\NormalTok{(}
                                         \DataTypeTok{pause =} \DecValTok{6}\NormalTok{,}
                                         \DataTypeTok{max\_times =} \DecValTok{4000}\NormalTok{)))}

    \KeywordTok{return}\NormalTok{(out)}

\NormalTok{    \}}
\end{Highlighting}
\end{Shaded}

\hypertarget{machine_learning}{%
\chapter{High-dimensional data}\label{machine_learning}}

\hypertarget{the-big-picture-8}{%
\section{The Big Picture}\label{the-big-picture-8}}

\begin{itemize}
\item
  The rise of high-dimensional data. The new data frontiers in social sciences---text (\href{https://web.stanford.edu/~gentzkow/research/text-as-data.pdf}{Gentzkow et al.~2019}; \href{https://www.jstor.org/stable/pdf/24572662.pdf?casa_token=SQdSI4R_VdwAAAAA:4QiVLhCXqr9f0qNMM9U75EL5JbDxxnXxUxyIfDf0U8ZzQx9szc0xVqaU6DXG4nHyZiNkvcwGlgD6H0Lxj3y0ULHwgkf1MZt8-9TPVtkEH9I4AHgbTg}{Grimmer and Stewart 2013}) and images (\href{https://arxiv.org/pdf/1810.01544}{Joo and Steinert-Threlkeld 2018})---are all high-dimensional data.

  \begin{itemize}
  \item
    1000 common English words for 30-word tweets: \(1000^{30}\) similar to N of atoms in the universe (\href{https://web.stanford.edu/~gentzkow/research/text-as-data.pdf}{Gentzkow et al.~2019})
  \item
    Belloni, Alexandre, Victor Chernozhukov, and Christian Hansen. \href{https://pubs.aeaweb.org/doi/pdfplus/10.1257/jep.28.2.29}{``High-dimensional methods and inference on structural and treatment effects.''} \emph{Journal of Economic Perspectives 28}, no. 2 (2014): 29-50.
  \end{itemize}
\item
  The rise of the new approach: statistics + computer science = machine learning
\item
  Statistical inference

  \begin{itemize}
  \item
    \(y\) \textless- some probability models (e.g., linear regression, logistic regression) \textless- \(x\)
  \item
    \(y\) = \(X\beta\) + \(\epsilon\)
  \item
    The goal is to estimate \(\beta\)
  \end{itemize}
\item
  Machine learning

  \begin{itemize}
  \item
    \(y\) \textless- unknown \textless- \(x\)
  \item
    \(y\) \textless-\textgreater{} decision trees, neural nets \textless-\textgreater{} \(x\)
  \item
    For the main idea behind prediction modeling, see Breiman, Leo (Berkeley stat faculty who passed away in 2005). \href{https://projecteuclid.org/euclid.ss/1009213726}{``Statistical modeling: The two cultures (with comments and a rejoinder by the author).''} \emph{Statistical science} 16, no. 3 (2001): 199-231.
  \item
    ``The problem is to find an algorithm \(f(x)\) such that for future \(x\) in a test set, \(f(x)\) will be a good predictor of \(y\).''
  \item
    ``There are \textbf{two cultures} in the use of statistical modeling to reach conclusions from data. One assumes that the data are generated by a \textbf{given} \textbf{stochastic data model}. The other uses \textbf{algorithmic models} and treats the data mechanism as \textbf{unknown}.''
  \end{itemize}
\item
  How does ML differ from econometrics?
\item
  A review by Athey, Susan, and Guido W. Imbens. \href{https://www.annualreviews.org/doi/full/10.1146/annurev-economics-080217-053433}{``Machine learning methods that economists should know about.''} \emph{Annual Review of Economics} 11 (2019): 685-725.
\item
  Stat:

  \begin{itemize}
  \item
    Specifying a target (i.e., an estimand)
  \item
    Fitting a model to data using an objective function (e.g., the sum of squared errors)
  \item
    Reporting point estimates (effect size) and standard errors (uncertainty)
  \item
    Validation by yes-no using goodness-of-fit tests and residual examination
  \end{itemize}
\item
  ML:

  \begin{itemize}
  \item
    Developing algorithms (estimating \emph{f(x)})
  \item
    Prediction power, not structural/causal parameters
  \item
    Basically, high-dimensional data statistics (N \textless{} P)
  \item
    The major problem is to avoid \href{https://en.wikipedia.org/wiki/Curse_of_dimensionality}{``the curse of dimensionality''} (\href{https://towardsdatascience.com/the-curse-of-dimensionality-50dc6e49aa1e}{too many features -\textgreater{} overfitting})
  \item
    Validation: out-of-sample comparisons (cross-validation) not in-sample goodness-of-fit measures
  \item
    So, it's curve-fitting, but the primary focus is unseen data (test data), not seen data (training data)
  \end{itemize}
\item
  A quick review of ML lingo for those trained in econometrics

  \begin{itemize}
  \item
    Sample to estimate parameters = Training sample
  \item
    Estimating the model = Being trained
  \item
    Regressors, covariates, or predictors = Features
  \item
    Regression parameters = weights
  \item
    Prediction problems = Supervised (some \(y\) are known) + Unsupervised (\(y\) unknown)
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7w9.jpg}
\caption{How to teach machines. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}. Many images in this chapter come from vas3k blog.}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7vz.jpg}
\caption{The main types of machine learning. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7vx.jpg}
\caption{The map of the machine learning universe. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}}
\end{figure}

\begin{figure}
\centering
\includegraphics{https://i.vas3k.ru/7w1.jpg}
\caption{Classical machine learning. Based on \href{https://vas3k.com/blog/machine_learning/}{vas3k blog}}
\end{figure}

\hypertarget{dataset}{%
\section{Dataset}\label{dataset}}

\begin{itemize}
\item
  \href{https://archive.ics.uci.edu/ml/datasets/heart+Disease}{Heart disease data from UCI}
\item
  One of the popular datasets used in machine learning competitions
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Load packages}

\CommentTok{\#\# CRAN packages}
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  here,}
\NormalTok{  tidyverse,}
\NormalTok{  tidymodels,}
\NormalTok{  doParallel, }\CommentTok{\# parallel processing}
\NormalTok{  patchwork, }\CommentTok{\# arranging ggplots}
\NormalTok{  remotes,}
\NormalTok{  SuperLearner,}
\NormalTok{  vip,}
\NormalTok{  tidymodels,}
\NormalTok{  glmnet,}
\NormalTok{  xgboost,}
\NormalTok{  rpart,}
\NormalTok{  ranger,}
\NormalTok{  conflicted}
\NormalTok{)}

\NormalTok{remotes}\OperatorTok{::}\KeywordTok{install\_github}\NormalTok{(}\StringTok{"ck37/ck37r"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Skipping install of 'ck37r' from a github remote, the SHA1 (87085fff) has not changed since last install.
##   Use `force = TRUE` to force installation
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{conflicted}\OperatorTok{::}\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"filter"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Will prefer dplyr::filter over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\# Jae\textquotesingle{}s custom functions}
\KeywordTok{source}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"functions"}\NormalTok{, }\StringTok{"ml\_utils.r"}\NormalTok{))}

\CommentTok{\# Import the dataset}

\NormalTok{data\_original \textless{}{-}}\StringTok{ }\KeywordTok{read\_csv}\NormalTok{(}\KeywordTok{here}\NormalTok{(}\StringTok{"data"}\NormalTok{, }\StringTok{"heart.csv"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Rows: 303 Columns: 14
## -- Column specification ---------------------------------------------------------
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glimpse}\NormalTok{(data\_original)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Rows: 303
## Columns: 14
## $ age      <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58, 5~
## $ sex      <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1~
## $ cp       <dbl> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3, 0~
## $ trestbps <dbl> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130, 1~
## $ chol     <dbl> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275, 2~
## $ fbs      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0~
## $ restecg  <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1~
## $ thalach  <dbl> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139, 1~
## $ exang    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~
## $ oldpeak  <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2, 0~
## $ slope    <dbl> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 1~
## $ ca       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0~
## $ thal     <dbl> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3~
## $ target   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Create a copy}
\NormalTok{data \textless{}{-}}\StringTok{ }\NormalTok{data\_original}

\KeywordTok{theme\_set}\NormalTok{(}\KeywordTok{theme\_minimal}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\hypertarget{workflow-3}{%
\section{Workflow}\label{workflow-3}}

\begin{itemize}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \tightlist
  \item
    Preprocessing
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{1}
  \tightlist
  \item
    Model building
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{2}
  \tightlist
  \item
    Model fitting
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{3}
  \tightlist
  \item
    Model evaluation
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{4}
  \tightlist
  \item
    Model tuning
  \end{enumerate}
\item
  \begin{enumerate}
  \def\labelenumi{\arabic{enumi}.}
  \setcounter{enumi}{5}
  \tightlist
  \item
    Prediction
  \end{enumerate}
\end{itemize}

\hypertarget{tidymodels}{%
\section{tidymodels}\label{tidymodels}}

\begin{itemize}
\item
  Like \texttt{tidyverse}, \texttt{tidymodels} is a collection of packages.

  \begin{itemize}
  \item
    \href{https://rsample.tidymodels.org/}{\texttt{rsample}}: for data splitting
  \item
    \href{https://recipes.tidymodels.org/index.html}{\texttt{recipes}}: for pre-processing
  \item
    \href{https://www.tidyverse.org/blog/2018/11/parsnip-0-0-1/}{\texttt{parsnip}}: for model building

    \begin{itemize}
    \tightlist
    \item
      \href{https://github.com/tidymodels/tune}{\texttt{tune}}: hyperparameter tuning
    \end{itemize}
  \item
    \href{https://github.com/tidymodels/yardstick}{\texttt{yardstick}}: for model evaluations
  \item
    \href{https://github.com/tidymodels/workflows}{\texttt{workflows}}: for building a pipeline that bundles together preprocessing, modeling, and post-processing requests
  \end{itemize}
\item
  Why take a tidyverse approach to machine learning?
\item
  Benefits

  \begin{itemize}
  \item
    Readable code
  \item
    Reusable data structures
  \item
    Extendable code
  \end{itemize}
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://rviews.rstudio.com/post/2019-06-14-a-gentle-intro-to-tidymodels_files/figure-html/ds.png}
\caption{Tidymodels. From RStudio.}
\end{figure}

\begin{quote}
tidymodels are an \textbf{integrated, modular, extensible} set of packages that implement a framework that facilitates creating predicative stochastic models. --- Joseph Rickert@RStudio
\end{quote}

\begin{itemize}
\item
  Currently, 238 models are \href{https://topepo.github.io/caret/available-models.html}{available}
\item
  The following materials are based on \href{https://github.com/dlab-berkeley/Machine-Learning-with-tidymodels}{the machine learning with tidymodels workshop} I developed for D-Lab. \href{https://github.com/dlab-berkeley/Machine-Learning-in-R}{The original workshop} was designed by \href{https://ck37.com/}{Chris Kennedy} and \href{https://dlab.berkeley.edu/people/evan-muzzall}{Evan Muzzall}.
\end{itemize}

\hypertarget{pre-processing}{%
\section{Pre-processing}\label{pre-processing}}

\begin{itemize}
\item
  \href{https://recipes.tidymodels.org/index.html}{\texttt{recipes}}: for pre-processing
\item
  \href{https://github.com/tidymodels/textrecipes}{\texttt{textrecipes}} for text pre-processing
\item
  Step 1: \texttt{recipe()} defines target and predictor variables (ingredients).
\item
  Step 2: \texttt{step\_*()} defines preprocessing steps to be taken (recipe).

  The preprocessing steps list draws on the vignette of the \href{https://www.tidymodels.org/find/parsnip/}{\texttt{parsnip}} package.

  \begin{itemize}
  \item
    dummy: Also called one-hot encoding
  \item
    zero variance: Removing columns (or features) with a single unique value
  \item
    impute: Imputing missing values
  \item
    decorrelate: Mitigating correlated predictors (e.g., principal component analysis)
  \item
    normalize: Centering and/or scaling predictors (e.g., log scaling). Scaling matters because many algorithms (e.g., lasso) are scale-variant (except tree-based algorithms). Recall that normalization (sensitive to outliers) = \(\frac{X - X_{\min}}{X_{\max} - X_{\min}}\) and standardization (not sensitive to outliers) = \(\frac{X - \mu}{\sigma}\).
  \item
    transform: Making predictors symmetric
  \end{itemize}
\item
  Step 3: \texttt{prep()} prepares a dataset to base each step on.
\item
  Step 4: \texttt{bake()} applies the preprocessing steps to your datasets.
\end{itemize}

In this course, we focus on two preprocessing tasks.

\begin{itemize}
\tightlist
\item
  One-hot encoding (creating dummy/indicator variables)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Turn selected numeric variables into factor variables}
\NormalTok{data \textless{}{-}}\StringTok{ }\NormalTok{data }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{mutate}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"sex"}\NormalTok{, }\StringTok{"ca"}\NormalTok{, }\StringTok{"cp"}\NormalTok{, }\StringTok{"slope"}\NormalTok{, }\StringTok{"thal"}\NormalTok{), as.factor))}

\KeywordTok{glimpse}\NormalTok{(data)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Rows: 303
## Columns: 14
## $ age      <dbl> 63, 37, 41, 56, 57, 57, 56, 44, 52, 57, 54, 48, 49, 64, 58, 5~
## $ sex      <fct> 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1~
## $ cp       <fct> 3, 2, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 1, 3, 3, 2, 2, 3, 0, 3, 0~
## $ trestbps <dbl> 145, 130, 130, 120, 120, 140, 140, 120, 172, 150, 140, 130, 1~
## $ chol     <dbl> 233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275, 2~
## $ fbs      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0~
## $ restecg  <dbl> 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1~
## $ thalach  <dbl> 150, 187, 172, 178, 163, 148, 153, 173, 162, 174, 160, 139, 1~
## $ exang    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~
## $ oldpeak  <dbl> 2.3, 3.5, 1.4, 0.8, 0.6, 0.4, 1.3, 0.0, 0.5, 1.6, 1.2, 0.2, 0~
## $ slope    <fct> 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 1~
## $ ca       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0~
## $ thal     <fct> 1, 2, 2, 2, 2, 1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3~
## $ target   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Imputation
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Check missing values}

\KeywordTok{map\_df}\NormalTok{(data, }\OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{is.na}\NormalTok{(.) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{sum}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <int> <int> <int>    <int> <int> <int>   <int>   <int> <int>   <int> <int>
## 1     0     0     0        0     0     0       0       0     0       0     0
## # ... with 3 more variables: ca <int>, thal <int>, target <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add missing values}

\NormalTok{data}\OperatorTok{$}\NormalTok{oldpeak[}\KeywordTok{sample}\NormalTok{(}\KeywordTok{seq}\NormalTok{(data), }\DataTypeTok{size =} \DecValTok{10}\NormalTok{)] \textless{}{-}}\StringTok{ }\OtherTok{NA}

\CommentTok{\# Check missing values}

\CommentTok{\# Check the number of missing values}
\NormalTok{data }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{map\_df}\NormalTok{(}\OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{is.na}\NormalTok{(.) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{sum}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <int> <int> <int>    <int> <int> <int>   <int>   <int> <int>   <int> <int>
## 1     0     0     0        0     0     0       0       0     0      10     0
## # ... with 3 more variables: ca <int>, thal <int>, target <int>
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Check the rate of missing values}
\NormalTok{data }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{map\_df}\NormalTok{(}\OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{is.na}\NormalTok{(.) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{mean}\NormalTok{())}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
## 1     0     0     0        0     0     0       0       0     0  0.0330     0
## # ... with 3 more variables: ca <dbl>, thal <dbl>, target <dbl>
\end{verbatim}

\hypertarget{regression-setup}{%
\subsection{Regression setup}\label{regression-setup}}

\hypertarget{outcome-variable}{%
\subsubsection{Outcome variable}\label{outcome-variable}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Continuous variable}
\NormalTok{data}\OperatorTok{$}\NormalTok{age }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{class}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\hypertarget{data-splitting-using-random-sampling}{%
\subsubsection{Data splitting using random sampling}\label{data-splitting-using-random-sampling}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# for reproducibility}
\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{)}

\CommentTok{\# split}
\NormalTok{split\_reg \textless{}{-}}\StringTok{ }\KeywordTok{initial\_split}\NormalTok{(data, }\DataTypeTok{prop =} \FloatTok{0.7}\NormalTok{)}

\CommentTok{\# training set}
\NormalTok{raw\_train\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{training}\NormalTok{(split\_reg)}

\CommentTok{\# test set}
\NormalTok{raw\_test\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{testing}\NormalTok{(split\_reg)}
\end{Highlighting}
\end{Shaded}

\hypertarget{recipe}{%
\subsubsection{recipe}\label{recipe}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Regression recipe}
\NormalTok{rec\_reg \textless{}{-}}\StringTok{ }\NormalTok{raw\_train\_x\_reg }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Define the outcome variable}
\StringTok{  }\KeywordTok{recipe}\NormalTok{(age }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Median impute oldpeak column}
\StringTok{  }\KeywordTok{step\_medianimpute}\NormalTok{(oldpeak) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Expand "sex", "ca", "cp", "slope", and "thal" features out into dummy variables (indicators).}
\StringTok{  }\KeywordTok{step\_dummy}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"sex"}\NormalTok{, }\StringTok{"ca"}\NormalTok{, }\StringTok{"cp"}\NormalTok{, }\StringTok{"slope"}\NormalTok{, }\StringTok{"thal"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: `step_medianimpute()` was deprecated in recipes 0.1.16.
## Please use `step_impute_median()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Prepare a dataset to base each step on}
\NormalTok{prep\_reg \textless{}{-}}\StringTok{ }\NormalTok{rec\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{prep}\NormalTok{(}\DataTypeTok{retain =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# x features}
\NormalTok{train\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_reg, }\KeywordTok{all\_predictors}\NormalTok{())}

\NormalTok{test\_x\_reg \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(}
  \DataTypeTok{object =}\NormalTok{ prep\_reg,}
  \DataTypeTok{new\_data =}\NormalTok{ raw\_test\_x\_reg, }\KeywordTok{all\_predictors}\NormalTok{()}
\NormalTok{)}

\CommentTok{\# y variables}
\NormalTok{train\_y\_reg \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_reg, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{age }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.numeric}\NormalTok{()}
\NormalTok{test\_y\_reg \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(prep\_reg, raw\_test\_x\_reg, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{age }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.numeric}\NormalTok{()}

\CommentTok{\# Checks}
\KeywordTok{names}\NormalTok{(train\_x\_reg) }\CommentTok{\# Make sure there\textquotesingle{}s no age variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "trestbps" "chol"     "fbs"      "restecg"  "thalach"  "exang"   
##  [7] "oldpeak"  "target"   "sex_X1"   "ca_X1"    "ca_X2"    "ca_X3"   
## [13] "ca_X4"    "cp_X1"    "cp_X2"    "cp_X3"    "slope_X1" "slope_X2"
## [19] "thal_X1"  "thal_X2"  "thal_X3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(train\_y\_reg) }\CommentTok{\# Make sure this is a continuous variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Note that other imputation methods are also available.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{grep}\NormalTok{(}\StringTok{"impute"}\NormalTok{, }\KeywordTok{ls}\NormalTok{(}\StringTok{"package:recipes"}\NormalTok{), }\DataTypeTok{value =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "step_bagimpute"     "step_impute_bag"    "step_impute_knn"   
##  [4] "step_impute_linear" "step_impute_lower"  "step_impute_mean"  
##  [7] "step_impute_median" "step_impute_mode"   "step_impute_roll"  
## [10] "step_knnimpute"     "step_lowerimpute"   "step_meanimpute"   
## [13] "step_medianimpute"  "step_modeimpute"    "step_rollimpute"
\end{verbatim}

\begin{itemize}
\tightlist
\item
  You can also create your own \texttt{step\_} functions. For more information, see \href{https://www.tidymodels.org/learn/develop/recipes/}{tidymodels.org}.
\end{itemize}

\hypertarget{classification-setup}{%
\subsection{Classification setup}\label{classification-setup}}

\hypertarget{outcome-variable-1}{%
\subsubsection{Outcome variable}\label{outcome-variable-1}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{class}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "numeric"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data}\OperatorTok{$}\NormalTok{target \textless{}{-}}\StringTok{ }\KeywordTok{as.factor}\NormalTok{(data}\OperatorTok{$}\NormalTok{target)}

\NormalTok{data}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{class}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "factor"
\end{verbatim}

\hypertarget{data-splitting-using-stratified-random-sampling}{%
\subsubsection{Data splitting using stratified random sampling}\label{data-splitting-using-stratified-random-sampling}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# split}
\NormalTok{split\_class \textless{}{-}}\StringTok{ }\KeywordTok{initial\_split}\NormalTok{(data }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{target =} \KeywordTok{as.factor}\NormalTok{(target)),}
\DataTypeTok{prop =} \FloatTok{0.7}\NormalTok{,}
\DataTypeTok{strata =}\NormalTok{ target}
\NormalTok{)}

\CommentTok{\# training set}
\NormalTok{raw\_train\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{training}\NormalTok{(split\_class)}

\CommentTok{\# testing set}
\NormalTok{raw\_test\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{testing}\NormalTok{(split\_class)}
\end{Highlighting}
\end{Shaded}

\hypertarget{recipe-1}{%
\subsubsection{recipe}\label{recipe-1}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Classification recipe}
\NormalTok{rec\_class \textless{}{-}}\StringTok{ }\NormalTok{raw\_train\_x\_class }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Define the outcome variable}
\StringTok{  }\KeywordTok{recipe}\NormalTok{(target }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Median impute oldpeak column}
\StringTok{  }\KeywordTok{step\_medianimpute}\NormalTok{(oldpeak) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Expand "sex", "ca", "cp", "slope", and "thal" features out into dummy variables (indicators).}
\StringTok{  }\KeywordTok{step\_normalize}\NormalTok{(age) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{step\_dummy}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"sex"}\NormalTok{, }\StringTok{"ca"}\NormalTok{, }\StringTok{"cp"}\NormalTok{, }\StringTok{"slope"}\NormalTok{, }\StringTok{"thal"}\NormalTok{))}

\CommentTok{\# Prepare a dataset to base each step on}
\NormalTok{prep\_class \textless{}{-}}\StringTok{ }\NormalTok{rec\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{prep}\NormalTok{(}\DataTypeTok{retain =} \OtherTok{TRUE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# x features}
\NormalTok{train\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_class, }\KeywordTok{all\_predictors}\NormalTok{())}
\NormalTok{test\_x\_class \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(prep\_class, raw\_test\_x\_class, }\KeywordTok{all\_predictors}\NormalTok{())}

\CommentTok{\# y variables}
\NormalTok{train\_y\_class \textless{}{-}}\StringTok{ }\KeywordTok{juice}\NormalTok{(prep\_class, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.factor}\NormalTok{()}
\NormalTok{test\_y\_class \textless{}{-}}\StringTok{ }\KeywordTok{bake}\NormalTok{(prep\_class, raw\_test\_x\_class, }\KeywordTok{all\_outcomes}\NormalTok{())}\OperatorTok{$}\NormalTok{target }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{as.factor}\NormalTok{()}

\CommentTok{\# Checks}
\KeywordTok{names}\NormalTok{(train\_x\_class) }\CommentTok{\# Make sure there\textquotesingle{}s no target variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "age"      "trestbps" "chol"     "fbs"      "restecg"  "thalach" 
##  [7] "exang"    "oldpeak"  "sex_X1"   "ca_X1"    "ca_X2"    "ca_X3"   
## [13] "ca_X4"    "cp_X1"    "cp_X2"    "cp_X3"    "slope_X1" "slope_X2"
## [19] "thal_X1"  "thal_X2"  "thal_X3"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{class}\NormalTok{(train\_y\_class) }\CommentTok{\# Make sure this is a factor variable!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "factor"
\end{verbatim}

\hypertarget{supervised-learning}{%
\section{Supervised learning}\label{supervised-learning}}

\(x \rightarrow f \rightarrow y\) (defined)

\hypertarget{ols-and-lasso}{%
\subsection{OLS and Lasso}\label{ols-and-lasso}}

\hypertarget{parsnip}{%
\subsubsection{parsnip}\label{parsnip}}

\begin{itemize}
\tightlist
\item
  Build models (\texttt{parsnip})
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# OLS spec}
\NormalTok{ols\_spec \textless{}{-}}\StringTok{ }\KeywordTok{linear\_reg}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Specify a model}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"lm"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Specify an engine: lm, glmnet, stan, keras, spark}
\StringTok{  }\KeywordTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{) }\CommentTok{\# Declare a mode: regression or classification}
\end{Highlighting}
\end{Shaded}

\begin{figure}
\centering
\includegraphics{http://ethen8181.github.io/machine-learning/regularization/images/lasso_ridge_coefficients.png}
\caption{Source: \url{http://ethen8181.github.io}}
\end{figure}

Lasso is one of the regularization techniques along with ridge and elastic-net.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Lasso spec}
\NormalTok{lasso\_spec \textless{}{-}}\StringTok{ }\KeywordTok{linear\_reg}\NormalTok{(}
  \DataTypeTok{penalty =} \FloatTok{0.1}\NormalTok{, }\CommentTok{\# tuning hyperparameter}
  \DataTypeTok{mixture =} \DecValTok{1}
\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# 1 = lasso, 0 = ridge}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{)}

\CommentTok{\# If you don\textquotesingle{}t understand parsnip arguments}
\NormalTok{lasso\_spec }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{translate}\NormalTok{() }\CommentTok{\# See the documentation}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 0.1
##   mixture = 1
## 
## Computational engine: glmnet 
## 
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
##     alpha = 1, family = "gaussian")
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Fit models
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ols\_fit \textless{}{-}}\StringTok{ }\NormalTok{ols\_spec }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit\_xy}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ train\_x\_reg, }\DataTypeTok{y =}\NormalTok{ train\_y\_reg)}
\CommentTok{\# fit(train\_y\_reg \textasciitilde{} ., train\_x\_reg) \# When your data are not preprocessed}

\NormalTok{lasso\_fit \textless{}{-}}\StringTok{ }\NormalTok{lasso\_spec }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit\_xy}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ train\_x\_reg, }\DataTypeTok{y =}\NormalTok{ train\_y\_reg)}
\end{Highlighting}
\end{Shaded}

\hypertarget{yardstick}{%
\subsubsection{yardstick}\label{yardstick}}

\begin{itemize}
\tightlist
\item
  Visualize model fits
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{map2}\NormalTok{(}\KeywordTok{list}\NormalTok{(ols\_fit, lasso\_fit), }\KeywordTok{c}\NormalTok{(}\StringTok{"OLS"}\NormalTok{, }\StringTok{"Lasso"}\NormalTok{), visualize\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [[1]]
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-19-1.pdf}

\begin{verbatim}
## 
## [[2]]
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-19-2.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define performance metrics}
\NormalTok{metrics \textless{}{-}}\StringTok{ }\NormalTok{yardstick}\OperatorTok{::}\KeywordTok{metric\_set}\NormalTok{(rmse, mae, rsq)}

\CommentTok{\# Evaluate many models}
\NormalTok{evals \textless{}{-}}\StringTok{ }\NormalTok{purrr}\OperatorTok{::}\KeywordTok{map}\NormalTok{(}\KeywordTok{list}\NormalTok{(ols\_fit, lasso\_fit), evaluate\_reg) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{reduce}\NormalTok{(bind\_rows) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{type =} \KeywordTok{rep}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"OLS"}\NormalTok{, }\StringTok{"Lasso"}\NormalTok{), }\DataTypeTok{each =} \DecValTok{3}\NormalTok{))}

\CommentTok{\# Visualize the test results}
\NormalTok{evals }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(type, .estimate), }\DataTypeTok{y =}\NormalTok{ .estimate)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Model"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Estimate"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{glue}\NormalTok{(}\StringTok{"\{toupper(.metric)\}"}\NormalTok{), }\DataTypeTok{scales =} \StringTok{"free\_y"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-20-1.pdf}

\begin{itemize}
\tightlist
\item
  For more information, read \href{https://www.tmwr.org/}{Tidy Modeling with R} by Max Kuhn and Julia Silge.
\end{itemize}

\hypertarget{tune}{%
\subsubsection{tune}\label{tune}}

\textbf{Hyper}parameters are parameters that control the learning process.

\hypertarget{tune-ingredients}{%
\paragraph{tune ingredients}\label{tune-ingredients}}

\begin{itemize}
\tightlist
\item
  Search space for hyperparameters
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Grid search: a grid of hyperparameters
\item
  Random search: random sample points from a bounded domain
\end{enumerate}

\includegraphics{https://www.programmersought.com/images/523/7e44435f20fe514c11ca0d930af8547b.png}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# tune() = placeholder}

\NormalTok{tune\_spec \textless{}{-}}\StringTok{ }\KeywordTok{linear\_reg}\NormalTok{(}
  \DataTypeTok{penalty =} \KeywordTok{tune}\NormalTok{(), }\CommentTok{\# tuning hyperparameter}
  \DataTypeTok{mixture =} \DecValTok{1}
\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# 1 = lasso, 0 = ridge}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"glmnet"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_mode}\NormalTok{(}\StringTok{"regression"}\NormalTok{)}

\NormalTok{tune\_spec}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Computational engine: glmnet
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# penalty() searches 50 possible combinations}

\NormalTok{lambda\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_regular}\NormalTok{(}\KeywordTok{penalty}\NormalTok{(), }\DataTypeTok{levels =} \DecValTok{50}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{figure}
\centering
\includegraphics{https://www.googleapis.com/download/storage/v1/b/kaggle-forum-message-attachments/o/inbox\%2F4788946\%2F82b5a41b6693a313b246f02d79e972d5\%2FK\%20FOLD.png?generation=1608195745131795\&alt=media}
\caption{Source: Kaggle}
\end{figure}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility}

\NormalTok{rec\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{age =}\NormalTok{ train\_y\_reg)))}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Workflow}
\NormalTok{rec\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_model}\NormalTok{(tune\_spec) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{add\_formula}\NormalTok{(age }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Tuning results}
\NormalTok{rec\_res \textless{}{-}}\StringTok{ }\NormalTok{rec\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ rec\_folds,}
    \DataTypeTok{grid =}\NormalTok{ lambda\_grid}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize}{%
\paragraph{Visualize}\label{visualize}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Visualize}

\NormalTok{rec\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(penalty, mean, }\DataTypeTok{col =}\NormalTok{ .metric)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_errorbar}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{ymin =}\NormalTok{ mean }\OperatorTok{{-}}\StringTok{ }\NormalTok{std\_err,}
    \DataTypeTok{ymax =}\NormalTok{ mean }\OperatorTok{+}\StringTok{ }\NormalTok{std\_err}
\NormalTok{  ),}
  \DataTypeTok{alpha =} \FloatTok{0.3}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_line}\NormalTok{(}\DataTypeTok{size =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{x =} \StringTok{"log(lambda)"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{glue}\NormalTok{(}\StringTok{"\{toupper(.metric)\}"}\NormalTok{),}
    \DataTypeTok{scales =} \StringTok{"free"}\NormalTok{,}
    \DataTypeTok{nrow =} \DecValTok{2}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{legend.position =} \StringTok{"none"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-25-1.pdf}

\hypertarget{select}{%
\paragraph{Select}\label{select}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"filter"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Removing existing preference
## [conflicted] Will prefer dplyr::filter over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{top\_rmse \textless{}{-}}\StringTok{ }\KeywordTok{show\_best}\NormalTok{(rec\_res, }\DataTypeTok{metric =} \StringTok{"rmse"}\NormalTok{)}

\NormalTok{best\_rmse \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(rec\_res, }\DataTypeTok{metric =} \StringTok{"rmse"}\NormalTok{)}

\NormalTok{best\_rmse}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 2
##   penalty .config              
##     <dbl> <chr>                
## 1   0.391 Preprocessor1_Model48
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{\textquotesingle{}The RMSE of the initial model is}
\StringTok{     \{evals \%\textgreater{}\%}
\StringTok{  filter(type == "Lasso", .metric == "rmse") \%\textgreater{}\%}
\StringTok{  select(.estimate) \%\textgreater{}\%}
\StringTok{  round(2)\}\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## The RMSE of the initial model is
##    7.82
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{glue}\NormalTok{(}\StringTok{\textquotesingle{}The RMSE of the tuned model is \{rec\_res \%\textgreater{}\%}
\StringTok{  collect\_metrics() \%\textgreater{}\%}
\StringTok{  filter(.metric == "rmse") \%\textgreater{}\%}
\StringTok{  arrange(mean) \%\textgreater{}\%}
\StringTok{  dplyr::slice(1) \%\textgreater{}\%}
\StringTok{  select(mean) \%\textgreater{}\%}
\StringTok{  round(2)\}\textquotesingle{}}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## The RMSE of the tuned model is 7.55
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Finalize your workflow and visualize \href{https://koalaverse.github.io/vip/articles/vip.html}{variable importance}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{finalize\_lasso \textless{}{-}}\StringTok{ }\NormalTok{rec\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_rmse)}

\NormalTok{finalize\_lasso }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{age =}\NormalTok{ train\_y\_reg))) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-27-1.pdf}

\hypertarget{test-fit}{%
\paragraph{Test fit}\label{test-fit}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_lasso }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_reg }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{age =}\NormalTok{ test\_y\_reg)))}

\KeywordTok{evaluate\_reg}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       7.04 
## 2 mae     standard       5.79 
## 3 rsq     standard       0.411
\end{verbatim}

\hypertarget{decision-tree}{%
\subsection{Decision tree}\label{decision-tree}}

\hypertarget{parsnip-1}{%
\subsubsection{parsnip}\label{parsnip-1}}

\begin{itemize}
\tightlist
\item
  Build a model
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# workflow}
\NormalTok{tree\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_formula}\NormalTok{(target }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.)}

\CommentTok{\# spec}
\NormalTok{tree\_spec \textless{}{-}}\StringTok{ }\KeywordTok{decision\_tree}\NormalTok{(}

  \CommentTok{\# Mode}
  \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}

  \CommentTok{\# Tuning hyperparameters}
  \DataTypeTok{cost\_complexity =} \OtherTok{NULL}\NormalTok{,}
  \DataTypeTok{tree\_depth =} \OtherTok{NULL}
\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"rpart"}\NormalTok{) }\CommentTok{\# rpart, c5.0, spark}

\NormalTok{tree\_wf \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_model}\NormalTok{(tree\_spec)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Fit a model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\hypertarget{yardstick-1}{%
\subsubsection{yardstick}\label{yardstick-1}}

\begin{itemize}
\tightlist
\item
  Let's formally test prediction performance.
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Confusion matrix
\end{enumerate}

A confusion matrix is often used to describe the performance of a classification model. The example below is based on a binary classification model.

\begin{longtable}[]{@{}lll@{}}
\toprule
& Predicted: YES & Predicted: NO\tabularnewline
\midrule
\endhead
\textbf{Actual: YES} & True positive (TP) & False negative (FN)\tabularnewline
\textbf{Actual: NO} & False positive (FP) & True negative (TN)\tabularnewline
\bottomrule
\end{longtable}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Metrics
\end{enumerate}

\begin{itemize}
\item
  \texttt{accuracy}: The proportion of the data predicted correctly (\(\frac{TP + TN}{total}\)). 1 - accuracy = misclassification rate.
\item
  \texttt{precision}: Positive predictive value. \emph{When the model predicts yes, how correct is it?} (\(\frac{TP}{TP + FP}\))
\item
  \texttt{recall} (sensitivity): True positive rate (e.g., the share of sick people correctly identified as sick). \emph{When the actual value is yes, how often does the model predict yes?} (\(\frac{TP}{TP + FN}\))
\item
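  \texttt{F-score}: The weighted harmonic mean of precision and recall (the \(F_1\) score weights the two equally: \(F_1 = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}\)).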
\item
  \texttt{ROC\ Curve} (receiver operating characteristic curve): a plot that shows the relationship between the true and false positive rates at different classification thresholds. The y-axis indicates the true positive rate, and the x-axis indicates the false positive rate. What matters is the AUC (Area Under the ROC Curve), which can be interpreted as the probability that the model ranks a randomly chosen ``positive'' case higher than a randomly chosen ``negative'' case (for the probabilistic interpretation of AUC, see \href{https://www.alexejgossmann.com/auc/}{this blog post}).
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://developers.google.com/machine-learning/crash-course/images/ROCCurve.svg}
\caption{Source: Google Machine Learning Crash Course}
\end{figure}

\begin{itemize}
\tightlist
\item
  To learn more about other metrics, check out the yardstick package \href{https://yardstick.tidymodels.org/reference/index.html}{references}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define performance metrics}

\NormalTok{metrics \textless{}{-}}\StringTok{ }\NormalTok{yardstick}\OperatorTok{::}\KeywordTok{metric\_set}\NormalTok{(accuracy, precision, recall)}

\CommentTok{\# Visualize}

\NormalTok{tree\_fit\_viz\_metr \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_eval}\NormalTok{(tree\_fit)}

\NormalTok{tree\_fit\_viz\_metr}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-31-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit\_viz\_mat \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_conf}\NormalTok{(tree\_fit)}

\NormalTok{tree\_fit\_viz\_mat}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-31-2.pdf}

\hypertarget{tune-1}{%
\subsubsection{tune}\label{tune-1}}

\hypertarget{tune-ingredients-1}{%
\paragraph{tune ingredients}\label{tune-ingredients-1}}

Decision trees tend to overfit. We need to consider two things to reduce this problem: how to split and when to stop a tree.

\begin{itemize}
\item
  \textbf{complexity parameter}: a high CP means a simple decision tree with few splits.
\item
  \textbf{tree\_depth}: the maximum depth of the tree; a shallower tree is simpler and less prone to overfitting.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tune\_spec \textless{}{-}}\StringTok{ }\KeywordTok{decision\_tree}\NormalTok{(}
  \DataTypeTok{cost\_complexity =} \KeywordTok{tune}\NormalTok{(), }\CommentTok{\# how to split}
  \DataTypeTok{tree\_depth =} \KeywordTok{tune}\NormalTok{(), }\CommentTok{\# when to stop}
  \DataTypeTok{mode =} \StringTok{"classification"}
\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"rpart"}\NormalTok{)}

\NormalTok{tree\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_regular}\NormalTok{(}\KeywordTok{cost\_complexity}\NormalTok{(),}
  \KeywordTok{tree\_depth}\NormalTok{(),}
  \DataTypeTok{levels =} \DecValTok{5}
\NormalTok{) }\CommentTok{\# 2 hyperparameters {-}\textgreater{} 5*5 = 25 combinations}

\NormalTok{tree\_grid }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(tree\_depth)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   tree_depth     n
##        <int> <int>
## 1          1     5
## 2          4     5
## 3          8     5
## 4         11     5
## 5         15     5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility}

\NormalTok{tree\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)),}
  \DataTypeTok{strata =}\NormalTok{ target}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow-1}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow-1}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Update workflow}
\NormalTok{tree\_wf \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{update\_model}\NormalTok{(tune\_spec)}

\CommentTok{\# Determine the number of cores}
\NormalTok{no\_cores \textless{}{-}}\StringTok{ }\KeywordTok{detectCores}\NormalTok{() }\OperatorTok{{-}}\StringTok{ }\DecValTok{1}

\CommentTok{\# Initiate}
\NormalTok{cl \textless{}{-}}\StringTok{ }\KeywordTok{makeCluster}\NormalTok{(no\_cores)}

\KeywordTok{registerDoParallel}\NormalTok{(cl)}

\CommentTok{\# Tuning results}
\NormalTok{tree\_res \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ tree\_folds,}
    \DataTypeTok{grid =}\NormalTok{ tree\_grid,}
    \DataTypeTok{metrics =}\NormalTok{ metrics}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize-1}{%
\paragraph{Visualize}\label{visualize-1}}

\begin{itemize}
\tightlist
\item
  The following plot draws on the \href{https://www.tidymodels.org/start/tuning/}{vignette} of the tidymodels package.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{tree\_depth =} \KeywordTok{factor}\NormalTok{(tree\_depth)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(cost\_complexity, mean, }\DataTypeTok{col =}\NormalTok{ .metric)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{size =} \DecValTok{3}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Subplots}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{tree\_depth,}
    \DataTypeTok{scales =} \StringTok{"free"}\NormalTok{,}
    \DataTypeTok{nrow =} \DecValTok{2}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Log scale x}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\KeywordTok{label\_number}\NormalTok{()) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Discrete color scale}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{(}\DataTypeTok{option =} \StringTok{"plasma"}\NormalTok{, }\DataTypeTok{begin =} \FloatTok{.9}\NormalTok{, }\DataTypeTok{end =} \DecValTok{0}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Cost complexity"}\NormalTok{,}
    \DataTypeTok{col =} \StringTok{"Tree depth"}\NormalTok{,}
    \DataTypeTok{y =} \OtherTok{NULL}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-34-1.pdf}

\hypertarget{select-1}{%
\paragraph{Select}\label{select-1}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Optimal hyperparameter}
\NormalTok{best\_tree \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(tree\_res, }\StringTok{"recall"}\NormalTok{)}

\CommentTok{\# Add the hyperparameter to the workflow}
\NormalTok{finalize\_tree \textless{}{-}}\StringTok{ }\NormalTok{tree\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_tree)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit\_tuned \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}

\CommentTok{\# Metrics}
\NormalTok{(tree\_fit\_viz\_metr }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_eval}\NormalTok{(tree\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-36-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Confusion matrix}
\NormalTok{(tree\_fit\_viz\_mat }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_conf}\NormalTok{(tree\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-36-2.pdf}

\begin{itemize}
\tightlist
\item
  Visualize variable importance
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tree\_fit\_tuned }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-37-1.pdf}

\hypertarget{test-fit-1}{%
\paragraph{Test fit}\label{test-fit-1}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ test\_y\_class)))}

\KeywordTok{evaluate\_class}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.761
## 2 precision binary         0.778
## 3 recall    binary         0.667
\end{verbatim}

In the next subsection, we will learn variants of ensemble models that improve decision tree models by putting models together.

\hypertarget{bagging-random-forest}{%
\subsection{Bagging (Random forest)}\label{bagging-random-forest}}

Key idea applied across all ensemble models (bagging, boosting, and stacking):
single learner -\textgreater{} N learners (N \textgreater{} 1)

Many learners could perform better than a single learner as this approach reduces the \textbf{variance} of a single estimate and provides more stability.

Here we focus on the difference between bagging and boosting. In short, boosting may reduce bias while increasing variance. On the other hand, bagging may reduce variance but has nothing to do with bias. Please check out \href{https://quantdare.com/what-is-the-difference-between-bagging-and-boosting/}{What is the difference between Bagging and Boosting?} by aporras.

\textbf{bagging}

\begin{itemize}
\item
  Data: Training data will be randomly sampled with replacement (bootstrapping samples + drawing random \textbf{subsets} of features for training individual trees)
\item
  Learning: Building models in parallel (independently)
\item
  Prediction: Simple average of the estimated responses (regression) or a majority vote across the individual models (classification)
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://sebastianraschka.com/images/faq/bagging-boosting-rf/bagging.png}
\caption{From Sebastian Raschka's blog}
\end{figure}

\textbf{boosting}

\begin{itemize}
\item
  Data: Weighted training data will be randomly sampled
\item
  Learning: Building models sequentially (mispredicted cases receive more weight)
\item
  Prediction: Weighted average of the estimated responses
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://sebastianraschka.com/images/faq/bagging-boosting-rf/boosting.png}
\caption{From Sebastian Raschka's blog}
\end{figure}

\hypertarget{parsnip-2}{%
\subsubsection{parsnip}\label{parsnip-2}}

\begin{itemize}
\tightlist
\item
  Build a model
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# workflow}
\NormalTok{rand\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_formula}\NormalTok{(target }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.)}

\CommentTok{\# spec}
\NormalTok{rand\_spec \textless{}{-}}\StringTok{ }\KeywordTok{rand\_forest}\NormalTok{(}

  \CommentTok{\# Mode}
  \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}

  \CommentTok{\# Tuning hyperparameters}
  \DataTypeTok{mtry =} \OtherTok{NULL}\NormalTok{, }\CommentTok{\# The number of predictors available for splitting at each node}
  \DataTypeTok{min\_n =} \OtherTok{NULL}\NormalTok{, }\CommentTok{\# The minimum number of data points needed to keep splitting nodes}
  \DataTypeTok{trees =} \DecValTok{500}
\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# The number of trees}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{,}
    \CommentTok{\# We want the importance of predictors to be assessed.}
    \DataTypeTok{seed =} \DecValTok{1234}\NormalTok{,}
    \DataTypeTok{importance =} \StringTok{"permutation"}
\NormalTok{  )}

\NormalTok{rand\_wf \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_model}\NormalTok{(rand\_spec)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Fit a model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\hypertarget{yardstick-2}{%
\subsubsection{yardstick}\label{yardstick-2}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define performance metrics}
\NormalTok{metrics \textless{}{-}}\StringTok{ }\NormalTok{yardstick}\OperatorTok{::}\KeywordTok{metric\_set}\NormalTok{(accuracy, precision, recall)}

\NormalTok{rand\_fit\_viz\_metr \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_eval}\NormalTok{(rand\_fit)}

\NormalTok{rand\_fit\_viz\_metr}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-41-1.pdf}

\begin{itemize}
\tightlist
\item
  Visualize the confusion matrix.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit\_viz\_mat \textless{}{-}}\StringTok{ }\KeywordTok{visualize\_class\_conf}\NormalTok{(rand\_fit)}

\NormalTok{rand\_fit\_viz\_mat}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-42-1.pdf}

\hypertarget{tune-2}{%
\subsubsection{tune}\label{tune-2}}

\hypertarget{tune-ingredients-2}{%
\paragraph{tune ingredients}\label{tune-ingredients-2}}

We focus on the following two hyperparameters:

\begin{itemize}
\item
  \texttt{mtry}: The number of predictors available for splitting at each node.
\item
  \texttt{min\_n}: The minimum number of data points needed to keep splitting nodes.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tune\_spec \textless{}{-}}
\StringTok{  }\KeywordTok{rand\_forest}\NormalTok{(}
    \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}

    \CommentTok{\# Tuning hyperparameters}
    \DataTypeTok{mtry =} \KeywordTok{tune}\NormalTok{(),}
    \DataTypeTok{min\_n =} \KeywordTok{tune}\NormalTok{()}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"ranger"}\NormalTok{,}
    \DataTypeTok{seed =} \DecValTok{1234}\NormalTok{,}
    \DataTypeTok{importance =} \StringTok{"permutation"}
\NormalTok{  )}

\NormalTok{rand\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_regular}\NormalTok{(}\KeywordTok{mtry}\NormalTok{(}\DataTypeTok{range =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{10}\NormalTok{)),}
  \KeywordTok{min\_n}\NormalTok{(}\DataTypeTok{range =} \KeywordTok{c}\NormalTok{(}\DecValTok{2}\NormalTok{, }\DecValTok{10}\NormalTok{)),}
  \DataTypeTok{levels =} \DecValTok{5}
\NormalTok{)}

\NormalTok{rand\_grid }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(min\_n)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 5 x 2
##   min_n     n
##   <int> <int>
## 1     2     5
## 2     4     5
## 3     6     5
## 4     8     5
## 5    10     5
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility}

\NormalTok{rand\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)),}
  \DataTypeTok{strata =}\NormalTok{ target}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow-2}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow-2}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Update workflow}
\NormalTok{rand\_wf \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{update\_model}\NormalTok{(tune\_spec)}

\CommentTok{\# Tuning results}
\NormalTok{rand\_res \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ rand\_folds,}
    \DataTypeTok{grid =}\NormalTok{ rand\_grid,}
    \DataTypeTok{metrics =}\NormalTok{ metrics}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize-2}{%
\paragraph{Visualize}\label{visualize-2}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{min\_n =} \KeywordTok{factor}\NormalTok{(min\_n)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(mtry, mean, }\DataTypeTok{color =}\NormalTok{ min\_n)) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Line + Point plot}
\StringTok{  }\KeywordTok{geom\_line}\NormalTok{(}\DataTypeTok{size =} \FloatTok{1.5}\NormalTok{, }\DataTypeTok{alpha =} \FloatTok{0.6}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{size =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Subplots}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{.metric,}
    \DataTypeTok{scales =} \StringTok{"free"}\NormalTok{,}
    \DataTypeTok{nrow =} \DecValTok{2}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Log scale x}
\StringTok{  }\KeywordTok{scale\_x\_log10}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\KeywordTok{label\_number}\NormalTok{()) }\OperatorTok{+}
\StringTok{  }\CommentTok{\# Discrete color scale}
\StringTok{  }\KeywordTok{scale\_color\_viridis\_d}\NormalTok{(}\DataTypeTok{option =} \StringTok{"plasma"}\NormalTok{, }\DataTypeTok{begin =} \FloatTok{.9}\NormalTok{, }\DataTypeTok{end =} \DecValTok{0}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"The number of predictors to be sampled"}\NormalTok{,}
    \DataTypeTok{col =} \StringTok{"The minimum number of data points needed for splitting"}\NormalTok{,}
    \DataTypeTok{y =} \OtherTok{NULL}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{legend.position =} \StringTok{"bottom"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-46-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Optimal hyperparameter}
\NormalTok{best\_tree \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(rand\_res, }\StringTok{"accuracy"}\NormalTok{)}

\NormalTok{best\_tree}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 3
##    mtry min_n .config              
##   <int> <int> <chr>                
## 1     1     4 Preprocessor1_Model06
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add the hyperparameter to the workflow}
\NormalTok{finalize\_tree \textless{}{-}}\StringTok{ }\NormalTok{rand\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_tree)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit\_tuned \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}

\CommentTok{\# Metrics}
\NormalTok{(rand\_fit\_viz\_metr }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_eval}\NormalTok{(rand\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-48-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Confusion matrix}
\NormalTok{(rand\_fit\_viz\_mat }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_conf}\NormalTok{(rand\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-48-2.pdf}

\begin{itemize}
\tightlist
\item
  Visualize variable importance
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{rand\_fit\_tuned }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-49-1.pdf}

\hypertarget{test-fit-2}{%
\paragraph{Test fit}\label{test-fit-2}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_tree }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_class }\OperatorTok{\%\textgreater{}\%}
\StringTok{    }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ test\_y\_class)))}

\KeywordTok{evaluate\_class}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.913
## 2 precision binary         0.905
## 3 recall    binary         0.905
\end{verbatim}

\hypertarget{boosting-xgboost}{%
\subsection{Boosting (XGBoost)}\label{boosting-xgboost}}

\hypertarget{parsnip-3}{%
\subsubsection{parsnip}\label{parsnip-3}}

\begin{itemize}
\tightlist
\item
  Build a model
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Specify a model
\item
  Specify an engine
\item
  Specify a mode
\end{enumerate}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# workflow}
\NormalTok{xg\_wf \textless{}{-}}\StringTok{ }\KeywordTok{workflow}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_formula}\NormalTok{(target }\OperatorTok{\textasciitilde{}}\StringTok{ }\NormalTok{.)}

\CommentTok{\# spec}
\NormalTok{xg\_spec \textless{}{-}}\StringTok{ }\KeywordTok{boost\_tree}\NormalTok{(}

  \CommentTok{\# Mode}
  \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}

  \CommentTok{\# Tuning hyperparameters}

  \CommentTok{\# The number of trees to fit, aka boosting iterations}
  \DataTypeTok{trees =} \KeywordTok{c}\NormalTok{(}\DecValTok{100}\NormalTok{, }\DecValTok{300}\NormalTok{, }\DecValTok{500}\NormalTok{, }\DecValTok{700}\NormalTok{, }\DecValTok{900}\NormalTok{),}
  \CommentTok{\# The depth of the decision tree (how many levels of splits).}
  \DataTypeTok{tree\_depth =} \KeywordTok{c}\NormalTok{(}\DecValTok{1}\NormalTok{, }\DecValTok{6}\NormalTok{),}
  \CommentTok{\# Learning rate: lower means the ensemble will adapt more slowly.}
  \DataTypeTok{learn\_rate =} \KeywordTok{c}\NormalTok{(}\FloatTok{0.0001}\NormalTok{, }\FloatTok{0.01}\NormalTok{, }\FloatTok{0.2}\NormalTok{),}
  \CommentTok{\# Stop splitting a tree if we only have this many obs in a tree node.}
  \DataTypeTok{min\_n =}\NormalTok{ 10L}
\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"xgboost"}\NormalTok{)}

\NormalTok{xg\_wf \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{add\_model}\NormalTok{(xg\_spec)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Fit a model
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in begin_iteration:end_iteration: numerical expression has 5 elements:
## only the first used
\end{verbatim}

\begin{verbatim}
## [22:46:13] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
\end{verbatim}

\hypertarget{yardstick-3}{%
\subsubsection{yardstick}\label{yardstick-3}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{metrics \textless{}{-}}\StringTok{ }\KeywordTok{metric\_set}\NormalTok{(}
\NormalTok{  yardstick}\OperatorTok{::}\NormalTok{accuracy,}
\NormalTok{  yardstick}\OperatorTok{::}\NormalTok{precision,}
\NormalTok{  yardstick}\OperatorTok{::}\NormalTok{recall}
\NormalTok{)}

\KeywordTok{evaluate\_class}\NormalTok{(xg\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.739
## 2 precision binary         0.705
## 3 recall    binary         0.738
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_viz\_metr \textless{}{-}}
\StringTok{  }\KeywordTok{visualize\_class\_eval}\NormalTok{(xg\_fit)}

\NormalTok{xg\_fit\_viz\_metr}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-54-1.pdf}

\begin{itemize}
\tightlist
\item
  Visualize the confusion matrix.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_viz\_mat \textless{}{-}}
\StringTok{  }\KeywordTok{visualize\_class\_conf}\NormalTok{(xg\_fit)}

\NormalTok{xg\_fit\_viz\_mat}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-55-1.pdf}

\hypertarget{tune-3}{%
\subsubsection{tune}\label{tune-3}}

\hypertarget{tune-ingredients-3}{%
\paragraph{tune ingredients}\label{tune-ingredients-3}}

\begin{itemize}
\tightlist
\item
  We focus on the following hyperparameters: \texttt{trees}, \texttt{tree\_depth}, \texttt{learn\_rate}, \texttt{min\_n}, \texttt{mtry}, \texttt{loss\_reduction}, and \texttt{sample\_size}.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{tune\_spec \textless{}{-}}
\StringTok{  }\NormalTok{xg\_spec \textless{}{-}}\StringTok{ }\KeywordTok{boost\_tree}\NormalTok{(}

    \CommentTok{\# Mode}
    \DataTypeTok{mode =} \StringTok{"classification"}\NormalTok{,}

    \CommentTok{\# Tuning hyperparameters}

    \CommentTok{\# The number of trees to fit, aka boosting iterations}
    \DataTypeTok{trees =} \KeywordTok{tune}\NormalTok{(),}
    \CommentTok{\# The depth of the decision tree (how many levels of splits).}
    \DataTypeTok{tree\_depth =} \KeywordTok{tune}\NormalTok{(),}
    \CommentTok{\# Learning rate: lower means the ensemble will adapt more slowly.}
    \DataTypeTok{learn\_rate =} \KeywordTok{tune}\NormalTok{(),}
    \CommentTok{\# Stop splitting a tree if we only have this many obs in a tree node.}
    \DataTypeTok{min\_n =} \KeywordTok{tune}\NormalTok{(),}
    \DataTypeTok{loss\_reduction =} \KeywordTok{tune}\NormalTok{(),}
    \CommentTok{\# The number of randomly selected predictors}
    \DataTypeTok{mtry =} \KeywordTok{tune}\NormalTok{(),}
    \CommentTok{\# The size of the data set used for modeling within an iteration}
    \DataTypeTok{sample\_size =} \KeywordTok{tune}\NormalTok{()}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{set\_engine}\NormalTok{(}\StringTok{"xgboost"}\NormalTok{)}

\CommentTok{\# Space{-}filling hyperparameter grids}
\NormalTok{xg\_grid \textless{}{-}}\StringTok{ }\KeywordTok{grid\_latin\_hypercube}\NormalTok{(}
  \KeywordTok{trees}\NormalTok{(),}
  \KeywordTok{tree\_depth}\NormalTok{(),}
  \KeywordTok{learn\_rate}\NormalTok{(),}
  \KeywordTok{min\_n}\NormalTok{(),}
  \KeywordTok{loss\_reduction}\NormalTok{(),}
  \DataTypeTok{sample\_size =} \KeywordTok{sample\_prop}\NormalTok{(),}
  \KeywordTok{finalize}\NormalTok{(}\KeywordTok{mtry}\NormalTok{(), train\_x\_class),}
  \DataTypeTok{size =} \DecValTok{30}
\NormalTok{)}

\CommentTok{\# 10{-}fold cross{-}validation}

\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1234}\NormalTok{) }\CommentTok{\# for reproducibility}

\NormalTok{xg\_folds \textless{}{-}}\StringTok{ }\KeywordTok{vfold\_cv}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)),}
  \DataTypeTok{strata =}\NormalTok{ target}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{add-these-elements-to-a-workflow-3}{%
\paragraph{Add these elements to a workflow}\label{add-these-elements-to-a-workflow-3}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Update workflow}
\NormalTok{xg\_wf \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{update\_model}\NormalTok{(tune\_spec)}

\CommentTok{\# Tuning results}
\NormalTok{xg\_res \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tune\_grid}\NormalTok{(}
    \DataTypeTok{resamples =}\NormalTok{ xg\_folds,}
    \DataTypeTok{grid =}\NormalTok{ xg\_grid,}
    \DataTypeTok{control =} \KeywordTok{control\_grid}\NormalTok{(}\DataTypeTok{save\_pred =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\hypertarget{visualize-3}{%
\paragraph{Visualize}\label{visualize-3}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"filter"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Removing existing preference
## [conflicted] Will prefer dplyr::filter over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect\_metrics}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(.metric }\OperatorTok{==}\StringTok{ "roc\_auc"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pivot\_longer}\NormalTok{(mtry}\OperatorTok{:}\NormalTok{sample\_size,}
    \DataTypeTok{values\_to =} \StringTok{"value"}\NormalTok{,}
    \DataTypeTok{names\_to =} \StringTok{"parameter"}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ value, }\DataTypeTok{y =}\NormalTok{ mean, }\DataTypeTok{color =}\NormalTok{ parameter)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.8}\NormalTok{, }\DataTypeTok{show.legend =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{parameter, }\DataTypeTok{scales =} \StringTok{"free\_x"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{y =} \StringTok{"AUC"}\NormalTok{,}
    \DataTypeTok{x =} \OtherTok{NULL}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-58-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Optimal hyperparameter}
\NormalTok{best\_xg \textless{}{-}}\StringTok{ }\KeywordTok{select\_best}\NormalTok{(xg\_res, }\StringTok{"roc\_auc"}\NormalTok{)}

\NormalTok{best\_xg}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 8
##    mtry trees min_n tree_depth    learn_rate loss_reduction sample_size .config 
##   <int> <int> <int>      <int>         <dbl>          <dbl>       <dbl> <chr>   
## 1     6  1856     6         10 0.00000000859  0.00000000102       0.681 Preproc~
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Add the hyperparameter to the workflow}
\NormalTok{finalize\_xg \textless{}{-}}\StringTok{ }\NormalTok{xg\_wf }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{finalize\_workflow}\NormalTok{(best\_xg)}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_tuned \textless{}{-}}\StringTok{ }\NormalTok{finalize\_xg }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(train\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ train\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [22:47:59] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Metrics}
\NormalTok{(xg\_fit\_viz\_metr }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_eval}\NormalTok{(xg\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-60-1.pdf}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Confusion matrix}
\NormalTok{(xg\_fit\_viz\_mat }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Non{-}tuned"}\NormalTok{)) }\OperatorTok{/}\StringTok{ }\NormalTok{(}\KeywordTok{visualize\_class\_conf}\NormalTok{(xg\_fit\_tuned) }\OperatorTok{+}\StringTok{ }\KeywordTok{labs}\NormalTok{(}\DataTypeTok{title =} \StringTok{"Tuned"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-60-2.pdf}

\begin{itemize}
\tightlist
\item
  Visualize variable importance
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xg\_fit\_tuned }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{pull\_workflow\_fit}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{vip}\OperatorTok{::}\KeywordTok{vip}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-61-1.pdf}

\hypertarget{test-fit-3}{%
\paragraph{Test fit}\label{test-fit-3}}

\begin{itemize}
\tightlist
\item
  Apply the tuned model to the test dataset
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_fit \textless{}{-}}\StringTok{ }\NormalTok{finalize\_xg }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{fit}\NormalTok{(test\_x\_class }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{bind\_cols}\NormalTok{(}\KeywordTok{tibble}\NormalTok{(}\DataTypeTok{target =}\NormalTok{ test\_y\_class)))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [22:48:00] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{evaluate\_class}\NormalTok{(test\_fit)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 3 x 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.761
## 2 precision binary         0.763
## 3 recall    binary         0.690
\end{verbatim}

\hypertarget{stacking-superlearner}{%
\subsection{Stacking (SuperLearner)}\label{stacking-superlearner}}

This stacking part of the book heavily relies on \href{https://github.com/dlab-berkeley/Machine-Learning-in-R/blob/master/07-ensembles.Rmd}{Chris Kennedy's notebook}.

\hypertarget{overview}{%
\subsubsection{Overview}\label{overview}}

\hypertarget{stacking}{%
\paragraph{Stacking}\label{stacking}}

Wolpert, D.H., 1992. \href{http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.1533}{Stacked generalization}. \emph{Neural networks}, 5(2), pp.~241--259.

Breiman, L., 1996. \href{https://statistics.berkeley.edu/sites/default/files/tech-reports/367.pdf}{Stacked regressions}. \emph{Machine learning}, 24(1), pp.~49--64.

\hypertarget{superlearner}{%
\paragraph{SuperLearner}\label{superlearner}}

The \href{https://cran.r-project.org/web/packages/SuperLearner/index.html}{``SuperLearner'' R package} is a method that simplifies ensemble learning by allowing you to simultaneously evaluate the cross-validated performance of multiple algorithms and/or a single algorithm with differently tuned hyperparameters. This is a generally advisable approach to machine learning instead of fitting single algorithms.

Let's see how the four classification algorithms you learned in this workshop (1-lasso, 2-decision tree, 3-random forest, and 4-gradient boosted trees) compare to each other and also to 5-binary logistic regression (\texttt{glm}) and the 6-mean of Y as a benchmark algorithm, in terms of their cross-validated error!

A ``wrapper'' is a short function that adapts an algorithm for the SuperLearner package. Check out the different algorithm wrappers offered by SuperLearner:

\hypertarget{choose-algorithms}{%
\subsubsection{Choose algorithms}\label{choose-algorithms}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Review available models}
\NormalTok{SuperLearner}\OperatorTok{::}\KeywordTok{listWrappers}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## All prediction algorithm wrappers in SuperLearner:
\end{verbatim}

\begin{verbatim}
##  [1] "SL.bartMachine"      "SL.bayesglm"         "SL.biglasso"        
##  [4] "SL.caret"            "SL.caret.rpart"      "SL.cforest"         
##  [7] "SL.earth"            "SL.extraTrees"       "SL.gam"             
## [10] "SL.gbm"              "SL.glm"              "SL.glm.interaction" 
## [13] "SL.glmnet"           "SL.ipredbagg"        "SL.kernelKnn"       
## [16] "SL.knn"              "SL.ksvm"             "SL.lda"             
## [19] "SL.leekasso"         "SL.lm"               "SL.loess"           
## [22] "SL.logreg"           "SL.mean"             "SL.nnet"            
## [25] "SL.nnls"             "SL.polymars"         "SL.qda"             
## [28] "SL.randomForest"     "SL.ranger"           "SL.ridge"           
## [31] "SL.rpart"            "SL.rpartPrune"       "SL.speedglm"        
## [34] "SL.speedlm"          "SL.step"             "SL.step.forward"    
## [37] "SL.step.interaction" "SL.stepAIC"          "SL.svm"             
## [40] "SL.template"         "SL.xgboost"
\end{verbatim}

\begin{verbatim}
## 
## All screening algorithm wrappers in SuperLearner:
\end{verbatim}

\begin{verbatim}
## [1] "All"
## [1] "screen.corP"           "screen.corRank"        "screen.glmnet"        
## [4] "screen.randomForest"   "screen.SIS"            "screen.template"      
## [7] "screen.ttest"          "write.screen.template"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Compile the algorithm wrappers to be used.}
\NormalTok{sl\_lib \textless{}{-}}\StringTok{ }\KeywordTok{c}\NormalTok{(}
  \StringTok{"SL.mean"}\NormalTok{, }\CommentTok{\# Marginal mean of the outcome}
  \StringTok{"SL.glmnet"}\NormalTok{, }\CommentTok{\# GLM with lasso/elasticnet regularization}
  \StringTok{"SL.rpart"}\NormalTok{, }\CommentTok{\# Decision tree}
  \StringTok{"SL.ranger"}\NormalTok{, }\CommentTok{\# Random forest}
  \StringTok{"SL.xgboost"}
\NormalTok{) }\CommentTok{\# XGBoost}
\end{Highlighting}
\end{Shaded}

\hypertarget{fit-model}{%
\subsubsection{Fit model}\label{fit-model}}

Fit the ensemble!

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# This is a seed that is compatible with multicore parallel processing.}
\CommentTok{\# See ?set.seed for more information.}
\KeywordTok{set.seed}\NormalTok{(}\DecValTok{1}\NormalTok{, }\StringTok{"L\textquotesingle{}Ecuyer{-}CMRG"}\NormalTok{)}

\CommentTok{\# This will take a few minutes to execute {-} take a look at the .html file to see the output!}
\NormalTok{cv\_sl \textless{}{-}}\StringTok{ }\NormalTok{SuperLearner}\OperatorTok{::}\KeywordTok{CV.SuperLearner}\NormalTok{(}
  \DataTypeTok{Y =} \KeywordTok{as.numeric}\NormalTok{(}\KeywordTok{as.character}\NormalTok{(train\_y\_class)),}
  \DataTypeTok{X =}\NormalTok{ train\_x\_class,}
  \DataTypeTok{family =} \KeywordTok{binomial}\NormalTok{(),}
  \CommentTok{\# For a real analysis we would use V = 10.}
  \DataTypeTok{cvControl =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{V =}\NormalTok{ 5L, }\DataTypeTok{stratifyCV =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{SL.library =}\NormalTok{ sl\_lib,}
  \DataTypeTok{verbose =} \OtherTok{FALSE}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{risk}{%
\subsubsection{Risk}\label{risk}}

Risk is the average loss, and loss is how far off the prediction was for an individual observation. The lower the risk, the fewer errors the model makes in its prediction. SuperLearner's default loss metric is squared error \((y_{\text{actual}} - y_{\text{predicted}})^2\), so the risk is the mean-squared error (just like in ordinary least \emph{squares} regression). View the summary, plot results, and compute the Area Under the ROC Curve (AUC)!

\hypertarget{summary}{%
\paragraph{Summary}\label{summary}}

\begin{itemize}
\tightlist
\item
  \texttt{Discrete\ SL} chooses the best single learner (in this case, \texttt{SL.glmnet} or \texttt{lasso}).
\item
  \texttt{SuperLearner} takes a weighted average of the \textbf{models} using the coefficients (importance of each learner in the overall ensemble). Coefficient 0 means that learner is not used at all.
\item
  \texttt{SL.mean\_All} (the weighted mean of \(Y\)) is a benchmark algorithm (ignoring features).
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{summary}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 
## Call:  
## SuperLearner::CV.SuperLearner(Y = as.numeric(as.character(train_y_class)),  
##     X = train_x_class, family = binomial(), SL.library = sl_lib, verbose = FALSE,  
##     cvControl = list(V = 5L, stratifyCV = TRUE)) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  5 
## 
##       Algorithm     Ave        se      Min     Max
##   Super Learner 0.11275 0.0135331 0.077146 0.14245
##     Discrete SL 0.11899 0.0144538 0.075281 0.16458
##     SL.mean_All 0.24798 0.0030968 0.247743 0.24895
##   SL.glmnet_All 0.10731 0.0135643 0.075281 0.14245
##    SL.rpart_All 0.16374 0.0184360 0.107553 0.22803
##   SL.ranger_All 0.12551 0.0119097 0.097342 0.15908
##  SL.xgboost_All 0.13336 0.0151555 0.104319 0.16458
\end{verbatim}

\hypertarget{plot}{%
\paragraph{Plot}\label{plot}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Plot the cross{-}validated risk estimate with 95\% CIs.}

\KeywordTok{plot}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/cvsl_review-1.pdf}

\hypertarget{compute-auc-for-all-estimators}{%
\subsubsection{Compute AUC for all estimators}\label{compute-auc-for-all-estimators}}

\textbf{ROC}

ROC: an ROC (receiver operating characteristic) curve plots the relationship between the True Positive Rate (Y-axis) and the False Positive Rate (X-axis).

\begin{figure}
\centering
\includegraphics{https://developers.google.com/machine-learning/crash-course/images/AUC.svg}
\caption{Area Under the ROC Curve}
\end{figure}

\textbf{AUC}

AUC: Area Under the ROC Curve

1 = perfect

0.5 = no better than chance

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ck37r}\OperatorTok{::}\KeywordTok{auc\_table}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##                      auc         se  ci_lower  ci_upper      p-value
## SL.mean_All    0.5000000 0.06912305 0.3645213 0.6354787 5.438317e-10
## SL.rpart_All   0.8354691 0.03936978 0.7583058 0.9126325 1.455568e-02
## SL.xgboost_All 0.8801602 0.02451881 0.8321042 0.9282162 4.639456e-02
## DiscreteSL     0.9064989 0.02057929 0.8661642 0.9468335 2.349099e-01
## SL.ranger_All  0.9075744 0.02002181 0.8683323 0.9468164 2.453551e-01
## SuperLearner   0.9149428 0.01984636 0.8760446 0.9538409 3.729693e-01
## SL.glmnet_All  0.9213730 0.01925689 0.8836302 0.9591158 5.000000e-01
\end{verbatim}

\hypertarget{plot-the-roc-curve-for-the-best-estimator-discretsl}{%
\paragraph{Plot the ROC curve for the best estimator (DiscreteSL)}\label{plot-the-roc-curve-for-the-best-estimator-discretsl}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{ck37r}\OperatorTok{::}\KeywordTok{plot\_roc}\NormalTok{(cv\_sl)}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-68-1.pdf}

\hypertarget{review-weight-distribution-for-the-superlearner}{%
\paragraph{Review weight distribution for the SuperLearner}\label{review-weight-distribution-for-the-superlearner}}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{print}\NormalTok{(ck37r}\OperatorTok{::}\KeywordTok{cvsl\_weights}\NormalTok{(cv\_sl), }\DataTypeTok{row.names =} \OtherTok{FALSE}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  # Learner    Mean      SD     Min     Max
##  1  glmnet 0.78566 0.21528 0.48711 1.00000
##  2 xgboost 0.12437 0.17523 0.00000 0.41331
##  3  ranger 0.07006 0.15665 0.00000 0.35028
##  4   rpart 0.01992 0.04453 0.00000 0.09958
##  5    mean 0.00000 0.00000 0.00000 0.00000
\end{verbatim}

The general stacking approach is available in the tidymodels framework through the \href{https://github.com/tidymodels/stacks}{\texttt{stacks}} package (developmental stage).

However, SuperLearner is currently not available in the tidymodels framework. You can easily build and add a parsnip model if you'd like to. If you are interested in knowing more about it, please look at \href{https://www.tidymodels.org/learn/develop/models/}{this tidymodels vignette}.

\hypertarget{applications-2}{%
\subsection{Applications}\label{applications-2}}

\hypertarget{bandit-algorithm-optimizing-an-experiment}{%
\subsubsection{Bandit algorithm (optimizing an experiment)}\label{bandit-algorithm-optimizing-an-experiment}}

\hypertarget{causal-forest-estimating-heterogeneous-treatment-effect}{%
\subsubsection{Causal forest (estimating heterogeneous treatment effect)}\label{causal-forest-estimating-heterogeneous-treatment-effect}}

\hypertarget{unsupervised-learning}{%
\section{Unsupervised learning}\label{unsupervised-learning}}

\(x \rightarrow f \rightarrow y\) (not defined)

\hypertarget{dimension-reduction}{%
\subsection{Dimension reduction}\label{dimension-reduction}}

\begin{figure}
\centering
\includegraphics{https://i.stack.imgur.com/Q7HIP.gif}
\caption{Projecting 2D-data to a line (PCA). From vas3k.com}
\end{figure}

\hypertarget{correlation-analysis}{%
\subsubsection{Correlation analysis}\label{correlation-analysis}}

This dataset is a good problem for PCA as some features are highly correlated.

Again, think about what the dataset is about. The following data dictionary comes from \href{http://rstudio-pubs-static.s3.amazonaws.com/24341_184a58191486470cab97acdbbfe78ed5.html}{this site}.

\begin{itemize}
\tightlist
\item
  age - age in years
\item
  sex - sex (1 = male; 0 = female)
\item
  cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)
\item
  trestbps - resting blood pressure (in mm Hg on admission to the hospital)
\item
  chol - serum cholesterol in mg/dl
\item
  fbs - fasting blood sugar \textgreater{} 120 mg/dl (1 = true; 0 = false)
\item
  restecg - resting electrocardiographic results (0 = normal; 1 = having ST-T; 2 = hypertrophy)
\item
  thalach - maximum heart rate achieved
\item
  exang - exercise induced angina (1 = yes; 0 = no)
\item
  oldpeak - ST depression induced by exercise relative to rest
\item
  slope - the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
\item
  ca - number of major vessels (0-3) colored by fluoroscopy
\item
  thal - 3 = normal; 6 = fixed defect; 7 = reversible defect
\item
  num - the predicted attribute - diagnosis of heart disease (angiographic disease status) (Value 0 = \textless{} 50\% diameter narrowing; Value 1 = \textgreater{} 50\% diameter narrowing)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data\_original }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\OperatorTok{{-}}\NormalTok{target) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\NormalTok{corrr}\OperatorTok{::}\KeywordTok{correlate}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
\end{verbatim}

\begin{verbatim}
## # A tibble: 13 x 14
##    term         age     sex      cp trestbps     chol      fbs restecg  thalach
##    <chr>      <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
##  1 age      NA      -0.0984 -0.0687   0.279   0.214    0.121   -0.116  -0.399  
##  2 sex      -0.0984 NA      -0.0494  -0.0568 -0.198    0.0450  -0.0582 -0.0440 
##  3 cp       -0.0687 -0.0494 NA        0.0476 -0.0769   0.0944   0.0444  0.296  
##  4 trestbps  0.279  -0.0568  0.0476  NA       0.123    0.178   -0.114  -0.0467 
##  5 chol      0.214  -0.198  -0.0769   0.123  NA        0.0133  -0.151  -0.00994
##  6 fbs       0.121   0.0450  0.0944   0.178   0.0133  NA       -0.0842 -0.00857
##  7 restecg  -0.116  -0.0582  0.0444  -0.114  -0.151   -0.0842  NA       0.0441 
##  8 thalach  -0.399  -0.0440  0.296   -0.0467 -0.00994 -0.00857  0.0441 NA      
##  9 exang     0.0968  0.142  -0.394    0.0676  0.0670   0.0257  -0.0707 -0.379  
## 10 oldpeak   0.210   0.0961 -0.149    0.193   0.0540   0.00575 -0.0588 -0.344  
## 11 slope    -0.169  -0.0307  0.120   -0.121  -0.00404 -0.0599   0.0930  0.387  
## 12 ca        0.276   0.118  -0.181    0.101   0.0705   0.138   -0.0720 -0.213  
## 13 thal      0.0680  0.210  -0.162    0.0622  0.0988  -0.0320  -0.0120 -0.0964 
## # ... with 5 more variables: exang <dbl>, oldpeak <dbl>, slope <dbl>, ca <dbl>,
## #   thal <dbl>
\end{verbatim}

\hypertarget{descriptive-statistics}{%
\subsubsection{Descriptive statistics}\label{descriptive-statistics}}

Notice the scaling issues? PCA is not scale-invariant. So, we need to fix this problem.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{min\_max \textless{}{-}}\StringTok{ }\KeywordTok{list}\NormalTok{(}
  \DataTypeTok{min =} \OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{min}\NormalTok{(.x, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
  \DataTypeTok{max =} \OperatorTok{\textasciitilde{}}\StringTok{ }\KeywordTok{max}\NormalTok{(.x, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)}
\NormalTok{)}

\NormalTok{data\_original }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(}\OperatorTok{{-}}\NormalTok{target) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\KeywordTok{across}\NormalTok{(}\KeywordTok{where}\NormalTok{(is.numeric), min\_max))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 1 x 26
##   age_min age_max sex_min sex_max cp_min cp_max trestbps_min trestbps_max
##     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>  <dbl>        <dbl>        <dbl>
## 1      29      77       0       1      0      3           94          200
## # ... with 18 more variables: chol_min <dbl>, chol_max <dbl>, fbs_min <dbl>,
## #   fbs_max <dbl>, restecg_min <dbl>, restecg_max <dbl>, thalach_min <dbl>,
## #   thalach_max <dbl>, exang_min <dbl>, exang_max <dbl>, oldpeak_min <dbl>,
## #   oldpeak_max <dbl>, slope_min <dbl>, slope_max <dbl>, ca_min <dbl>,
## #   ca_max <dbl>, thal_min <dbl>, thal_max <dbl>
\end{verbatim}

\hypertarget{preprocessing}{%
\subsubsection{Preprocessing}\label{preprocessing}}

\texttt{recipe} is essential for preprocessing multiple features at once.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_recipe \textless{}{-}}\StringTok{ }\KeywordTok{recipe}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{., }\DataTypeTok{data =}\NormalTok{ data\_original) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Imputing NAs using mean}
\StringTok{  }\KeywordTok{step\_meanimpute}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{()) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Normalize some numeric variables}
\StringTok{  }\KeywordTok{step\_normalize}\NormalTok{(}\KeywordTok{c}\NormalTok{(}\StringTok{"age"}\NormalTok{, }\StringTok{"trestbps"}\NormalTok{, }\StringTok{"chol"}\NormalTok{, }\StringTok{"thalach"}\NormalTok{, }\StringTok{"oldpeak"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: `step_meanimpute()` was deprecated in recipes 0.1.16.
## Please use `step_impute_mean()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
\end{verbatim}

\hypertarget{pca-analysis}{%
\subsubsection{PCA analysis}\label{pca-analysis}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_res \textless{}{-}}\StringTok{ }\NormalTok{pca\_recipe }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{step\_pca}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{(),}
    \DataTypeTok{id =} \StringTok{"pca"}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# id argument identifies each PCA step}
\StringTok{  }\KeywordTok{prep}\NormalTok{()}

\NormalTok{pca\_res }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{id =} \StringTok{"pca"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # A tibble: 196 x 4
##    terms        value component id   
##    <chr>        <dbl> <chr>     <chr>
##  1 age      -0.00101  PC1       pca  
##  2 sex       0.216    PC1       pca  
##  3 cp        0.321    PC1       pca  
##  4 trestbps  0.00118  PC1       pca  
##  5 chol     -0.000292 PC1       pca  
##  6 fbs       0.0468   PC1       pca  
##  7 restecg   0.166    PC1       pca  
##  8 thalach   0.0137   PC1       pca  
##  9 exang     0.0962   PC1       pca  
## 10 oldpeak  -0.00863  PC1       pca  
## # ... with 186 more rows
\end{verbatim}

\hypertarget{screeplot}{%
\paragraph{Screeplot}\label{screeplot}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# To avoid conflicts}
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"filter"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Removing existing preference
## [conflicted] Will prefer dplyr::filter over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{conflict\_prefer}\NormalTok{(}\StringTok{"select"}\NormalTok{, }\StringTok{"dplyr"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [conflicted] Will prefer dplyr::select over any other package
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_recipe }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{step\_pca}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{(),}
    \DataTypeTok{id =} \StringTok{"pca"}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# id argument identifies each PCA step}
\StringTok{  }\KeywordTok{prep}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{id =} \StringTok{"pca"}\NormalTok{, }\DataTypeTok{type =} \StringTok{"variance"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(terms }\OperatorTok{==}\StringTok{ "percent variance"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ component, }\DataTypeTok{y =}\NormalTok{ value)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"PCAs of heart disease"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"\% of variance"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Scree plot"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-74-1.pdf}

\hypertarget{view-factor-loadings}{%
\paragraph{View factor loadings}\label{view-factor-loadings}}

Loadings are the covariances between the features and the principal components (=eigenvectors).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pca\_recipe }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{step\_pca}\NormalTok{(}\KeywordTok{all\_predictors}\NormalTok{(),}
    \DataTypeTok{id =} \StringTok{"pca"}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# id argument identifies each PCA step}
\StringTok{  }\KeywordTok{prep}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tidy}\NormalTok{(}\DataTypeTok{id =} \StringTok{"pca"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{filter}\NormalTok{(component }\OperatorTok{\%in\%}\StringTok{ }\KeywordTok{c}\NormalTok{(}\StringTok{"PC1"}\NormalTok{, }\StringTok{"PC2"}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(terms, value), }\DataTypeTok{y =}\NormalTok{ value,}
    \DataTypeTok{fill =}\NormalTok{ component}
\NormalTok{  )) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\DataTypeTok{position =} \StringTok{"dodge"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Terms"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Contribtutions"}\NormalTok{,}
    \DataTypeTok{fill =} \StringTok{"PCAs"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-75-1.pdf}

\textbf{The key lesson}

You can use these low-dimensional data to solve the curse of dimensionality problem. Compressing feature space via dimension reduction techniques is called feature extraction. PCA is one way of doing this.

\hypertarget{topic-modeling}{%
\subsection{Topic modeling}\label{topic-modeling}}

\hypertarget{setup-6}{%
\subsubsection{Setup}\label{setup-6}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{  tidytext, }\CommentTok{\# tidy text analysis}
\NormalTok{  glue, }\CommentTok{\# paste string and objects}
\NormalTok{  stm, }\CommentTok{\# structural topic modeling}
\NormalTok{  gutenbergr}
\NormalTok{) }\CommentTok{\# toy datasets}
\end{Highlighting}
\end{Shaded}

\hypertarget{dataset-1}{%
\subsubsection{Dataset}\label{dataset-1}}

The data munging process draws on \href{https://juliasilge.com/blog/sherlock-holmes-stm/}{Julia Silge's blog post}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock\_raw \textless{}{-}}\StringTok{ }\KeywordTok{gutenberg\_download}\NormalTok{(}\DecValTok{1661}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
\end{verbatim}

\begin{verbatim}
## Using mirror http://aleph.gutenberg.org
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock \textless{}{-}}\StringTok{ }\NormalTok{sherlock\_raw }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Mutate story using a conditional statement}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}
    \DataTypeTok{story =} \KeywordTok{ifelse}\NormalTok{(}\KeywordTok{str\_detect}\NormalTok{(text, }\StringTok{"ADVENTURE"}\NormalTok{), text, }\OtherTok{NA}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Fill in missing values with the previous value}
\StringTok{  }\NormalTok{tidyr}\OperatorTok{::}\KeywordTok{fill}\NormalTok{(story, }\DataTypeTok{.direction =} \StringTok{"down"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Filter}
\StringTok{  }\NormalTok{dplyr}\OperatorTok{::}\KeywordTok{filter}\NormalTok{(story }\OperatorTok{!=}\StringTok{ "THE ADVENTURES OF SHERLOCK HOLMES"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\CommentTok{\# Factor}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{story =} \KeywordTok{factor}\NormalTok{(story, }\DataTypeTok{levels =} \KeywordTok{unique}\NormalTok{(story)))}

\NormalTok{sherlock \textless{}{-}}\StringTok{ }\NormalTok{sherlock[, }\DecValTok{2}\OperatorTok{:}\DecValTok{3}\NormalTok{] }\CommentTok{\# no id}
\end{Highlighting}
\end{Shaded}

\hypertarget{key-ideas}{%
\subsubsection{Key ideas}\label{key-ideas}}

\begin{figure}
\centering
\includegraphics{https://paperswithcode.com/media/thumbnails/task/task-0000000179-fd3a1d11_fGQkZCJ.jpg}
\caption{Source: paperswithcode.com}
\end{figure}

\begin{itemize}
\item
  Main papers: See \href{https://proceedings.neurips.cc/paper/2001/file/296472c9542ad4d4788d543508116cbc-Paper.pdf}{Latent Dirichlet Allocation} by David M. Blei, Andrew Y. Ng and Michael I. Jordan (then all Berkeley) and this \href{http://www.cse.cuhk.edu.hk/irwin.king/_media/presentations/latent_dirichlet_allocation.pdf}{follow-up paper} with the same title.
\item
  Topics as \textbf{distributions} of words (\(\beta\) distribution)
\item
  Documents as \textbf{distributions} of topics (\(\alpha\) distribution)
\item
  What distributions?

  \begin{itemize}
  \item
    Probability
  \item
    Multinomial
  \end{itemize}
\item
  Words lie on a lower-dimensional space (dimension reduction akin to PCA)
\item
  Co-occurrence of words (clustering)
\item
  Bag of words (feature engineering)

  \begin{itemize}
  \tightlist
  \item
    Upside: easy and fast (also working quite well)
  \item
    Downside: ignores grammatical structures and rich interactions among words (Alternative: word embeddings. Please check out \href{http://text2vec.org/}{text2vec})
  \end{itemize}
\item
  Documents are exchangeable (sequencing won't matter).
\item
  Topics are independent (uncorrelated). If you don't think this assumption holds, use Correlated Topic Models by \href{https://arxiv.org/pdf/0708.3601.pdf\#:~:text=The\%20correlated\%20topic\%20model\%20(CTM)\%20is\%20a\%20hierarchical\%20model\%20of,are\%20document\%2D\%20specific\%20random\%20variables.}{Blei and Lafferty (2007)}.
\end{itemize}

\hypertarget{exploratory-data-analysis}{%
\subsubsection{Exploratory data analysis}\label{exploratory-data-analysis}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock\_n \textless{}{-}}\StringTok{ }\NormalTok{sherlock }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest\_tokens}\NormalTok{(}
    \DataTypeTok{output =}\NormalTok{ word,}
    \DataTypeTok{input =}\NormalTok{ text}
\NormalTok{  ) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{count}\NormalTok{(story, word, }\DataTypeTok{sort =} \OtherTok{TRUE}\NormalTok{)}

\NormalTok{sherlock\_total\_n \textless{}{-}}\StringTok{ }\NormalTok{sherlock\_n }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(story) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{total =} \KeywordTok{sum}\NormalTok{(n))}

\NormalTok{sherlock\_words \textless{}{-}}\StringTok{ }\NormalTok{sherlock\_n }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{left\_join}\NormalTok{(sherlock\_total\_n)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Joining, by = "story"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{sherlock\_words }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{freq =}\NormalTok{ n }\OperatorTok{/}\StringTok{ }\NormalTok{total) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(story) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}
    \DataTypeTok{x =} \KeywordTok{fct\_reorder}\NormalTok{(word, freq),}
    \DataTypeTok{y =}\NormalTok{ freq,}
    \DataTypeTok{fill =}\NormalTok{ story}
\NormalTok{  )) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{story,}
    \DataTypeTok{ncol =} \DecValTok{2}\NormalTok{,}
    \DataTypeTok{scales =} \StringTok{"free\_y"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{""}\NormalTok{,}
    \DataTypeTok{fill =} \StringTok{"Story"}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{theme}\NormalTok{(}\DataTypeTok{legend.position =} \StringTok{"bottom"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Selecting by freq
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : conversion failure on 'IX. THE ADVENTURE OF THE ENGINEER’S THUMB' in
## 'mbcsToSbcs': dot substituted for <99>
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-78-1.pdf}

\hypertarget{stm}{%
\subsubsection{STM}\label{stm}}

\href{https://www.structuraltopicmodel.com/}{Structural Topic Modeling} by Roberts, Stewart, and Tingley helps estimate how topic proportions vary by covariates. If you don't use covariates, this approach is close to CTM. Another useful (and very recent) topic modeling package is Keyword Assisted Topic Models (\href{https://keyatm.github.io/keyATM/}{keyATM}) by Eshima, Imai, and Sasaki.

Also, note that we didn't cover other important techniques in topic modeling, such as dynamic and hierarchical topic modeling.

\includegraphics{https://warin.ca/shiny/stm/images/fig02.png}

\hypertarget{turn-text-into-document-term-matrix}{%
\paragraph{Turn text into document-term matrix}\label{turn-text-into-document-term-matrix}}

The \texttt{stm} package has its own preprocessing function.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{dtm \textless{}{-}}\StringTok{ }\KeywordTok{textProcessor}\NormalTok{(}
  \DataTypeTok{documents =}\NormalTok{ sherlock}\OperatorTok{$}\NormalTok{text,}
  \DataTypeTok{metadata =}\NormalTok{ sherlock,}
  \DataTypeTok{removestopwords =} \OtherTok{TRUE}\NormalTok{,}
  \DataTypeTok{verbose =} \OtherTok{FALSE}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{tuning-k}{%
\paragraph{Tuning K}\label{tuning-k}}

\begin{itemize}
\tightlist
\item
  K is the number of topics.
\item
  Let's try K = 5, 10, 15.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_res \textless{}{-}}\StringTok{ }\KeywordTok{searchK}\NormalTok{(}
\NormalTok{  dtm}\OperatorTok{$}\NormalTok{documents,}
\NormalTok{  dtm}\OperatorTok{$}\NormalTok{vocab,}
  \DataTypeTok{K =} \KeywordTok{c}\NormalTok{(}\DecValTok{5}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{15}\NormalTok{),}
  \DataTypeTok{prevalence =} \OperatorTok{\textasciitilde{}}\NormalTok{story,}
  \DataTypeTok{data =}\NormalTok{ dtm}\OperatorTok{$}\NormalTok{meta}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      .....
##   Recovering initialization...
##      ..............................................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -7.627) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -7.512, relative change = 1.510e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -7.419, relative change = 1.228e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -7.381, relative change = 5.151e-03) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -7.365, relative change = 2.165e-03) 
## Topic 1: littl, man, see, hand, shall 
##  Topic 2: upon, holm, think, come, take 
##  Topic 3: said, will, just, know, word 
##  Topic 4: one, may, came, tell, ask 
##  Topic 5: time, sherlock, case, saw, face 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -7.358, relative change = 9.504e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -7.355, relative change = 4.015e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -7.354, relative change = 1.580e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged 
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      ..........
##   Recovering initialization...
##      ..............................................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -7.699) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -7.499, relative change = 2.594e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -7.373, relative change = 1.684e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -7.287, relative change = 1.172e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -7.257, relative change = 4.115e-03) 
## Topic 1: miss, littl, came, man, good 
##  Topic 2: said, might, sudden, hous, went 
##  Topic 3: upon, just, never, right, two 
##  Topic 4: upon, will, one, see, may 
##  Topic 5: sherlock, name, think, laugh, holm 
##  Topic 6: see, hard, night, cri, forward 
##  Topic 7: littl, stone, becam, whole, sure 
##  Topic 8: can, know, matter, now, say 
##  Topic 9: man, hand, knew, one, even 
##  Topic 10: holm, ask, sat, “pray, long 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -7.248, relative change = 1.256e-03) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -7.247, relative change = 9.258e-05) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged 
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      ...............
##   Recovering initialization...
##      ..............................................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -7.749) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -7.417, relative change = 4.283e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -7.297, relative change = 1.624e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -7.242, relative change = 7.558e-03) 
## ....................................................................................................
## Completed E-Step (1 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -7.222, relative change = 2.745e-03) 
## Topic 1: think, holm, turn, now, “ye 
##  Topic 2: might, dress, hous, place, near 
##  Topic 3: know, without, now, “’s, money 
##  Topic 4: open, may, look, much, one 
##  Topic 5: hand, well, see, way, littl 
##  Topic 6: question, salesman, told, companion, close 
##  Topic 7: littl, told, feel, remark, quit 
##  Topic 8: can, matter, “oh, say, away 
##  Topic 9: will, shall, must, come, littl 
##  Topic 10: one, man, light, time, two 
##  Topic 11: upon, holm, miss, man, sherlock 
##  Topic 12: room, came, ask, just, hous 
##  Topic 13: may, tell, sir, find, help 
##  Topic 14: said, holm, believ, laugh, will 
##  Topic 15: littl, now, noth, day, saw 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -7.212, relative change = 1.382e-03) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -7.207, relative change = 5.993e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -7.203, relative change = 5.851e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -7.202, relative change = 9.837e-05) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged
\end{verbatim}

\hypertarget{evaludating-models}{%
\paragraph{Evaluating models}\label{evaludating-models}}

Several metrics assess topic models' performance: the held-out likelihood, residuals, semantic coherence, and exclusivity. Here we examine the relationship between semantic coherence and exclusivity to understand the trade-off involved in selecting K.

\begin{itemize}
\item
  Semantic coherence: high-probability words for a topic tend to co-occur in documents.
\item
  Exclusivity: keywords of one topic are not likely to appear as keywords in other topics.
\end{itemize}

\begin{quote}
In Roberts et al.~2014 we proposed using the Mimno et al.~2011 semantic coherence metric for helping with topic model selection. However, we found that semantic coherence alone is relatively easy to achieve by having only a couple of topics that dominate the most common words. Thus we also proposed an exclusivity measure.
\end{quote}

\begin{quote}
Our exclusivity measure includes some information on word frequency as well. It is based on the FREX labeling metric (calcfrex) with the weight set to .7 in favor of exclusivity by default.
\end{quote}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{test\_res}\OperatorTok{$}\NormalTok{results }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{unnest}\NormalTok{(}\KeywordTok{c}\NormalTok{(K, exclus, semcoh)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(K, exclus, semcoh) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{K =} \KeywordTok{as.factor}\NormalTok{(K)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ exclus, }\DataTypeTok{y =}\NormalTok{ semcoh)) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_point}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_text}\NormalTok{(}
    \DataTypeTok{label =} \KeywordTok{glue}\NormalTok{(}\StringTok{"K = \{test\_res$results$K\}"}\NormalTok{),}
    \DataTypeTok{size =} \DecValTok{5}\NormalTok{,}
    \DataTypeTok{color =} \StringTok{"red"}\NormalTok{,}
    \DataTypeTok{position =} \KeywordTok{position\_jitter}\NormalTok{(}\DataTypeTok{width =} \FloatTok{0.05}\NormalTok{, }\DataTypeTok{height =} \FloatTok{0.05}\NormalTok{)}
\NormalTok{  ) }\OperatorTok{+}
\StringTok{  }\KeywordTok{labs}\NormalTok{(}
    \DataTypeTok{x =} \StringTok{"Exclusivity"}\NormalTok{,}
    \DataTypeTok{y =} \StringTok{"Semantic coherence"}\NormalTok{,}
    \DataTypeTok{title =} \StringTok{"Exclusivity and semantic coherence"}
\NormalTok{  )}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-81-1.pdf}

\hypertarget{finalize}{%
\paragraph{Finalize}\label{finalize}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{final\_stm \textless{}{-}}\StringTok{ }\KeywordTok{stm}\NormalTok{(dtm}\OperatorTok{$}\NormalTok{documents,}
\NormalTok{  dtm}\OperatorTok{$}\NormalTok{vocab,}
  \DataTypeTok{K =} \DecValTok{10}\NormalTok{, }\DataTypeTok{prevalence =} \OperatorTok{\textasciitilde{}}\NormalTok{story,}
  \DataTypeTok{max.em.its =} \DecValTok{75}\NormalTok{,}
  \DataTypeTok{data =}\NormalTok{ dtm}\OperatorTok{$}\NormalTok{meta,}
  \DataTypeTok{init.type =} \StringTok{"Spectral"}\NormalTok{,}
  \DataTypeTok{seed =} \DecValTok{1234567}\NormalTok{,}
  \DataTypeTok{verbose =} \OtherTok{FALSE}
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{explore-the-results}{%
\paragraph{Explore the results}\label{explore-the-results}}

\begin{itemize}
\tightlist
\item
  Using the \texttt{stm} package.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{plot}\NormalTok{(final\_stm)}
\end{Highlighting}
\end{Shaded}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-83-1.pdf}

\begin{itemize}
\tightlist
\item
  Using ggplot2
\end{itemize}

In LDA, \(\alpha\) represents document-topic density and \(\beta\) represents topic-word density.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# tidy}
\NormalTok{tidy\_stm \textless{}{-}}\StringTok{ }\KeywordTok{tidy}\NormalTok{(final\_stm)}

\CommentTok{\# top terms}
\NormalTok{tidy\_stm }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(topic) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{top\_n}\NormalTok{(}\DecValTok{10}\NormalTok{, beta) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ungroup}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\KeywordTok{fct\_reorder}\NormalTok{(term, beta), beta, }\DataTypeTok{fill =} \KeywordTok{as.factor}\NormalTok{(topic))) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.8}\NormalTok{, }\DataTypeTok{show.legend =} \OtherTok{FALSE}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{facet\_wrap}\NormalTok{(}\OperatorTok{\textasciitilde{}}\NormalTok{topic, }\DataTypeTok{scales =} \StringTok{"free\_y"}\NormalTok{) }\OperatorTok{+}
\StringTok{  }\KeywordTok{coord\_flip}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_y\_continuous}\NormalTok{(}\DataTypeTok{labels =}\NormalTok{ scales}\OperatorTok{::}\NormalTok{percent) }\OperatorTok{+}
\StringTok{  }\KeywordTok{scale\_fill\_viridis\_d}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <e2>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <80>
\end{verbatim}

\begin{verbatim}
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'sir”' in 'mbcsToSbcs': dot substituted for <9d>
\end{verbatim}

\includegraphics{07_high_dimensional_data_files/figure-latex/unnamed-chunk-84-1.pdf}

\hypertarget{references-4}{%
\section{References}\label{references-4}}

\hypertarget{books}{%
\subsection{Books}\label{books}}

\begin{itemize}
\item
  \emph{An Introduction to Statistical Learning - with Applications in R (2013)} by Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani. Springer: New York. \href{https://www.amazon.com/Introduction-Statistical-Learning-Applications-Statistics/dp/1461471370}{Amazon} or \href{http://www-bcf.usc.edu/~gareth/ISL/}{free PDF}.
\item
  \emph{Hands-On Machine Learning with R (2020)} by Bradley Boehmke \& Brandon Greenwell. \href{https://www.routledge.com/Hands-On-Machine-Learning-with-R/Boehmke-Greenwell/p/book/9781138495685}{CRC Press} or \href{https://www.amazon.com/gp/product/1138495689?pf_rd_p=ab873d20-a0ca-439b-ac45-cd78f07a84d8\&pf_rd_r=JBRX0ZJ1WFSR9T3JPTQE}{Amazon}
\item
  \emph{Applied Predictive Modeling (2013)} by Max Kuhn and Kjell Johnson. Springer: New York. \href{https://www.amazon.com/Applied-Predictive-Modeling-Max-Kuhn/dp/1461468485?SubscriptionId=0ENGV10E9K9QDNSJ5C82\&tag=apm0a-20\&linkCode=xm2\&camp=2025\&creative=165953\&creativeASIN=1461468485}{Amazon}
\item
  \emph{Feature Engineering and Selection: A Practical Approach for Predictive Models (2019)} by Kjell Johnson and Max Kuhn. Taylor \& Francis. \href{http://www.feat.engineering/}{Amazon} or \href{http://www.feat.engineering/}{free HTML}.
\item
  \emph{\href{https://www.tmwr.org/}{Tidy Modeling with R} (2020)} by Max Kuhn and Julia Silge (work-in-progress)
\end{itemize}

\hypertarget{lecture-slides}{%
\subsection{Lecture slides}\label{lecture-slides}}

\begin{itemize}
\item
  \href{https://www.nber.org/econometrics_minicourse_2015/nber_slides11.pdf}{An introduction to supervised and unsupervised learning (2015)} by Susan Athey and Guido Imbens
\item
  \href{https://education.rstudio.com/blog/2020/02/conf20-intro-ml/}{Introduction to Machine Learning with the Tidyverse} by Alison Hill
\end{itemize}

\hypertarget{blog-posts}{%
\subsection{Blog posts}\label{blog-posts}}

\begin{itemize}
\tightlist
\item
  \href{http://www.rebeccabarter.com/blog/2019-06-06_pre_processing/}{``Using the recipes package for easy pre-processing''} by Rebecca Barter
\end{itemize}

\hypertarget{big_data}{%
\chapter{Big data}\label{big_data}}

\hypertarget{the-big-picture-9}{%
\section{The Big Picture}\label{the-big-picture-9}}

\begin{itemize}
\tightlist
\item
  Big data problem: data is too big to fit into memory (=local environment).
\item
  R reads data into random-access memory (RAM) at once, and this object lives in memory entirely. So, if object.size \textgreater{} memory.size, the process will crash R.
\item
  Therefore, the key to dealing with big data in R is reducing the size of data you want to bring into it.
\end{itemize}

\textbf{Techniques to deal with big data}

\begin{itemize}
\tightlist
\item
  Medium-sized file (1-2 GB)

  \begin{itemize}
  \tightlist
  \item
    Try to reduce the size of the file using slicing and dicing
  \item
    Tools:

    \begin{itemize}
    \tightlist
    \item
      R: \texttt{data.table::fread(file\ path,\ select\ =\ c("column\ 1",\ "column\ 2"))}. This command imports data faster than \texttt{read.csv()} does.
    \item
      Command-line: \href{https://csvkit.readthedocs.io/en/latest/}{\texttt{csvkit}} - a suite of command-line tools for converting to and working with CSV
    \end{itemize}
  \end{itemize}
\item
  Large file (\textgreater{} 2-10 GB)

  \begin{itemize}
  \tightlist
  \item
    Put the data into a database and \textbf{ACCESS} it
  \item
    Explore the data and pull the objects of interest
  \end{itemize}
\end{itemize}

\textbf{Databases}

\begin{itemize}
\tightlist
\item
  Types of databases

  \begin{itemize}
  \tightlist
  \item
    Relational database = a \textbf{collection} of \textbf{tables} (fixed columns and rows): SQL is a staple tool to define, \textbf{query} (the focus of the workshop today), control, and manipulate this type of database
  \item
    Non-relational database = a collection of documents (MongoDB), key-values (Redis and DynamoDB), wide-column stores (Cassandra and HBase), or graphs (Neo4j and JanusGraph). Note that this type of database does not preclude SQL. NoSQL stands for \href{https://www.mongodb.com/nosql-explained}{``not only SQL.''}
  \end{itemize}
\end{itemize}

\textbf{Relational database example}

\begin{figure}
\centering
\includegraphics{https://sp.mysqltutorial.org/wp-content/uploads/2009/12/MySQL-Sample-Database-Schema.png}
\caption{Relational Database. Source: MySQL Tutorial}
\end{figure}

\hypertarget{sql}{%
\section{SQL}\label{sql}}

\begin{itemize}
\item
  Structured Query Language. Originally called SEQUEL, it was developed by IBM Corporation in the 1970s.
\item
  Remains the standard language for a relational database management system.
\item
  It's a DECLARATIVE language (\href{https://www.sqlite.org/queryplanner.html}{what to do \textgreater{} how to do})

  \begin{itemize}
  \tightlist
  \item
    Database management systems figure out an optimal way to execute a query (query optimization)
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT} \KeywordTok{COLUMN} \KeywordTok{FROM} \KeywordTok{TABLE} 
\end{Highlighting}
\end{Shaded}

\hypertarget{learning-objectives}{%
\subsection{Learning objectives}\label{learning-objectives}}

\begin{itemize}
\item
  Embracing a new mindset: shifting from ownership (opening CSVs stored in your laptop) to access (accessing data stored in a database)
\item
  Learning how to use R and SQL to access and query a database
\end{itemize}

\hypertarget{sql-and-r}{%
\subsection{SQL and R}\label{sql-and-r}}

\begin{itemize}
\tightlist
\item
  SQL and R
\end{itemize}

\begin{longtable}[]{@{}ll@{}}
\toprule
\begin{minipage}[b]{0.14\columnwidth}\raggedright
SQL\strut
\end{minipage} & \begin{minipage}[b]{0.80\columnwidth}\raggedright
R\strut
\end{minipage}\tabularnewline
\midrule
\endhead
\begin{minipage}[t]{0.14\columnwidth}\raggedright
SELECT\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
select() for columns, mutate() for expressions, summarise() for aggregates\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
FROM\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
which data frame\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
WHERE\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
filter()\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
GROUP BY\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
group\_by()\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
HAVING\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
filter() \textbf{after group\_by()}\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
ORDER BY\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
arrange()\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.14\columnwidth}\raggedright
LIMIT\strut
\end{minipage} & \begin{minipage}[t]{0.80\columnwidth}\raggedright
head()\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

\textbf{Challenge 1}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Can you tell me the difference in the order in which the following \texttt{R} and \texttt{SQL} codes were written to manipulate data? For instance, in R, what command comes first? In contrast, in SQL, what command comes first?
\end{enumerate}

\begin{itemize}
\tightlist
\item
  R example
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\NormalTok{data }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Data }
\StringTok{  }\KeywordTok{select}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Column}
\StringTok{  }\KeywordTok{filter}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Row }
\StringTok{  }\KeywordTok{group\_by}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Group by }
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{n =} \KeywordTok{n}\NormalTok{()) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# n() is one of the aggregate functions in r; it\textquotesingle{}s count() used inside summarise() function }
\StringTok{  }\KeywordTok{filter}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# Row }
\StringTok{  }\KeywordTok{arrange}\NormalTok{() }\CommentTok{\# Arrange }
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  SQL example (in a SQL chunk, use \texttt{-\/-} instead of \texttt{\#} to comment)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\KeywordTok{SELECT} \KeywordTok{column}\NormalTok{, aggregation (}\FunctionTok{count}\NormalTok{()) }\CommentTok{{-}{-} Column}

\KeywordTok{FROM} \KeywordTok{data} \CommentTok{{-}{-} Data}

\KeywordTok{WHERE}\NormalTok{ condition }\CommentTok{{-}{-} Filter rows }

\KeywordTok{GROUP} \KeywordTok{BY} \KeywordTok{column} \CommentTok{{-}{-} Group by}

\KeywordTok{HAVING}\NormalTok{ condition }\CommentTok{{-}{-} Filter rows after group by  }

\KeywordTok{ORDER} \KeywordTok{BY} \KeywordTok{column} \CommentTok{{-}{-} Arrange }
\end{Highlighting}
\end{Shaded}

\begin{figure}
\centering
\includegraphics{https://wizardzines.com/zines/sql/samples/from.png}
\caption{SQL Zine by \href{https://jvns.ca/}{Julia Evans}}
\end{figure}

\hypertarget{setup-7}{%
\subsection{Setup}\label{setup-7}}

Let's get to work.

\hypertarget{packages-1}{%
\subsection{Packages}\label{packages-1}}

\begin{itemize}
\tightlist
\item
  \texttt{pacman::p\_load()} reduces steps for installing and loading several packages simultaneously.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# pacman }
\ControlFlowTok{if}\NormalTok{ (}\OperatorTok{!}\KeywordTok{require}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)) }\KeywordTok{install.packages}\NormalTok{(}\StringTok{"pacman"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Loading required package: pacman
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The rest of pkgs }
\NormalTok{pacman}\OperatorTok{::}\KeywordTok{p\_load}\NormalTok{(}
\NormalTok{ tidyverse, }\CommentTok{\# tidyverse packages }
\NormalTok{ DBI, }\CommentTok{\# using SQL queries}
\NormalTok{ RSQLite, }\CommentTok{\# SQLite}
\NormalTok{ dbplyr, }\CommentTok{\# use database with dplyr }
\NormalTok{ glue, }\CommentTok{\# glue to automate workflow }
\NormalTok{ nycflights13 }\CommentTok{\# toy data }
\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\hypertarget{nyc-flights-data}{%
\subsection{NYC flights data}\label{nyc-flights-data}}

\begin{itemize}
\tightlist
\item
  \href{https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236}{The flight on-time performance data} from the Bureau of Transportation Statistics of the U.S. government. The data goes back to 1987, and its size is more than 20 gigabytes. For practice, we only use a small subset of the original data (flight data departing NYC in 2013) provided by RStudio.
\end{itemize}

\begin{figure}
\centering
\includegraphics{https://d33wubrfki0l68.cloudfront.net/245292d1ea724f6c3fd8a92063dcd7bfb9758d02/5751b/diagrams/relational-nycflights.png}
\caption{From RStudio.}
\end{figure}

\hypertarget{workflow-4}{%
\subsection{Workflow}\label{workflow-4}}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Create/connect to a database
\end{enumerate}

\begin{itemize}
\item
  Note that the server also can be your laptop (called \href{https://en.wikipedia.org/wiki/Localhost\#:~:text=In\%20computer\%20networking\%2C\%20localhost\%20is,via\%20the\%20loopback\%20network\%20interface.}{localhost}).
\item
  Short answer: To do so, you need interfaces between R and a database. We use \href{https://github.com/r-dbi/RSQLite}{\texttt{RSQLite}} in this tutorial because it's easy to set up.
\item
  Long answer: The \texttt{DBI} package in R provides a client-side interface that allows \texttt{dplyr} to work with databases. DBI is automatically installed when you install \texttt{dbplyr}. However, you need to install a specific backend engine (a tool for communication between R and a database management system) for the database (e.g., \texttt{RMariaDB}, \texttt{RPostgres}, \texttt{RSQLite}). In this workshop, we use SQLite because it is the easiest to get started with. I love PostgreSQL because it's open-source and powerful enough to do \href{https://www.postgresql.org/docs/current/functions.html}{many amazing things} (e.g., text mining, geospatial analysis). If you want to build a data warehouse or an analytical platform, consider using Spark (Hadoop).
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Copy a table to the database
\end{enumerate}

\begin{itemize}
\item
  Option 1: You can create a table and insert rows manually. You also need to define the data schema (the database structure) to do that.
\item
  Table

  \begin{itemize}
  \tightlist
  \item
    Collection of rows
  \item
    Collection of columns (fields or attributes)
  \item
    Each col has a type:

    \begin{itemize}
    \tightlist
    \item
      String: \texttt{VARCHAR(20)}
    \item
      Integer: \texttt{INTEGER}
    \item
      Floating-point: \texttt{FLOAT}, \texttt{DOUBLE}
    \item
      Date/time: \texttt{DATE}, \texttt{TIME}, \texttt{DATETIME}
    \end{itemize}
  \item
    \textbf{Schema}: the structure of the database

    \begin{itemize}
    \tightlist
    \item
      The table name
    \item
      The names and types of its columns
    \item
      Various optional additional information

      \begin{itemize}
      \tightlist
      \item
        \href{https://www.w3schools.com/sql/sql_constraints.asp}{Constraints}

        \begin{itemize}
        \tightlist
        \item
          Syntax: \texttt{column\ datatype\ constraint}
        \item
          Examples: \texttt{NOT\ NULL}, \texttt{UNIQUE}, \texttt{INDEX}
        \end{itemize}
      \end{itemize}
    \end{itemize}
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\CommentTok{{-}{-} Create table }

\KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ students (}
    \KeywordTok{id} \DataTypeTok{INT}\NormalTok{ AUTO\_INCREMENT,}
\NormalTok{    name }\DataTypeTok{VARCHAR}\NormalTok{(}\DecValTok{30}\NormalTok{),}
\NormalTok{    birth }\DataTypeTok{DATE}\NormalTok{,}
\NormalTok{    gpa }\DataTypeTok{FLOAT}\NormalTok{,}
\NormalTok{    grad }\DataTypeTok{INT}\NormalTok{,}
    \KeywordTok{PRIMARY} \KeywordTok{KEY}\NormalTok{(}\KeywordTok{id}\NormalTok{));}

\CommentTok{{-}{-} Insert one additional row }

\KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ students(name, birth, gpa, grad)}
      \KeywordTok{VALUES}\NormalTok{ (}\StringTok{\textquotesingle{}Adam\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}2000{-}08{-}04\textquotesingle{}}\NormalTok{, }\FloatTok{4.0}\NormalTok{, }\DecValTok{2020}\NormalTok{);}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Option 2: Copy a file (object) to a table in a database using \texttt{copy\_to()}. We take this option as it's fast, and we would like to focus on querying in this workshop.
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
  Query the table
\end{enumerate}

\begin{itemize}
\tightlist
\item
  Main focus
\end{itemize}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{3}
\item
  Pull the results of interest (\textbf{data}) using \texttt{collect()}
\item
  Disconnect the database
\end{enumerate}

\hypertarget{create-a-database}{%
\subsubsection{Create a database}\label{create-a-database}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Define a backend engine }

\NormalTok{drv \textless{}{-}}\StringTok{ }\NormalTok{RSQLite}\OperatorTok{::}\KeywordTok{SQLite}\NormalTok{()}

\CommentTok{\# Create an empty in{-}memory database }
\NormalTok{con \textless{}{-}}\StringTok{ }\NormalTok{DBI}\OperatorTok{::}\KeywordTok{dbConnect}\NormalTok{(drv, }
                      \DataTypeTok{dbname =} \StringTok{":memory:"}\NormalTok{)}

\CommentTok{\# Connect to an existing database }
\CommentTok{\#con \textless{}{-} DBI::dbConnect(RMariaDB::MariaDB(), }
 \CommentTok{\# host = "database.rstudio.com",}
 \CommentTok{\# user = "hadley",}
 \CommentTok{\# password = rstudioapi::askForPassword("Database password")}
\CommentTok{\#)}

\KeywordTok{dbListTables}\NormalTok{(con)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## character(0)
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# character(0) = NULL}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  Note that \texttt{con} is empty at this stage.
\end{itemize}

\hypertarget{copy-an-object-as-a-table-to-the-database-push}{%
\subsubsection{Copy an object as a table to the database (push)}\label{copy-an-object-as-a-table-to-the-database-push}}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Copy objects to the database }
\CommentTok{\# copy\_to() comes from dplyr}
\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con, }
        \DataTypeTok{df =}\NormalTok{ flights)}

\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con, }
        \DataTypeTok{df =}\NormalTok{ airports)}

\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con,}
        \DataTypeTok{df =}\NormalTok{ planes)}

\KeywordTok{copy\_to}\NormalTok{(}\DataTypeTok{dest =}\NormalTok{ con, }
        \DataTypeTok{df =}\NormalTok{ weather)}

\CommentTok{\# If needed, you can also specify which columns to index when copying:}

\CommentTok{\# copy\_to(dest = con, }
\CommentTok{\#          df = flights, }
\CommentTok{\#          name = "flights",}
\CommentTok{\#          indexes = list(c("year", "tailnum", "dest")))}
\end{Highlighting}
\end{Shaded}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Show the tables in the database }

\KeywordTok{dbListTables}\NormalTok{(con)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## [1] "airports"     "flights"      "planes"       "sqlite_stat1" "sqlite_stat4"
## [6] "weather"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Show the columns/attributes/fields of a table }

\KeywordTok{dbListFields}\NormalTok{(con, }\StringTok{"flights"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbListFields}\NormalTok{(con, }\StringTok{"weather"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##  [1] "origin"     "year"       "month"      "day"        "hour"      
##  [6] "temp"       "dewp"       "humid"      "wind_dir"   "wind_speed"
## [11] "wind_gust"  "precip"     "pressure"   "visib"      "time_hour"
\end{verbatim}

\hypertarget{quick-demonstrations}{%
\subsubsection{Quick demonstrations:}\label{quick-demonstrations}}

\begin{itemize}
\item
  SELECT desired columns
\item
  FROM tables
\item
  Select all columns (*) from \texttt{flights} table and show the \texttt{first\ ten\ rows}
\item
  Note that you can combine SQL and R commands thanks to \texttt{dbplyr}.
\item
  Option 1
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{DBI}\OperatorTok{::}\KeywordTok{dbGetQuery}\NormalTok{(con, }
                \StringTok{"SELECT * FROM flights;"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\CommentTok{\# SQL}
\StringTok{  }\KeywordTok{head}\NormalTok{(}\DecValTok{10}\NormalTok{) }\CommentTok{\# dplyr }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1  2013     1   1      517            515         2      830            819
## 2  2013     1   1      533            529         4      850            830
## 3  2013     1   1      542            540         2      923            850
## 4  2013     1   1      544            545        -1     1004           1022
## 5  2013     1   1      554            600        -6      812            837
## 6  2013     1   1      554            558        -4      740            728
## 7  2013     1   1      555            600        -5      913            854
## 8  2013     1   1      557            600        -3      709            723
## 9  2013     1   1      557            600        -3      838            846
## 10 2013     1   1      558            600        -2      753            745
##    arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1         11      UA   1545  N14228    EWR  IAH      227     1400    5     15
## 2         20      UA   1714  N24211    LGA  IAH      227     1416    5     29
## 3         33      AA   1141  N619AA    JFK  MIA      160     1089    5     40
## 4        -18      B6    725  N804JB    JFK  BQN      183     1576    5     45
## 5        -25      DL    461  N668DN    LGA  ATL      116      762    6      0
## 6         12      UA   1696  N39463    EWR  ORD      150      719    5     58
## 7         19      B6    507  N516JB    EWR  FLL      158     1065    6      0
## 8        -14      EV   5708  N829AS    LGA  IAD       53      229    6      0
## 9         -8      B6     79  N593JB    JFK  MCO      140      944    6      0
## 10         8      AA    301  N3ALAA    LGA  ORD      138      733    6      0
##     time_hour
## 1  1357034400
## 2  1357034400
## 3  1357034400
## 4  1357034400
## 5  1357038000
## 6  1357034400
## 7  1357038000
## 8  1357038000
## 9  1357038000
## 10 1357038000
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Option 2 (works faster)
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]

\KeywordTok{SELECT} \OperatorTok{*} 
\KeywordTok{FROM}\NormalTok{ flights }
\KeywordTok{LIMIT} \DecValTok{10}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Option 3 (automating workflow)

  \begin{itemize}
  \tightlist
  \item
    When local variables are updated, the SQL query is also automatically updated. This approach is called a \href{https://www.php.net/manual/en/pdo.prepared-statements.php}{parameterized query} (or a prepared statement).
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\# PREPARATION \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#}

\CommentTok{\# Local variables }
\NormalTok{tbl \textless{}{-}}\StringTok{ "flights"}
\NormalTok{var \textless{}{-}}\StringTok{ "dep\_delay"}
\NormalTok{num \textless{}{-}}\StringTok{ }\DecValTok{10}

\CommentTok{\# Glue SQL query string }
\CommentTok{\# Note that to indicate a numeric value, you don\textquotesingle{}t need backticks.}

\NormalTok{sql\_query \textless{}{-}}\StringTok{ }\KeywordTok{glue\_sql}\NormalTok{(}\StringTok{"}
\StringTok{  SELECT \{\textasciigrave{}var\textasciigrave{}\}}
\StringTok{  FROM \{\textasciigrave{}tbl\textasciigrave{}\}}
\StringTok{  LIMIT \{num\} }
\StringTok{  "}\NormalTok{, }\DataTypeTok{.con =}\NormalTok{ con)}

\CommentTok{\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\# EXECUTION \#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#}

\CommentTok{\# Run the query }
\KeywordTok{dbGetQuery}\NormalTok{(con, sql\_query)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##    dep_delay
## 1          2
## 2          4
## 3          2
## 4         -1
## 5         -6
## 6         -4
## 7         -5
## 8         -3
## 9         -3
## 10        -2
\end{verbatim}

\textbf{Challenge 2}
Can you rewrite the above code using \texttt{LIMIT} instead of \texttt{head(10)}?

\begin{itemize}
\item
  You may notice that using only SQL code makes querying faster.
\item
  Select \texttt{dep\_delay} and \texttt{arr\_delay} from flights table, show the first ten rows, then turn the result into a tibble.
\end{itemize}

\textbf{Challenge 3}
Could you remind me how to see the list of attributes of a table? Let's say you want to see the \texttt{flights} table attributes. How can you do it?

\begin{itemize}
\tightlist
\item
  Collect the selected columns and filtered rows
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\KeywordTok{dbGetQuery}\NormalTok{(con, }
  \StringTok{"SELECT dep\_delay, arr\_delay FROM flights;"}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{head}\NormalTok{(}\DecValTok{10}\NormalTok{) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\item
  Counting rows

  \begin{itemize}
  \tightlist
  \item
    Count all (*)
  \end{itemize}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbGetQuery}\NormalTok{(con, }
          \StringTok{"SELECT COUNT(*) }
\StringTok{           FROM flights;"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   COUNT(*)
## 1   336776
\end{verbatim}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbGetQuery}\NormalTok{(con, }
           \StringTok{"SELECT COUNT(dep\_delay)}
\StringTok{           FROM flights;"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   COUNT(dep_delay)
## 1           328521
\end{verbatim}

\begin{itemize}
\tightlist
\item
  Count distinct values
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{dbGetQuery}\NormalTok{(con, }
           \StringTok{"SELECT COUNT(DISTINCT dep\_delay)}
\StringTok{           FROM flights;"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
##   COUNT(DISTINCT dep_delay)
## 1                       527
\end{verbatim}

\hypertarget{tidy-way-dplyr---sql}{%
\subsubsection{Tidy-way: dplyr -\textgreater{} SQL}\label{tidy-way-dplyr---sql}}

Thanks to the \texttt{dbplyr} package, you can use the \texttt{dplyr} syntax to query SQL.

\begin{itemize}
\tightlist
\item
  Note that the pipe (\texttt{\%\textgreater{}\%}) works.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# tbl select tables}
\NormalTok{flights \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"flights"}\NormalTok{)}
\NormalTok{airports \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"airports"}\NormalTok{)}
\NormalTok{planes \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"planes"}\NormalTok{)}
\NormalTok{weather \textless{}{-}}\StringTok{ }\NormalTok{con }\OperatorTok{\%\textgreater{}\%}\StringTok{ }\KeywordTok{tbl}\NormalTok{(}\StringTok{"weather"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  \texttt{select} = \texttt{SELECT}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"delay"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # Source:   lazy query [?? x 2]
## # Database: sqlite 3.37.0 [:memory:]
##    dep_delay arr_delay
##        <dbl>     <dbl>
##  1         2        11
##  2         4        20
##  3         2        33
##  4        -1       -18
##  5        -6       -25
##  6        -4        12
##  7        -5        19
##  8        -3       -14
##  9        -3        -8
## 10        -2         8
## # ... with more rows
\end{verbatim}

\textbf{Challenge 4}
Your turn: write the same code in SQL. Don't forget to add the \texttt{connection} argument to your SQL code chunk.

\begin{itemize}
\tightlist
\item
  \texttt{mutate} = \texttt{SELECT} \texttt{AS}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{select}\NormalTok{(distance, air\_time) }\OperatorTok{\%\textgreater{}\%}\StringTok{  }
\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{speed =}\NormalTok{ distance }\OperatorTok{/}\StringTok{ }\NormalTok{(air\_time }\OperatorTok{/}\StringTok{ }\DecValTok{60}\NormalTok{)) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # Source:   lazy query [?? x 3]
## # Database: sqlite 3.37.0 [:memory:]
##    distance air_time speed
##       <dbl>    <dbl> <dbl>
##  1     1400      227  370.
##  2     1416      227  374.
##  3     1089      160  408.
##  4     1576      183  517.
##  5      762      116  394.
##  6      719      150  288.
##  7     1065      158  404.
##  8      229       53  259.
##  9      944      140  405.
## 10      733      138  319.
## # ... with more rows
\end{verbatim}

\textbf{Challenge 5}
Your turn: write the same code in SQL.
Hint: \texttt{mutate(new\_var\ =\ var1\ *\ var2)} (R) = \texttt{SELECT\ var1\ *\ var2\ AS\ new\_var} (SQL)

\begin{itemize}
\tightlist
\item
  \texttt{filter} = \texttt{WHERE}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{filter}\NormalTok{(month }\OperatorTok{==}\StringTok{ }\DecValTok{1}\NormalTok{, day }\OperatorTok{==}\StringTok{ }\DecValTok{1}\NormalTok{) }\CommentTok{\# filter(month ==1 \& day == 1) Both work in the same way.}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## # Source:   lazy query [?? x 19]
## # Database: sqlite 3.37.0 [:memory:]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dbl>
\end{verbatim}

\textbf{Challenge 6}
Your turn: write the same code in SQL (hint: \texttt{filter(condition1,\ condition2)} = \texttt{WHERE\ condition1\ and\ condition2})

\textbf{Additional tips}

Note that R and SQL operators are not exactly alike. R uses \texttt{!=} for \texttt{Not\ equal\ to}. SQL uses \texttt{\textless{}\textgreater{}} or \texttt{!=}. Furthermore, there are some cautions about using \texttt{NULL} (NA; unknown or missing): it should be \texttt{IS\ NULL} or \texttt{IS\ NOT\ NULL} not \texttt{=NULL} or \texttt{!=NULL} (this makes sense because NULL represents an absence of a value).

Another pro-tip is \href{https://www.w3schools.com/sql/sql_like.asp}{\texttt{LIKE} operator}, used in a \texttt{WHERE} statement to find values based on string patterns.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{(origin) }\CommentTok{{-}{-} Distinct values from origin column}
\KeywordTok{FROM}\NormalTok{ flights}
\KeywordTok{WHERE}\NormalTok{ origin }\KeywordTok{LIKE} \StringTok{\textquotesingle{}J\%\textquotesingle{}}\NormalTok{; }\CommentTok{{-}{-} Find any origin values that start with "J"}
\end{Highlighting}
\end{Shaded}

\begin{table}

\caption{\label{tab:unnamed-chunk-16}1 records}
\centering
\begin{tabular}[t]{l}
\hline
origin\\
\hline
JFK\\
\hline
\end{tabular}
\end{table}

\texttt{\%} is one of the wildcards you can use for string matching. \texttt{\%} matches any number of characters. So, \texttt{J\%} matches Jae, JFK, Joseph, etc. \texttt{\_} is another useful wildcard that matches exactly one character. So \texttt{J\_} matches only JA, JE, etc. If wildcards are not enough, then you should consider using regular expressions.

\begin{itemize}
\tightlist
\item
  \texttt{arrange} = \texttt{ORDER\ BY}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{arrange}\NormalTok{(carrier, }\KeywordTok{desc}\NormalTok{(arr\_delay)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{show\_query}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## <SQL>
## SELECT *
## FROM `flights`
## ORDER BY `carrier`, `arr_delay` DESC
\end{verbatim}

\textbf{Challenge 7}
Your turn: write the same code in SQL.
Hint: \texttt{arrange(var1,\ desc(var2))} (R) = \texttt{ORDER\ BY\ var1,\ var2\ DESC} (SQL)

\begin{itemize}
\tightlist
\item
  \texttt{summarise} = \texttt{SELECT} \texttt{AS} and \texttt{group\_by} = \texttt{GROUP\ BY}
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(month, day) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{delay =} \KeywordTok{mean}\NormalTok{(dep\_delay)) }
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
\end{verbatim}

\begin{verbatim}
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
\end{verbatim}

\begin{verbatim}
## # Source:   lazy query [?? x 3]
## # Database: sqlite 3.37.0 [:memory:]
## # Groups:   month
##    month   day delay
##    <int> <int> <dbl>
##  1     1     1 11.5 
##  2     1     2 13.9 
##  3     1     3 11.0 
##  4     1     4  8.95
##  5     1     5  5.73
##  6     1     6  7.15
##  7     1     7  5.42
##  8     1     8  2.55
##  9     1     9  2.28
## 10     1    10  2.84
## # ... with more rows
\end{verbatim}

\textbf{Challenge 8}
Your turn: write the same code in SQL (hint: in SQL the order should be \texttt{SELECT\ group\_var1,\ group\_var2,\ AVG(old\_var)\ AS\ new\_var} -\textgreater{} \texttt{FROM} -\textgreater{} \texttt{GROUP\ BY})

\begin{itemize}
\tightlist
\item
  If you find this too challenging, here's some help.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(month, day) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{summarise}\NormalTok{(}\DataTypeTok{delay =} \KeywordTok{mean}\NormalTok{(dep\_delay)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{show\_query}\NormalTok{() }\CommentTok{\# Show the SQL equivalent!}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
\end{verbatim}

\begin{verbatim}
## <SQL>
## SELECT `month`, `day`, AVG(`dep_delay`) AS `delay`
## FROM `flights`
## GROUP BY `month`, `day`
\end{verbatim}

\begin{itemize}
\item
  Joins
\item
  Using joins is more straightforward in R than it is in SQL.
\item
  However, more flexible joins exist in SQL, and they are not available in R.

  \begin{itemize}
  \tightlist
  \item
    Joins involving 3+ tables are not supported.
  \item
    Some advanced joins available in SQL are not supported.
  \item
    For more information, check out \href{https://github.com/ianmcook/tidyquery/issues}{\texttt{tidyquery}} to see the latest developments.
  \end{itemize}
\item
  SQL command
\end{itemize}

\texttt{FROM\ one\ table\ LEFT\ JOIN\ another\ table\ ON\ condition\ =\ condition} (\texttt{ON} in SQL = \texttt{BY} in R)

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT} \OperatorTok{*}
\KeywordTok{FROM}\NormalTok{ flights }\KeywordTok{AS}\NormalTok{ f}
\KeywordTok{LEFT} \KeywordTok{JOIN}\NormalTok{ weather }\KeywordTok{AS}\NormalTok{ w }
\KeywordTok{ON}\NormalTok{ f.}\DataTypeTok{year} \OperatorTok{=}\NormalTok{ w.}\DataTypeTok{year} \KeywordTok{AND}\NormalTok{ f.}\DataTypeTok{month} \OperatorTok{=}\NormalTok{ w.}\DataTypeTok{month}
\end{Highlighting}
\end{Shaded}

\begin{table}

\caption{\label{tab:unnamed-chunk-20}Displaying records 1 - 10}
\centering
\begin{tabular}[t]{r|r|r|r|r|r|r|r|r|l|r|l|l|l|r|r|r|r|r|l|r|r|r|r|r|r|r|r|r|r|r|r|r|r}
\hline
year & month & day & dep\_time & sched\_dep\_time & dep\_delay & arr\_time & sched\_arr\_time & arr\_delay & carrier & flight & tailnum & origin & dest & air\_time & distance & hour & minute & time\_hour & origin & year & month & day & hour & temp & dewp & humid & wind\_dir & wind\_speed & wind\_gust & precip & pressure & visib & time\_hour\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 1 & 39.02 & 26.06 & 59.37 & 270 & 10.35702 & NA & 0 & 1012.0 & 10 & 1357020000\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 2 & 39.02 & 26.96 & 61.63 & 250 & 8.05546 & NA & 0 & 1012.3 & 10 & 1357023600\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 3 & 39.02 & 28.04 & 64.43 & 240 & 11.50780 & NA & 0 & 1012.5 & 10 & 1357027200\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 4 & 39.92 & 28.04 & 62.21 & 250 & 12.65858 & NA & 0 & 1012.2 & 10 & 1357030800\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 5 & 39.02 & 28.04 & 64.43 & 260 & 12.65858 & NA & 0 & 1011.9 & 10 & 1357034400\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 6 & 37.94 & 28.04 & 67.21 & 240 & 11.50780 & NA & 0 & 1012.4 & 10 & 1357038000\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 7 & 39.02 & 28.04 & 64.43 & 240 & 14.96014 & NA & 0 & 1012.2 & 10 & 1357041600\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 8 & 39.92 & 28.04 & 62.21 & 250 & 10.35702 & NA & 0 & 1012.2 & 10 & 1357045200\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 9 & 39.92 & 28.04 & 62.21 & 260 & 14.96014 & NA & 0 & 1012.7 & 10 & 1357048800\\
\hline
2013 & 1 & 1 & 517 & 515 & 2 & 830 & 819 & 11 & UA & 1545 & N14228 & EWR & IAH & 227 & 1400 & 5 & 15 & 1357034400 & EWR & 2013 & 1 & 1 & 10 & 41.00 & 28.04 & 59.65 & 260 & 13.80936 & NA & 0 & 1012.4 & 10 & 1357052400\\
\hline
\end{tabular}
\end{table}

Can anyone explain why the SQL query written with \texttt{dplyr} and then translated by \texttt{show\_query()} looks more complex than the one above? (\href{https://stackoverflow.com/questions/36808295/how-to-remove-duplicate-columns-from-join-in-sql}{Hint})

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}\StringTok{ }
\StringTok{  }\KeywordTok{left\_join}\NormalTok{(weather, }\DataTypeTok{by =} \KeywordTok{c}\NormalTok{(}\StringTok{"year"}\NormalTok{, }\StringTok{"month"}\NormalTok{)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{show\_query}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
## <SQL>
## SELECT `LHS`.`year` AS `year`, `LHS`.`month` AS `month`, `LHS`.`day` AS `day.x`, `dep_time`, `sched_dep_time`, `dep_delay`, `arr_time`, `sched_arr_time`, `arr_delay`, `carrier`, `flight`, `tailnum`, `LHS`.`origin` AS `origin.x`, `dest`, `air_time`, `distance`, `LHS`.`hour` AS `hour.x`, `minute`, `LHS`.`time_hour` AS `time_hour.x`, `RHS`.`origin` AS `origin.y`, `RHS`.`day` AS `day.y`, `RHS`.`hour` AS `hour.y`, `temp`, `dewp`, `humid`, `wind_dir`, `wind_speed`, `wind_gust`, `precip`, `pressure`, `visib`, `RHS`.`time_hour` AS `time_hour.y`
## FROM `flights` AS `LHS`
## LEFT JOIN `weather` AS `RHS`
## ON (`LHS`.`year` = `RHS`.`year` AND `LHS`.`month` = `RHS`.`month`)
\end{verbatim}

\hypertarget{collect-pull}{%
\subsubsection{Collect (pull)}\label{collect-pull}}

\begin{itemize}
\item
  \texttt{collect()} is used to pull the data. Depending on the data size, it may take a long time to run.
\item
  The following code won't work.
\end{itemize}

\begin{quote}
Error in UseMethod(``collect'') : no applicable method for `collect' applied to an object of class ``c(`LayerInstance', `Layer', `ggproto', `gg')''
\end{quote}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{origin\_flights\_plot \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(origin) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tally}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{ggplot}\NormalTok{() }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ origin, }\DataTypeTok{y =}\NormalTok{ n)) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect}\NormalTok{()}
\end{Highlighting}
\end{Shaded}

\begin{itemize}
\tightlist
\item
  This works.
\end{itemize}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df \textless{}{-}}\StringTok{ }\NormalTok{flights }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{group\_by}\NormalTok{(origin) }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{tally}\NormalTok{() }\OperatorTok{\%\textgreater{}\%}
\StringTok{  }\KeywordTok{collect}\NormalTok{()}

\NormalTok{origin\_flights\_plot \textless{}{-}}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(df) }\OperatorTok{+}
\StringTok{  }\KeywordTok{geom\_col}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ origin, }\DataTypeTok{y =}\NormalTok{ n))}

\NormalTok{origin\_flights\_plot}
\end{Highlighting}
\end{Shaded}

\includegraphics{08_big_data_files/figure-latex/unnamed-chunk-23-1.pdf}

\hypertarget{disconnect}{%
\subsubsection{Disconnect}\label{disconnect}}

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{DBI}\OperatorTok{::}\KeywordTok{dbDisconnect}\NormalTok{(con)}
\end{Highlighting}
\end{Shaded}

\hypertarget{things-we-didnt-cover}{%
\subsection{Things we didn't cover}\label{things-we-didnt-cover}}

\hypertarget{subquery}{%
\subsubsection{Subquery}\label{subquery}}

Subquery = a query nested inside a query

This hypothetical example is inspired by a \href{https://www.dofactory.com/sql/subquery}{dofactory blog post}.

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT}\NormalTok{ names  }\CommentTok{{-}{-} Outer query }
\KeywordTok{FROM}\NormalTok{ consultants}
\KeywordTok{WHERE} \KeywordTok{Id} \KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ ConsultingId}
                \KeywordTok{FROM}\NormalTok{ consulting\_cases }
                \KeywordTok{WHERE} \KeywordTok{category} \OperatorTok{=} \StringTok{\textquotesingle{}r\textquotesingle{}} \KeywordTok{OR} \KeywordTok{category} \OperatorTok{=} \StringTok{\textquotesingle{}sql\textquotesingle{}}\NormalTok{); }\CommentTok{{-}{-} Subquery }
\end{Highlighting}
\end{Shaded}

\hypertarget{common-table-expression-with-clauses}{%
\subsubsection{Common table expression (WITH clauses)}\label{common-table-expression-with-clauses}}

This is just a hypothetical example inspired by \href{https://jamesrledoux.com/code/sql-cte-common-table-expressions}{James LeDoux's blog post}.

\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{{-}{-} cases about R and SQL from dlab{-}database }
\KeywordTok{WITH}\NormalTok{ r\_sql\_consulting\_cases }\KeywordTok{AS}\NormalTok{ ( }\CommentTok{{-}{-} The name of the CTE expression }
  \CommentTok{{-}{-} The CTE query }
  \KeywordTok{SELECT}
    \KeywordTok{id} 
  \KeywordTok{FROM} 
\NormalTok{    dlab }
  \KeywordTok{WHERE}
\NormalTok{    tags }\KeywordTok{LIKE} \StringTok{\textquotesingle{}\%sql\%\textquotesingle{}}
  \KeywordTok{AND}
\NormalTok{    tags }\KeywordTok{LIKE} \StringTok{\textquotesingle{}\%r\%\textquotesingle{}}
\NormalTok{)}
\CommentTok{{-}{-} count the number of open cases about this consulting category }
\CommentTok{{-}{-} The outer query }
\KeywordTok{SELECT}\NormalTok{ status, }\FunctionTok{COUNT}\NormalTok{(status) }\KeywordTok{AS}\NormalTok{ open\_status\_count}
\KeywordTok{FROM}\NormalTok{ dlab }\KeywordTok{as}\NormalTok{ d }
\KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ r\_sql\_consulting\_cases }\KeywordTok{as}\NormalTok{ r}
  \KeywordTok{ON}\NormalTok{ d.}\KeywordTok{id} \OperatorTok{=}\NormalTok{ r.}\KeywordTok{id} 
\KeywordTok{WHERE}\NormalTok{ status }\OperatorTok{=} \StringTok{\textquotesingle{}open\textquotesingle{}}\NormalTok{; }
\end{Highlighting}
\end{Shaded}

\hypertarget{references-5}{%
\subsection{References}\label{references-5}}

\begin{itemize}
\tightlist
\item
  \href{https://github.com/csv2db/csv2db}{csv2db} - for loading large CSV files into a database
\item
  RStudio, \href{https://db.rstudio.com/}{Database using R}
\item
  Ian Cook, \href{https://github.com/ianmcook/rstudioconf2020/blob/master/bridging_the_gap_between_sql_and_r.pdf}{``Bridging the Gap between SQL and R''} rstudio::conf 2020 slides

  \begin{itemize}
  \tightlist
  \item
    \href{https://www.youtube.com/watch?v=JwP5KdWSgqE\&ab_channel=RStudio}{Video recording}
  \end{itemize}
\item
  Data Carpentry contributors, \href{https://datacarpentry.org/R-ecology-lesson/05-r-and-databases.html}{SQL database and R}, Data Carpentry, September 10, 2019.
\item
  \href{https://cran.r-project.org/web/packages/dbplyr/vignettes/dbplyr.html}{Introduction to dbplyr}
\item
  Josh Errickson, \href{http://dept.stat.lsa.umich.edu/~jerrick/courses/stat701/notes/sql.html}{SQL in R}, STAT 701, University of Michigan
\item
  \href{https://wizardzines.com/zines/sql/}{SQL zine} by Julia Evans
\item
  \href{http://harelba.github.io/q/}{q} - a command-line tool that allows direct execution of SQL-like queries on CSVs/TSVs (and any other tabular text files)
\end{itemize}

  \bibliography{book.bib,packages.bib}

\end{document}