% reading-data.tex --- forked from rdpeng/CourseraLectures
\documentclass[aspectratio=169]{beamer}
\mode<presentation>
{
\usetheme{Warsaw}
% or ...
\setbeamercovered{transparent}
% or whatever (possibly just delete it)
}
\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage{graphicx}
%\usepackage{times}
%\usepackage[T1]{fontenc}
% Or whatever. Note that the encoding and the font should match. If T1
% does not look nice, try deleting the line with the fontenc.
\usepackage{amsmath,amsfonts,amssymb}
\input{macros}
\title[The R Language]{Introduction to the R Language}
\subtitle{Reading and Writing Data}
\date{Computing for Data Analysis}
\setbeamertemplate{footline}[page number]
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Reading Data}
There are a few principal functions for reading data into R.
\begin{itemize}
\item
\code{read.table}, \code{read.csv}, for reading tabular data
\item
\code{readLines}, for reading lines of a text file
\item
\code{source}, for reading in R code files (inverse of \code{dump})
\item
\code{dget}, for reading in R code files (inverse of \code{dput})
\item
\code{load}, for reading in saved workspaces
\item
\code{unserialize}, for reading single R objects in binary form
\end{itemize}
\end{frame}
\begin{frame}{Writing Data}
There are analogous functions for writing data to files
\begin{itemize}
\item
\code{write.table}
\item
\code{writeLines}
\item
\code{dump}
\item
\code{dput}
\item
\code{save}
\item
\code{serialize}
\end{itemize}
\end{frame}
\begin{frame}{Reading Data Files with read.table}
The \code{read.table} function is one of the most commonly used
functions for reading data. It has a few important arguments:
\begin{itemize}
\item
\code{file}, the name of a file, or a connection
\item
\code{header}, logical indicating if the file has a header line
\item
\code{sep}, a string indicating how the columns are separated
\item
\code{colClasses}, a character vector indicating the class of each
column in the dataset
\item
\code{nrows}, the number of rows in the dataset
\item
\code{comment.char}, a character string indicating the comment
character
\item
\code{skip}, the number of lines to skip from the beginning
\item
\code{stringsAsFactors}, should character variables be coded as
factors?
\end{itemize}
\end{frame}
\begin{frame}[fragile]{read.table}
For small to moderately sized datasets, you can usually call
\code{read.table} without specifying any other arguments
\begin{verbatim}
data <- read.table("foo.txt")
\end{verbatim}
R will automatically
\begin{itemize}
\item
skip lines that begin with a \#
\item
figure out how many rows there are (and how much memory needs to be
allocated)
\item
figure out what type of variable is in each column of the table
\end{itemize}
Telling R all these things directly makes R run faster and more
efficiently.
\begin{itemize}
\item
\code{read.csv} works like \code{read.table} except that its defaults
differ: the separator is a comma and \code{header = TRUE}.
\end{itemize}
\end{frame}
\begin{frame}{Reading in Larger Datasets with read.table}
With much larger datasets, doing the following things will make your
life easier and will prevent R from choking.
\begin{itemize}
\item
Read the help page for \code{read.table}, which contains many hints
\item
Make a rough calculation of the memory required to store your dataset.
If the dataset is larger than the amount of RAM on your computer, you
can probably stop right here.
\item
Set \code{comment.char = ""} if there are no commented lines in your
file.
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Reading in Larger Datasets with read.table}
\begin{itemize}
\item
Use the \code{colClasses} argument. Specifying this option instead of
using the default can make \code{read.table} run MUCH faster, often twice
as fast. In order to use this option, you have to know the class of
each column in your data frame. If all of the columns are ``numeric'',
for example, then you can just set \code{colClasses = "numeric"}. A
quick and dirty way to figure out the classes of each column is the
following:
\begin{verbatim}
initial <- read.table("datatable.txt", nrows = 100)
classes <- sapply(initial, class)
tabAll <- read.table("datatable.txt",
colClasses = classes)
\end{verbatim}
\item
Set \code{nrows}. This doesn't make R run faster but it helps with
memory usage. A mild overestimate is okay. You can use the Unix tool
\code{wc} to calculate the number of lines in a file.
\end{itemize}
\end{frame}
\begin{frame}{Know Thy System}
In general, when using R with larger datasets, it's useful to know a
few things about your system.
\begin{itemize}
\item
How much memory is available?
\item
What other applications are in use?
\item
Are there other users logged into the same system?
\item
What operating system?
\item
Is the OS 32 or 64 bit?
\end{itemize}
\end{frame}
\begin{frame}{Calculating Memory Requirements}
I have a data frame with 1,500,000 rows and 120 columns, all of which
are numeric data. Roughly, how much memory is required to store this
data frame?
\begin{align*}
1{,}500{,}000\times 120 \times 8\text{ bytes/numeric}
& = 1{,}440{,}000{,}000\text{ bytes}\\
& = 1{,}440{,}000{,}000 / 2^{20}\text{ bytes/MB}\\
& = 1{,}373.29\text{ MB}\\
& = 1.34\text{ GB}
\end{align*}
\end{frame}
\begin{frame}{Textual Formats}
\begin{itemize}
\item
\code{dump}ing and \code{dput}ing are useful because the resulting
textual format is editable, and in the case of corruption,
potentially recoverable.
\item
Unlike writing out a table or csv file, \code{dump} and \code{dput}
preserve the \textit{metadata} (sacrificing some readability), so that
another user doesn't have to specify it all over again.
\item
Textual formats can work much better with version control programs
like Subversion or Git, which can only track changes meaningfully in
text files.
\item Textual formats can be longer-lived; if there is corruption
somewhere in the file, it can be easier to fix the problem
\item
Textual formats adhere to the ``Unix philosophy''
\item Downside: The format is not very space-efficient
\end{itemize}
\end{frame}
\begin{frame}[fragile]{dput-ting R Objects}
Another way to pass data around is by deparsing the R object with
\code{dput} and reading it back in using \code{dget}.
\begin{verbatim}
> y <- data.frame(a = 1, b = "a")
> dput(y)
structure(list(a = 1,
b = structure(1L, .Label = "a",
class = "factor")),
.Names = c("a", "b"), row.names = c(NA, -1L),
class = "data.frame")
> dput(y, file = "y.R")
> new.y <- dget("y.R")
> new.y
a b
1 1 a
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Dumping R Objects}
Multiple objects can be deparsed using the \code{dump} function and
read back in using \code{source}.
\begin{verbatim}
> x <- "foo"
> y <- data.frame(a = 1, b = "a")
> dump(c("x", "y"), file = "data.R")
> rm(x, y)
> source("data.R")
> y
a b
1 1 a
> x
[1] "foo"
\end{verbatim}
\end{frame}
\begin{frame}{Interfaces to the Outside World}
Data are read in using \textit{connection} interfaces. Connections
can be made to files (most common) or to other more exotic things.
\begin{itemize}
\item
\code{file}, opens a connection to a file
\item
\code{gzfile}, opens a connection to a file compressed with gzip
\item
\code{bzfile}, opens a connection to a file compressed with bzip2
\item
\code{url}, opens a connection to a webpage
\end{itemize}
\end{frame}
\begin{frame}[fragile]{File Connections}
\begin{verbatim}
> str(file)
function (description = "", open = "", blocking = TRUE,
encoding = getOption("encoding"))
\end{verbatim}
\begin{itemize}
\item
\code{description} is the name of the file
\item
\code{open} is a code indicating
\begin{itemize}
\item
``r'' read only
\item
``w'' writing (and initializing a new file)
\item
``a'' appending
\item
``rb'', ``wb'', ``ab'' reading, writing, or appending in binary mode
(Windows)
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Connections}
In general, connections are powerful tools that let you navigate files
or other external objects. In practice, we often don't need to deal
with the connection interface directly.
\begin{verbatim}
con <- file("foo.txt", "r")
data <- read.csv(con)
close(con)
\end{verbatim}
is the same as
\begin{verbatim}
data <- read.csv("foo.txt")
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Reading Lines of a Text File}
The \code{readLines} function can be used to simply read lines of a
text file and store them in a character vector.
\begin{verbatim}
> con <- gzfile("words.gz")
> x <- readLines(con, 10)
> x
[1] "1080" "10-point" "10th" "11-point"
[5] "12-point" "16-point" "18-point" "1st"
[9] "2" "20-point"
\end{verbatim}
\code{writeLines} takes a character vector and writes each element one
line at a time to a text file.
\end{frame}
\begin{frame}[fragile]{Reading Lines of a Text File}
\code{readLines} can be useful for reading in lines of webpages
\begin{verbatim}
## This might take time
con <- url("http://www.jhsph.edu", "r")
x <- readLines(con)
> head(x)
[1] "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">"
[2] ""
[3] "<html>"
[4] "<head>"
[5] "\t<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" />"
[6] "\t"
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Saving Data in Non-tabular Forms}
For temporary storage or for transport, it is more efficient to save
data in (compressed) binary form using \code{save} or
\code{save.image}.
\begin{verbatim}
x <- 1
y <- data.frame(a = 1, b = "a")
save(x, y, file = "data.RData")
load("data.RData") ## overwrites existing x and y!
\end{verbatim}
Binary formats are not great for long-term storage because if they are
corrupted, recovery is usually not possible.
\end{frame}
\begin{frame}{Serialization}
Serialization is the process of taking an R object and converting it into
a representation as a ``series'' of bytes.
\begin{itemize}
\item
The \code{save} and \code{save.image} functions serialize R objects
and then save them to files
\item
The \code{serialize} function can be used to serialize an R object to
an arbitrary connection (database, socket, pipe, etc.)
\item
\code{unserialize} reads from an arbitrary connection and inverts a
serialization, returning an R object
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Serialization}
\begin{verbatim}
> x <- list(1, 2, 3)
> serialize(x, NULL)
[1] 58 0a 00 00 00 02 00 02 06 01 00 02 03 00 00
[16] 00 00 13 00 00 00 03 00 00 00 0e 00 00 00 01
[31] 3f f0 00 00 00 00 00 00 00 00 00 0e 00 00 00
[46] 01 40 00 00 00 00 00 00 00 00 00 00 0e 00 00
[61] 00 01 40 08 00 00 00 00 00 00
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Serialization}
\begin{verbatim}
> con <- gzfile("foo.gz", "wb")
> serialize(x, con)
NULL
> close(con)
>
> con <- gzfile("foo.gz", "rb")
> y <- unserialize(con)
> identical(x, y)
[1] TRUE
\end{verbatim}
\end{frame}
\begin{frame}{Data Output Summary}
\begin{itemize}
\item
\code{write.table}, \code{write.csv} --- readable output, textual,
little metadata
\item
\code{save}, \code{save.image}, \code{serialize} --- exact
representation, efficient storage if compressed, not recoverable if
corrupted
\item
\code{dput}, \code{dump} --- textual format, somewhat readable,
metadata retained, not usable for more exotic objects (environments)
\end{itemize}
\end{frame}
\end{document}