-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtry_to_parquet.R
83 lines (46 loc) · 1.68 KB
/
try_to_parquet.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
library(arrow)
library(data.table)
#' Convert a TRY text export to a set of parquet files
#'
#' @author Brian Maitner
#' @description Quick and dirty function to convert TRY's .txt files to a more
#'   useful parquet format. The tab-separated input is read in batches with
#'   `fread()` and every batch is written with `write_parquet()` (gzip
#'   compression) to
#'   `<output_directory>/<basename(file)>.<offset>.gz.parquet`, where
#'   `<offset>` is the number of data rows written before that batch.
#' @param file A file to be converted
#' @param output_directory Where you want all the files deposited. It is
#'   created (recursively) when it does not exist yet.
#' @param batch_size The number of lines to load in at once
#' @return `NULL`, invisibly. The function is called for its side effect of
#'   writing parquet files; read or write failures are raised as errors.
try_to_parquet <- function(file,
                           output_directory = "manual_downloads/TRY/TRY_parquet/",
                           batch_size = 80000) {
  stopifnot(
    is.character(file), length(file) == 1L,
    is.numeric(batch_size), length(batch_size) == 1L, batch_size >= 1
  )

  if (!dir.exists(output_directory)) {
    dir.create(output_directory, recursive = TRUE, showWarnings = FALSE)
  }

  # Setup variables
  i <- 0 # data rows converted so far (the header line is not counted)
  col_names <- NULL

  # Iterate through batches
  repeat {
    # One extra row is requested as a look-ahead: getting it back proves that
    # another batch exists, so the next read never skips past the end of the
    # file (which fread treats as an error).
    if (i == 0) {
      data <- fread(
        file = file,
        nrows = batch_size + 1,
        skip = 0,
        sep = "\t",
        header = TRUE
      )
      col_names <- colnames(data)
    } else {
      # Line 1 of the file is the header, so the i rows already converted
      # occupy the first i + 1 lines. Skipping only i lines would read the
      # last row of the first batch a second time.
      data <- fread(
        file = file,
        nrows = batch_size + 1,
        skip = i + 1,
        sep = "\t",
        header = FALSE
      )
      setnames(data, col_names)
    }
    # NOTE(review): fread infers column types per call, so batches may end up
    # with different parquet column types (e.g. an all-NA column) -- confirm
    # whether downstream readers need a unified schema.

    has_more <- nrow(data) > batch_size
    if (has_more) {
      data <- data[seq_len(batch_size)] # drop the look-ahead row
    }

    out_path <- file.path(
      output_directory,
      paste0(basename(file), ".", as.integer(i), ".gz.parquet")
    )
    # Surface write failures instead of swallowing them: continuing after a
    # failed write would silently produce an incomplete conversion.
    tryCatch(
      write_parquet(x = data, sink = out_path, compression = "gzip"),
      error = function(e) {
        stop(
          "Failed to write ", out_path, ": ", conditionMessage(e),
          call. = FALSE
        )
      }
    )

    i <- i + nrow(data)
    rm(data)

    # check whether you're done
    if (!has_more) {
      message(paste("Finished converting TRY file", file, "to parquet"))
      return(invisible(NULL))
    }
    print(i) # progress: number of data rows converted so far
  } # batch loop
} # end fx