-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathR_funs.R
175 lines (120 loc) · 6.79 KB
/
R_funs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
pacman::p_load(dplyr,
magrittr,
rvest,
doParallel,
stringr,
lubridate,
data.table,
ggplot2,
scales,
ggthemes,
reshape2
)
#The database was built from data offered by [**bitcoincharts**](https://bitcoincharts.com). First, files in [**csv.gz**](http://api.bitcoincharts.com/v1/csv/) format, which contain quotes from active USD Bitcoin markets, were downloaded. The procedure assumes that the csv files are updated frequently, so if the files have changed, re-running the script will download the files with a new upload time-stamp. Each csv file contains all transactions from a particular Bitcoin market over a specific time frame. Each file has 3 attributes - the time-stamp, price and volume of every transaction. These files can be merged and/or aggregated by time-stamp. Aggregating can be helpful because of the large size of the decompressed files, but it comes at the expense of losing information. Data can be aggregated by minute, hour or day. Aggregation with a 1 hour interval was implemented. Then statistics like the mean, max, min, first and/or last price were evaluated.
##Data
#[**blockexplorer**](https://blockexplorer.com/b/100000) is an API which allows downloading information from individual blocks. It could be used to gather all Bitcoin transactions by web scraping subsequent blocks. However, it is much faster and easier to use already-downloaded data, which is offered for free on the Internet.
#[**bitcoincharts**](https://bitcoincharts.com/markets/currency/USD.html) summarizes prices on the most important markets.
#On the same website we can find a database of csv files with historic prices on different markets. Some markets are updated constantly, others with a delay of a few months.
#Csv files with dates
#' List the csv.gz files published by bitcoincharts with their upload dates
#'
#' Scrapes the directory listing at http://api.bitcoincharts.com/v1/csv/.
#' Entry names are the texts of the `<a>` links; time-stamps are extracted
#' from the plain-text nodes inside the listing's `<pre>` element.
#'
#' @return A tibble with two character columns: `csv_dates` (text of the form
#'   "dd-Mon-yyyy hh:mm", `NA` where a text node holds no such date) and
#'   `csv_markets` (link text of each entry).
csvs <- function() {
  markets_raw <- read_html("http://api.bitcoincharts.com/v1/csv/")

  csv_markets <- markets_raw %>%
    html_nodes("a") %>%
    html_text()

  # The month is matched with [a-zA-Z]+. The former class [a-zA-z] also
  # accepted the ASCII punctuation that sits between "Z" and "a".
  csv_dates <- markets_raw %>%
    html_nodes(xpath = "/html/body/pre/text()") %>%
    html_text() %>%
    str_extract("\\d\\d-[a-zA-Z]+-\\d\\d\\d\\d \\d\\d:\\d\\d")

  # NOTE(review): assumes the listing yields exactly one text node per link so
  # that both vectors have the same length (tibble() errors otherwise) - confirm.
  tibble(csv_dates, csv_markets)
}
###Active Markets List
#' Scrape the active-market list from bitcoincharts, one row per currency
#'
#' Reads https://bitcoincharts.com/markets/list/ and pairs every `<h3>`
#' heading of the market column with the text of the `<ul>` list found under
#' the same container. Each list text is split on "(" and ") " into pieces;
#' odd-positioned pieces go to `nam1`, even-positioned pieces to `nam2`.
#'
#' @return A row-wise tibble (the result of `rowwise()` is returned as is)
#'   with columns `a_currency`, `a_markets` and the list columns `nam1`,
#'   `nam2`.
active_markets <- function() {
  # Download the page once and reuse it, so that headings and lists are
  # guaranteed to come from the same snapshot (and only one request is made).
  page <- read_html("https://bitcoincharts.com/markets/list/")

  # Text of every market list: newlines removed, whitespace runs collapsed.
  a_markets <- page %>%
    html_nodes("body > div.container_16.content.padding > div > div.grid_7.alpha > ul") %>%
    html_text() %>%
    str_replace_all("\\n", "") %>%
    str_replace_all("\\s+", " ") %>%
    str_trim()

  a_currency <- page %>%
    html_nodes("body > div.container_16.content.padding > div > div.grid_7.alpha > h3") %>%
    html_text()

  # Split a list text into trimmed pieces and drop a trailing ")".
  clean_nams <- function(x) {
    gsub(")$", "", trimws(unlist(str_split(x, "\\(|\\)\\s"))))
  }

  # Every `by`-th element of `x` starting at `pos`; an empty selection when
  # `x` is shorter than `pos` (seq() would otherwise fail on the wrong sign).
  by_pos <- function(x, by, pos) {
    if (length(x) < pos) {
      return(x[0])
    }
    x[seq(pos, length(x), by)]
  }

  # NOTE(review): assumes the page has as many <h3> headings as <ul> lists.
  DF <- tibble(a_currency, a_markets) %>%
    rowwise() %>%
    mutate(
      nam1 = list(by_pos(clean_nams(a_markets), 2, 1)),
      nam2 = list(by_pos(clean_nams(a_markets), 2, 2))
    )
  DF
}
# Rows of the scraped market table whose currency heading contains "USD".
active_m <- active_markets() %>% filter(grepl("USD", a_currency))
csv_files <- csvs()
# Remove the literal text ".csv.gz" (fixed = TRUE, so the dots are not regex
# wildcards) and flag the listing entries whose remaining name occurs among
# the `nam1` values of the USD rows.
# The variable keeps its original name `sub` in case later code refers to it.
sub <- gsub(".csv.gz", "", csv_files$csv_markets, fixed = TRUE) %in% unlist(active_m$nam1)
markets_sub <- csv_files$csv_markets[sub]
### Download and bind data - around 100 million rows
### It takes a few minutes
# Fail early with a clear message: with an empty vector there is nothing to
# download, and `1:length(x)` would have iterated over the indices 1 and 0.
if (length(markets_sub) == 0L) {
  stop("No csv files matched the active USD markets; nothing to download.",
       call. = FALSE)
}
# One worker per core, leaving one core free (but always at least one worker).
cl <- makePSOCKcluster(max(parallel::detectCores() - 1, 1))
doParallel::registerDoParallel(cl)
# `finally` guarantees the workers are shut down even if the loop errors.
bitcoin_merged <- tryCatch(
  foreach(i = seq_along(markets_sub), .combine = "rbind") %dopar% {
    market_file <- markets_sub[i]
    # A failed download/parse is replaced by a single all-NA row so that the
    # remaining markets can still be bound together.
    my_data <- tryCatch(
      readr::read_csv(
        paste0("http://api.bitcoincharts.com/v1/csv/", market_file),
        col_names = c("time", "price", "vol"),
        col_types = "idd"
      ),
      error = function(e) data.frame(time = NA, price = NA, vol = NA)
    )
    # Market name = file name without the literal ".csv.gz".
    my_data$name <- gsub(".csv.gz", "", market_file, fixed = TRUE)
    my_data
  },
  finally = stopCluster(cl)
)
bitcoin_merged$name <- factor(bitcoin_merged$name)
#saveRDS(bitcoin_merged,"bitcoinUSD.Rds")
#bitcoin_merged = readRDS("bitcoinUSD_2018-05-14.Rds")
### Aggregating - to x interval
#bitcoin_merged = readRDS('bitcoinUSD.Rds')
setDT(bitcoin_merged)
# Floor every time-stamp (seconds since 1970-01-01) to the start of its
# minute (`ymd_hm`) and of its hour (`ymd_h`); columns are added by reference.
bitcoin_merged[, c("ymd_hm", "ymd_h") := list(
  as.POSIXct(time - time %% 60, origin = "1970-01-01"),
  as.POSIXct(time - time %% (60 * 60), origin = "1970-01-01")
)]
# Hourly summary per market. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
# NOTE(review): the all-NA placeholder rows created for failed downloads are
# still present here and form their own (name, NA) group - confirm intended.
bitcoin_agg <- bitcoin_merged[, .(
  price_max = max(price, na.rm = TRUE),
  price_min = min(price, na.rm = TRUE),
  price_mean = mean(price, na.rm = TRUE),
  price_median = median(price, na.rm = TRUE),
  price_first = first(price),
  price_last = last(price),
  vol_sum = sum(vol, na.rm = TRUE),
  vol_max = max(vol, na.rm = TRUE)
), by = list(name, ymd_h)]
# Log return of the hourly median price relative to the previous row of the
# same market. dplyr::lag is named explicitly: stats::lag, which would be
# picked up if dplyr were not attached, does not shift a plain vector.
bitcoin_agg[, ret_median := (log(price_median) - dplyr::lag(log(price_median))), by = list(name)]
#rm(bitcoin_merged)
# Wide table: one row per hour, one column of returns per market.
bitcoin_spread <- data.table::dcast(
  bitcoin_agg[, .(ymd_h, name, ret_median)],
  ymd_h ~ name,
  value.var = "ret_median"
)
### Deleting markets with high NA ratio
# Names of the five columns with the smallest share of missing values.
# NOTE(review): .SD also contains `ymd_h`, so that column name can be one of
# the five - confirm this is intended before changing the selection.
low_NA <- colnames(sort(bitcoin_spread[, lapply(.SD, function(x) sum(is.na(x)) / length(x))]))[1:5]
#bitcoin_spread = na.omit(bitcoin_spread[,low_NA,with=F])
# Time window shared by all selected markets: latest first hour, earliest
# last hour.
frams <- bitcoin_agg[
  name %in% low_NA,
  .(min_t = min(ymd_h), max_t = max(ymd_h)),
  by = .(name)
][, .(min = max(min_t), max = min(max_t))]
# Hourly rows of the selected markets inside that window, ordered by time.
dat <- bitcoin_agg[ymd_h >= frams$min & ymd_h < frams$max & name %in% low_NA][order(ymd_h)]
# Hourly median price per market, one facet per market on a shared scale.
# "fixed" is written out in full: the abbreviation "fix" only works where the
# argument is partially matched, which stricter ggplot2 versions do not do.
g1 <- ggplot(dat, aes(x = ymd_h, y = price_median, col = name)) +
  ggthemes::theme_tufte() +
  geom_point(size = 1, alpha = 0.1) +
  scale_x_datetime(breaks = date_breaks("6 month"), labels = date_format("%Y-%m")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  facet_wrap(~name, scales = "fixed", ncol = 2) +
  ggtitle("Bitcoin prices at most volatile Bitcoin USD markets \n 1 hour interval")

# Hourly log returns of the median price, free scales per facet.
g2 <- ggplot(dat, aes(x = ymd_h, y = ret_median, col = name)) +
  ggthemes::theme_tufte() +
  geom_point(size = 1, alpha = 0.3) +
  scale_x_datetime(breaks = date_breaks("6 month"), labels = date_format("%Y-%m")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  facet_wrap(~name, scales = "free", ncol = 2) +
  labs(title = "Bitcoin returns at most volatile Bitcoin USD markets \n 1 hour interval - free scale")

# Hourly traded volume, free scales per facet.
g3 <- ggplot(dat, aes(x = ymd_h, y = vol_sum, col = name)) +
  ggthemes::theme_tufte() +
  geom_point(size = 1, alpha = 0.1) +
  scale_x_datetime(breaks = date_breaks("6 month"), labels = date_format("%Y-%m")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  facet_wrap(~name, scales = "free", ncol = 2) +
  ggtitle("Bitcoin volume at most volatile Bitcoin USD markets \n 1 hour interval - free scale")

# Make sure the output directory exists before saving the figures into it.
if (!dir.exists("./png")) {
  dir.create("./png", recursive = TRUE)
}
ggsave("./png/Prices_USD.png", g1)
ggsave("./png/Returns_USD.png", g2)
ggsave("./png/Volume_USD.png", g3)
#######
# Regress Ozone on every other column of `airquality` and keep the model
# summary; the bare `ss` below auto-prints it when run at top level.
ss <- summary(lm(Ozone ~ ., data = airquality))
ss
#' Append one-sided tail probabilities to a coefficient table
#'
#' For every row of `ll$coefficients`, column 1 divided by column 2 is
#' evaluated against a Student t distribution with `ll$df[2]` degrees of
#' freedom.
#'
#' @param ll An object with a `coefficients` matrix (columns 1 and 2 are
#'   used; presumably estimate and standard error, as in `summary(lm(...))`)
#'   and a `df` vector whose second element gives the degrees of freedom.
#' @return `ll$coefficients` with two extra columns, `P(b<0)` and `P(b>0)`,
#'   each rounded to 6 decimals.
summary_bayes <- function(ll) {
  coefs <- ll$coefficients
  # Whole-column arithmetic replaces the row-by-row apply() calls.
  ratio <- coefs[, 1] / coefs[, 2]
  p_below <- pt(-ratio, ll$df[2])
  cbind(
    coefs,
    `P(b<0)` = round(p_below, 6),
    `P(b>0)` = round(1 - p_below, 6)
  )
}
# Coefficient table of `ss` extended with the P(b<0) / P(b>0) columns.
summary_bayes(ss)
# Same construction for the threshold 1 instead of 0: upper-tail probability
# of (1 - 1.89578642) / 0.2738869 under a t distribution with ss$df[2]
# degrees of freedom.
# NOTE(review): the two constants look like one coefficient's estimate and
# standard error copied from `ss` - confirm which coefficient is meant.
1 - pt((1 - 1.89578642) / 0.2738869, ss$df[2])