-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtyping_monkeys.R
114 lines (98 loc) · 3.86 KB
/
typing_monkeys.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# ***********************************
# typing monkeys
# Author: Benjamin Tovar
# Date: August 15, 2014
#
# post: http://tata-box-blog.blogspot.mx/2014/09/if-typing-monkeys-have-met-mr-markov.html
# ***********************************
# ******************************
# load functions
# ******************************
source("functions_lib.R")
# ******************************
# simulate an experiment
# where we let a single
# monkey type n characters
# ******************************
# set the alphabet (the lowercase letters shipped with base R)
alphabet <- letters
# > alphabet
#  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
# [20] "t" "u" "v" "w" "x" "y" "z"
# set the length of the alphabet
l <- length(alphabet)
# compute the probability to type one character randomly
char.prob <- 1 / l # char.prob <- 1/26
# expand this value as a vector: one equal weight per character
uniform.prob <- rep(char.prob, l)
# number of characters to type
n <- 10000
# draw n characters with replacement; `replace` is spelled out in full
# instead of relying on partial matching of the abbreviation `rep`
v <- sample(alphabet, n, replace = TRUE, prob = uniform.prob)
# compute the expected number of hits per character,
# rounded down to a whole number
expected.char.freq <- floor(n * char.prob)
# compute the frequency table (columns: `v` = character, `Freq` = count).
# The factor levels pin every character of the alphabet, so a character
# that was never typed still gets a row with a count of 0 instead of
# silently disappearing from the table (matters for small n)
v.t <- data.frame(table(v = factor(v, levels = alphabet)))
# ******************************
# plot the frequency of simulated
# characters using the uniform model
# ******************************
library(ggplot2)
# position of the text label: three quarters of the way along the x axis
# and just above the tallest bar (or the reference line, whichever is
# higher), so the label stays readable when n or the alphabet changes
label.x <- 0.75 * nrow(v.t)
label.y <- 1.05 * max(v.t$Freq, expected.char.freq)
# bars = observed count per character,
# dashed red line = expected count under the uniform model
freq.plot <- ggplot(v.t) +
  aes(x = v, y = Freq) +
  geom_bar(stat = "identity", fill = "darkgreen", alpha = 0.4) +
  geom_hline(yintercept = expected.char.freq, colour = "red",
             linetype = "longdash") +
  labs(title = paste("Computed number of character frequency after",
                     n, "events", sep = " "),
       x = "character", y = "frequency") +
  # annotate() draws a single label without needing a dummy data frame
  annotate("text", x = label.x, y = label.y,
           label = paste("expected frequency per character =",
                         expected.char.freq, "hits", sep = " "),
           colour = "red")
# print explicitly: a ggplot object sitting at top level is not drawn
# when the script is executed through source()
print(freq.plot)
# ******************************
# compare the uniform method vs
# a Markov Chain of order 1 model
# (the M.C model is trained using the first three chapters
# of Dracula)
# ******************************
# set the training file path (relative to the working directory)
training.file <- "dracula_chapters_1_to_3_parsed.txt"
# set the target strings
targets <- c("a", "by", "the", "why", "what", "dracula", "where", "linux",
             "omglolbbq")
# compute the emission matrices, one per model; both calls share the same
# alphabet, training file and pseudocount and differ only in Markov.Chain
# NOTE(review): get.emission.mat comes from functions_lib.R; presumably the
# training file only matters when Markov.Chain == TRUE - confirm there
m.markov.chain <- get.emission.mat(l, alphabet, training.file,
                                   Markov.Chain = TRUE, pseudocount = 1)
m.uniform <- get.emission.mat(l, alphabet, training.file,
                              Markov.Chain = FALSE, pseudocount = 1)
# compute the probability of typing the targets.
# vapply() (instead of sapply()) guarantees a numeric vector with one value
# per target and fails loudly otherwise; names are taken from `targets`.
# NOTE(review): assumes get.p returns a single number - confirm in
# functions_lib.R
p.markov.chain <- vapply(targets,
                         function(target) get.p(m.markov.chain, TRUE, target),
                         numeric(1))
p.uniform <- vapply(targets,
                    function(target) get.p(m.uniform, FALSE, target),
                    numeric(1))
# based on the probability values computed, now calculate
# the number of monkeys needed to have at least one successful event.
# NOTE(review): assumes get.n.monkeys returns a single number per
# probability - confirm in functions_lib.R
n.markov.chain <- vapply(p.markov.chain, get.n.monkeys, numeric(1))
n.uniform <- vapply(p.uniform, get.n.monkeys, numeric(1))
# *********************
# plot the relation of n of monkeys based
# on the model
# *********************
k <- length(targets)
# compute the log of base 10 of the number of monkeys
# because the numbers are big big big
n.markov.chain.log <- log10(n.markov.chain)
n.uniform.log <- log10(n.uniform)
# axis limit: keep the original 0..16 window as a minimum, but widen it
# when a finite value (plus room for its text label) would fall outside,
# so adding longer targets does not silently push points off the plot
finite.logs <- c(n.markov.chain.log, n.uniform.log)
finite.logs <- finite.logs[is.finite(finite.logs)]
axis.max <- max(16, ceiling(finite.logs) + 1)
# plot and sing while science works
# one symbol/colour per target; seq_len() stays empty when k == 0
plot(n.markov.chain.log, n.uniform.log,
     ylim = c(0, axis.max), xlim = c(0, axis.max),
     pch = seq_len(k), lwd = 3, col = seq_len(k),
     main = "Number of monkeys used by the model (log10(x))",
     xlab = "log10(n.markov.chain)", ylab = "log10(n.uniform)")
grid()
# identity line: points above it need more monkeys under the uniform model
abline(0, 1, col = "blue", lty = 2)
# label each point with its target, shifted up by 0.5 to avoid the symbol
text(x = n.markov.chain.log, y = n.uniform.log + 0.5, labels = targets,
     col = "red", cex = 1.5)
# ******************************
# PLOT THE MODELS USING A NETWORK
# REPRESENTATION
# ******************************
# load library
library(qgraph)
# remember the current graphical parameters so the two-panel layout
# below does not leak into plots drawn after this section
old.par <- par(no.readonly = TRUE)
# plot: one row, two columns (panel A on the left, panel B on the right);
# nrow/ncol are spelled out instead of the partially matched nr/nc
layout(matrix(1:2, nrow = 1, ncol = 2))
qgraph(m.markov.chain, edge.color = "#4d7358")
title("A) Emission matrix (Markov.Chain model)", line = 2.5)
qgraph(m.uniform, edge.color = "#d64d4d")
title("B) Emission matrix (uniform model)", line = 2.5)
# restore the graphical parameters (and with them the single-panel layout)
par(old.par)