forked from Bridgewater/scala-notebook
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBokeh+Spark+and+Breeze.snb
160 lines (160 loc) · 10.6 KB
/
Bokeh+Spark+and+Breeze.snb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
{
"metadata":{
"name":"Bokeh Spark and Breeze",
"user_save_timestamp":"2014-11-16T20:41:03.616Z",
"auto_save_timestamp":"2014-11-20T18:07:18.024Z"
},
"worksheets":[{
"cells":[{
"cell_type":"code",
"input":"// First import is the official scala-bokeh stuff, the second one is the integration of bokeh\n// into this worksheet project.\nimport io.continuum.bokeh._\nimport notebook.front.third.Bokeh",
"language":"scala",
"collapsed":false,
"prompt_number":39,
"outputs":[]
},{
"cell_type":"code",
"input":"\nimport math.{Pi=>pi}\n\n// Generate some data, in = [-2pi,2pi], out = sin(in)\nval xss = List(0.2,0.05,0.01).map(resolution => -2*pi to 2*pi by resolution)\nval f = Math.sin _\n\n// Generate some plots of {(x, sin(x))}\nval graphs = for (xs <- xss)\n yield Bokeh.functionGraph(xs, f).title(s\"n = ${xs.size} samples\")\n// Push all plots to front-end \nBokeh.plot(graphs)",
"language":"scala",
"collapsed":false,
"prompt_number":40,
"outputs":[]
},{
"cell_type":"code",
"input":"// Other random imports, breeze is for linear algebra.\nimport breeze.linalg.linspace \n \n// Simple scatter plot example.\n// We build a grid of points in the unit square.\nval unit_interval = linspace(0,1,20)\nunit_interval.sum / unit_interval.size \nval unit_square = for (x <- unit_interval.data; y <- unit_interval.data) yield (x,y)\nunit_square.size\n// directly plot these points\nBokeh.scatterPlot(unit_square.map({ case(x,y) => Bokeh.ScatterPoint(x,y) })) ",
"language":"scala",
"collapsed":false,
"prompt_number":41,
"outputs":[]
},{
"cell_type":"code",
"input":"// Now divide the points into two groups.\n// Note: providing a scatter group is optional! \nval groupOutside = Bokeh.ScatterGroup(r = 4.0, fill_color = Color.Red)\nval groupInside = Bokeh.ScatterGroup(r = 4.0, fill_color = Color.Blue)\nval annotatedPoints = unit_square.map({ case (x,y) => {\n val length = x*x + y*y\n Bokeh.ScatterPoint(x,y, if (length >= 1) groupOutside else groupInside) \n}}) \n \nBokeh.scatterPlot(annotatedPoints)",
"language":"scala",
"collapsed":false,
"prompt_number":42,
"outputs":[]
},{
"cell_type":"code",
"input":"import scala.util.Try\n// Let us actually start to use Spark... \n// Load data from a text file (but it could be anything, like from hadoop file system, s3...)\nval in = sparkContext.textFile(\"./example/binary_classification.csv\")\n\n// Define data model\ncase class DataPoint(x: Double, y: Double, group: String)\nval validGroups = Set(\"A\",\"B\") \n// extract and clean\nval extractAttempts = in.map(line => line.split(\" \"))\n \t\t\t\t\t .map(token => Try( DataPoint(token(1).toDouble, token(2).toDouble, token(0) )))\n \nval dataPoints = extractAttempts.filter(_.isSuccess)\n .map(_.get)\n .filter(dp => validGroups contains dp.group)\n \t\t\t\t\t\t\t .cache() // we want to do a lot of stuff with this\ns\"Have ${dataPoints.count()} data points.\"\n ",
"language":"scala",
"collapsed":false,
"prompt_number":43,
"outputs":[]
},{
"cell_type":"code",
"input":"// PlotOps adds some helper methods, for example += (to add sth. to render to a plot)\nimport notebook.front.third.bokeh.PlotOps._\n// sample up to 100 points and bring it into plottable format\nval scatterPoints = dataPoints.take(100)\n .map(dp => Bokeh.ScatterPoint(dp.x, dp.y, \n if (dp.group == \"A\") groupInside else groupOutside))\n\nval samplesPlot = Bokeh.scatter(scatterPoints).title(\"<= 100 samples\")\n// add a line to visualize a hand-made linear-regression separator\nval line = Bokeh.buildLine(Seq(Bokeh.Point(0.0,1.3), Bokeh.Point(1.3,0.0)))\nsamplesPlot += line\nBokeh.plot(samplesPlot :: Nil)",
"language":"scala",
"collapsed":false,
"prompt_number":44,
"outputs":[]
},{
"cell_type":"code",
"input":"// Time for using machine learning\nimport org.apache.spark.mllib.regression.LinearRegressionWithSGD\nimport org.apache.spark.mllib.regression.LabeledPoint\nimport org.apache.spark.mllib.linalg.{Vector, Vectors}\n\n// bring data in form for MLlib and partition data into training and test set\nval labelData = dataPoints.map(dp => LabeledPoint(if (dp.group == \"A\") 1.0 else 0.0, Vectors.dense(Array(dp.x,dp.y))))\nval Array(trainingData, testData) = labelData randomSplit Array(0.8, 0.2)\n// linear regression with stochastic gradient descent with 30 iterations\nval model = LinearRegressionWithSGD.train(trainingData, 30)\n\n// now test the prediction\nval prediction = model.predict(testData.map(_.features)).map(Math.round)\n\nfor ( (real,calculated) <- (testData.map(_.label) zip prediction) )\n println(s\"Real: $real and Calculated: $calculated\")\n \n ",
"language":"scala",
"collapsed":false,
"prompt_number":45,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
}]
}],
"autosaved":[{
"cells":[{
"cell_type":"code",
"input":"// First import is the official scala-bokeh stuff, the second one is the integration of bokeh\n// into this worksheet project.\nimport io.continuum.bokeh._\nimport notebook.front.third.Bokeh",
"language":"scala",
"collapsed":false,
"prompt_number":2,
"outputs":[]
},{
"cell_type":"code",
"input":"import math.{Pi=>pi}\n\n// Generate some data, in = [-2pi,2pi], out = sin(in)\nval xss = List(0.2,0.05,0.01).map(resolution => -2*pi to 2*pi by resolution)\nval f = Math.sin _\n\n// Generate some plots of {(x, sin(x))}\nval graphs = for (xs <- xss)\n yield Bokeh.functionGraph(xs, f).title(s\"n = ${xs.size} samples\")\n// Push all plots to front-end \nBokeh.plot(graphs)",
"language":"scala",
"collapsed":false,
"prompt_number":40,
"outputs":[]
},{
"cell_type":"code",
"input":"// Other random imports, breeze is for linear algebra.\nimport breeze.linalg.linspace \n \n// Simple scatter plot example.\n// We build a grid of points in the unit square.\nval unit_interval = linspace(0,1,20)\nunit_interval.sum / unit_interval.size \nval unit_square = for (x <- unit_interval.data; y <- unit_interval.data) yield (x,y)\nunit_square.size\n// directly plot these points\nBokeh.scatterPlot(unit_square.map({ case(x,y) => Bokeh.ScatterPoint(x,y) })) ",
"language":"scala",
"collapsed":false,
"prompt_number":41,
"outputs":[]
},{
"cell_type":"code",
"input":"// Now divide the points into two groups.\n// Note: providing a scatter group is optional! \nval groupOutside = Bokeh.ScatterGroup(r = 4.0, fill_color = Color.Red)\nval groupInside = Bokeh.ScatterGroup(r = 4.0, fill_color = Color.Blue)\nval annotatedPoints = unit_square.map({ case (x,y) => {\n val length = x*x + y*y\n Bokeh.ScatterPoint(x,y, if (length >= 1) groupOutside else groupInside) \n}}) \n \nBokeh.scatterPlot(annotatedPoints)",
"language":"scala",
"collapsed":false,
"prompt_number":42,
"outputs":[]
},{
"cell_type":"code",
"input":"import scala.util.Try\n// Let us actually start to use Spark... \n// Load data from a text file (but it could be anything, like from hadoop file system, s3...)\nval in = sparkContext.textFile(\"./example/binary_classification.csv\")\n\n// Define data model\ncase class DataPoint(x: Double, y: Double, group: String)\nval validGroups = Set(\"A\",\"B\") \n// extract and clean\nval extractAttempts = in.map(line => line.split(\" \"))\n \t\t\t\t\t .map(token => Try( DataPoint(token(1).toDouble, token(2).toDouble, token(0) )))\n \nval dataPoints = extractAttempts.filter(_.isSuccess)\n .map(_.get)\n .filter(dp => validGroups contains dp.group)\n \t\t\t\t\t\t\t .cache() // we want to do a lot of stuff with this\ns\"Have ${dataPoints.count()} data points.\"\n ",
"language":"scala",
"collapsed":false,
"prompt_number":43,
"outputs":[]
},{
"cell_type":"code",
"input":"// PlotOps adds some helper methods, for example += (to add sth. to render to a plot)\nimport notebook.front.third.bokeh.PlotOps._\n// sample up to 100 points and bring it into plottable format\nval scatterPoints = dataPoints.take(100)\n .map(dp => Bokeh.ScatterPoint(dp.x, dp.y, \n if (dp.group == \"A\") groupInside else groupOutside))\n\nval samplesPlot = Bokeh.scatter(scatterPoints).title(\"<= 100 samples\")\n// add a line to visualize a hand-made linear-regression separator\nval line = Bokeh.buildLine(Seq(Bokeh.Point(0.0,1.3), Bokeh.Point(1.3,0.0)))\nsamplesPlot += line\nBokeh.plot(samplesPlot :: Nil)",
"language":"scala",
"collapsed":false,
"prompt_number":44,
"outputs":[]
},{
"cell_type":"code",
"input":"// Time for using machine learning\nimport org.apache.spark.mllib.regression.LinearRegressionWithSGD\nimport org.apache.spark.mllib.regression.LabeledPoint\nimport org.apache.spark.mllib.linalg.{Vector, Vectors}\n\n// bring data in form for MLlib and partition data into training and test set\nval labelData = dataPoints.map(dp => LabeledPoint(if (dp.group == \"A\") 1.0 else 0.0, Vectors.dense(Array(dp.x,dp.y))))\nval Array(trainingData, testData) = labelData randomSplit Array(0.8, 0.2)\n// linear regression with stochastic gradient descent with 30 iterations\nval model = LinearRegressionWithSGD.train(trainingData, 30)\n\n// now test the prediction\nval prediction = model.predict(testData.map(_.features)).map(Math.round)\n\nfor ( (real,calculated) <- (testData.map(_.label) zip prediction) )\n println(s\"Real: $real and Calculated: $calculated\")\n \n ",
"language":"scala",
"collapsed":false,
"prompt_number":45,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
},{
"cell_type":"code",
"input":"",
"language":"scala",
"collapsed":false,
"outputs":[]
}]
}],
"nbformat":3
}