-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.js
145 lines (120 loc) · 3.14 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/**
* Created by syzer on 4/7/2015.
*/
var jsSpark = require('js-spark')({workers: 4});
var task = jsSpark.jsSpark;
var q = jsSpark.q;
// alternative version
// var q = require('bluebird');
var _ = require('lodash');
var lib = require('./lib')(_);
_.mixin(lib);
var fs = require('fs');
// Directory that holds the Dracula chapter files. File names are appended to
// it by plain string concatenation, so the trailing slash is required.
var DRACULA = './data/text/dracula/';
// All five chapter files are read synchronously while this module loads:
// ch1-ch3 feed `text` below, ch01/ch02 are tokenised right after.
var ch1 = fs.readFileSync(DRACULA + 'ch1.txt').toString();
var ch2 = fs.readFileSync(DRACULA + 'ch2.txt').toString();
var ch3 = fs.readFileSync(DRACULA + 'ch3.txt').toString();
var ch01 = fs.readFileSync(DRACULA + 'ch01.txt').toString();
var ch02 = fs.readFileSync(DRACULA + 'ch02.txt').toString();
// _.prepare is not a stock lodash function; presumably it comes from the
// ./lib mixin installed above - TODO confirm what normalisation it applies.
// Its result is split on single spaces into an array of words.
ch01 = _.prepare(ch01).split(' ');
ch02 = _.prepare(ch02).split(' ');
// From here on ch01 and ch02 are collections (arrays of words), not strings.
// NOTE(review): both jobs below are started as soon as the module is loaded,
// and `todo` is not read anywhere in the visible part of this file - confirm
// whether this eager run is still wanted.
var todo = [
task(ch01)
.reduce(bigramArray)
.run(),
task(ch02)
// presumably: the reduce callback is executed on a js-spark client - TODO confirm
.reduce(bigramArray)
.run()
];
// ch1-ch3 as arrays of words; mergeBig() uses this as its default corpus and
// the module factory exposes text[0] as `ch1`.
var text = [ch1, ch2, ch3].map(function (el) {
return _.prepare(el).split(' ');
});
// Cache of the merged bigram table: written by mergeBig(), read by predict().
// It stays undefined until a mergeBig() promise has resolved.
var dataBase;
/**
 * Builds one bigram table per text, merges the tables and caches the result.
 *
 * Every text is handed to a js-spark task that reduces it with bigramArray;
 * once all tasks have finished the per-text tables are combined with
 * _.mergeObjectsInArr and the merged table is stored in the module-level
 * `dataBase` cache before being passed on.
 *
 * @param {Array<Array<string>>} [texts] - word arrays to train on; when
 *        omitted (or otherwise falsy) the module-level `text` is used.
 * @returns {Promise<Object>} promise of the merged table.
 */
function mergeBig(texts) {
    var sources = texts || text;
    var jobs = sources.map(function (words) {
        return task(words).reduce(bigramArray).run();
    });

    // A distributed merge (task(data).add('merge').run()) was sketched here
    // at some point; the merge currently runs locally.
    return q.all(jobs)
        .then(function reducer(tables) {
            return _.mergeObjectsInArr(tables);
        })
        .then(function cacheInDb(merged) {
            dataBase = merged;
            return merged;
        });
}
/**
 * Lists the words that were seen directly after `word` in the trained data,
 * each with its relative frequency.
 *
 * The simplest possible version would just return dataBase[word]; this one
 * turns the raw counts into probabilities and formats them as strings.
 *
 * @param {string} word - word to look up in the cached bigram table.
 * @returns {Array<string>} entries of the form "follower,probability", the
 *          probability being count / total with three decimals. The order is
 *          whatever _.objToSortedArr produces (presumably most frequent
 *          first - TODO confirm). Empty when the word is unknown or when no
 *          table has been cached yet.
 */
function predict(word) {
    // The cache is only filled once mergeBig() has resolved; before that
    // there is nothing to suggest. The own-property test keeps inherited
    // members such as "constructor" from being mistaken for trained words.
    if (!dataBase || !Object.prototype.hasOwnProperty.call(dataBase, word)) {
        return [];
    }
    var followers = dataBase[word];
    if (!followers) {
        return [];
    }
    // No initial value: the first count seeds the sum.
    var total = _.reduce(followers, function (sum, count) {
        return sum + count;
    });
    var ranked = _.objToSortedArr(followers);
    return _(ranked)
        .map(function (pair) {
            // pair is [follower, count]; the count is replaced by its share.
            pair[1] = (pair[1] / total).toFixed(3);
            return pair;
        })
        .map(function (pair) {
            return pair.join(',');
        })
        .value();
}
/**
 * Builds the bigram tables for ch01 and ch02 and merges them with _.merge.
 *
 * Both jobs are started before either is awaited. The ch01 job additionally
 * maps every element through toString() before reducing; the ch02 job
 * reduces the words as they are.
 *
 * @returns {Promise<Object>} promise of _.merge(ch01 table, ch02 table);
 *          note that _.merge writes into its first argument.
 */
function mergeSmall() {
    var pending = [
        task(ch01)
            .map(function (word) {
                return word.toString();
            })
            .reduce(bigramArray)
            .run(),
        task(ch02)
            .reduce(bigramArray)
            .run()
    ];

    return q.all(pending).then(function (tables) {
        var first = tables[0];
        var second = tables[1];
        return _.merge(first, second);
    });
}
module.exports = function () {
return {
_: _,
ch1: text[0], // this one should not be exposed
ch01: ch01,
bigramText: bigramText,
mergeSmall: mergeSmall, // this should be deleted
mergeBig: mergeBig,
train: train,
predict: predict
}
};
/**
 * Builds the bigram table of a word array locally (without js-spark).
 *
 * @param {Array<string>} arr - the words, in reading order.
 * @returns {Object} table mapping each word to an object of
 *          { followingWord: count }. Empty when there are fewer than two
 *          words, because no bigram exists then.
 */
function bigramText(arr) {
    // reduce() without an initial value throws on an empty array and hands
    // back the single element untouched for a one-element array, so neither
    // case would produce a table.
    if (!arr || arr.length < 2) {
        return {};
    }
    return arr.reduce(bigramArray);
}
/**
 * Reducer that counts bigrams. Meant to be used with reduce() WITHOUT an
 * initial value, e.g. words.reduce(bigramArray).
 *
 * On the first invocation (i === 1) `acc` is the first word of the array and
 * is wrapped into the working state {last, out}. On the last index the bare
 * table is returned instead of the state, so the reduce() result is the
 * table itself.
 *
 * NOTE(review): this function is also passed to js-spark's
 * task(...).reduce(); if js-spark ships callbacks to workers as source text,
 * it must not reference anything from module scope - it is kept
 * self-contained for that reason (TODO confirm).
 *
 * @param {(string|Object)} acc - first word on the first call, afterwards
 *        the state {last: previousWord, out: table}.
 * @param {string} word - current word.
 * @param {number} i - index of `word` in `arr`.
 * @param {Array<string>} arr - the array being reduced.
 * @returns {Object} the state, or the finished table
 *          { word: { followingWord: count } } on the last element.
 */
function bigramArray(acc, word, i, arr) {
    var hasOwn = Object.prototype.hasOwnProperty;
    if (1 === i) {
        acc = {last: acc, out: {}};
    }
    var out = acc.out;
    var last = acc.last;
    // Own-property checks: a plain `out[last] || {}` would pick up inherited
    // members (for the word "constructor" that is the global Object
    // function) and then count on, or write into, the wrong object.
    if (!hasOwn.call(out, last)) {
        out[last] = {};
    }
    var followers = out[last];
    followers[word] = hasOwn.call(followers, word) ? followers[word] + 1 : 1;
    acc.last = word;
    if (i === arr.length - 1) {
        return out;
    }
    return acc;
}
/**
 * Trains the model from a single file.
 *
 * The file is read synchronously, cut into parts at `splitter`, each part is
 * passed through _.prepare and split on single spaces, and the resulting
 * word arrays are handed to mergeBig.
 *
 * @param {string} fileName - path of the text file to read.
 * @param {(string|RegExp)} splitter - separator given to String#split.
 * @returns {*} whatever mergeBig returns for the word arrays.
 */
function train(fileName, splitter) {
    var chunks = fs.readFileSync(fileName).toString().split(splitter);
    var parts = [];
    for (var k = 0; k < chunks.length; k += 1) {
        parts.push(_.prepare(chunks[k]).split(' '));
    }
    return mergeBig(parts);
}