-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfacebook_comments.js
251 lines (202 loc) · 8.88 KB
/
facebook_comments.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
// Phantombuster configuration {
"phantombuster command: nodejs"
"phantombuster package: 5"
"phantombuster flags: save-folder"
"phantombuster dependencies: lib-Mattr-Helper.js"
const Buster = require("phantombuster");
const buster = new Buster();
const Nick = require("nickjs");
const nick = new Nick();
const _ = require("lodash");
const MattrHelper = require("./lib-Mattr-Helper");
// }
nick.newTab().then(async (tab) => {
// Project helper constructed with the Phantombuster agent, the NickJS instance and the new tab.
const mattrHelper = new MattrHelper(buster, nick, tab);
// Agent arguments: the post URL to scrape and an optional cap on the number of comments.
const { url, limit: requestedLimit } = buster.argument;
// A missing (or otherwise falsy) limit falls back to 1000 comments.
const hardCap = requestedLimit || 1000;
// Sample URLs kept for manual runs (a post, a video and a photo):
//const url = "https://www.facebook.com/DonaldTrump/posts/10161487534090725";
//const url = "https://www.facebook.com/DonaldTrump/videos/vb.153080620724/10159664271045725/?type=2&theater";
//const url = "https://www.facebook.com/DonaldTrump/photos/a.10156483516640725/10161489089290725/?type=3&permPage=1"
await mattrHelper.openTab(url);
// Wait for the post container so we know the right page has loaded.
await tab.untilVisible('.userContentWrapper');
// The in-page helpers below rely on `$`, so jQuery is injected into the page first.
await tab.inject("../injectables/jquery-3.0.0.min.js");
/**
 * In-page helper (handed to tab.evaluate): simulates a click that reveals comments
 * on the first post container.
 *
 * Prefers the depth-0 "more comments" pager when it exists; otherwise clicks the
 * "N Comments" link found inside the feedback summary.
 *
 * @param {*} arg - Unused.
 * @param {Function} done - Callback: done(errorMessage) when a required element is
 *   missing, done(null) once the click has been dispatched.
 */
const triggerComments = (arg, done) => {
  const postRoot = $('.userContentWrapper:first');
  if (postRoot.length === 0) return done('Could not find the wrapper element');

  const feedbackSummary = $('[data-testid="fbFeedStoryUFI/feedbackSummary"]:first', postRoot);
  if (feedbackSummary.length === 0) return done('Could not find the summary element');

  const countLink = $('[data-testid="UFI2CommentsCount/root"]:first', feedbackSummary);
  if (countLink.length === 0) return done('Could not find the commentsLink element');

  const pager = $('[data-testid="UFI2CommentsPagerRenderer/pager_depth_0"]:first', postRoot);

  // The pager loads the next batch; the count link opens the comment list initially.
  const clickTarget = pager.length > 0 ? pager[0] : countLink[0];
  const click = new Event('click', { bubbles: true });
  click.simulated = true;
  clickTarget.dispatchEvent(click);

  done(null);
};
/**
 * In-page helper (handed to tab.evaluate): opens the comment view-options dropdown
 * and picks the "All Comments" entry.
 *
 * A missing dropdown or a missing menu entry is not treated as an error; the
 * callback is simply invoked with null.
 *
 * @param {*} arg - Unused.
 * @param {Function} done - Callback, always invoked as done(null).
 */
const triggerSorting = (arg, done) => {
  const simulateClick = (node) => {
    const click = new Event('click', { bubbles: true });
    click.simulated = true;
    node.dispatchEvent(click);
  };

  const postRoot = $('.userContentWrapper:first');
  const dropdownToggle = $('a[data-testid="UFI2ViewOptionsSelector/link"]:first', postRoot);
  if (dropdownToggle.length === 0) return done(null);

  simulateClick(dropdownToggle[0]);

  // Look for the menu entry in a later task, after the click above has been handled.
  setTimeout(() => {
    const allCommentsOption = $('[data-testid="UFI2ViewOptionsSelector/menuOption"]:contains(All Comments)');
    if (allCommentsOption.length > 0) {
      simulateClick(allCommentsOption[0]);
    }
    done(null);
  });
};
/**
 * In-page helper (handed to tab.evaluate): clicks every "See More" link inside the
 * comment bodies so truncated comments are expanded.
 *
 * @param {*} arg - Unused.
 * @param {Function} done - Callback, invoked as done(null) after a delay of
 *   10 ms per link that was clicked.
 */
const expandComments = (arg, done) => {
  const postRoots = $('.userContentWrapper');
  const seeMoreLinks = $('.UFICommentContent .UFICommentBody a:contains(See More)', postRoots);

  seeMoreLinks.each((index, link) => {
    const click = new Event('click', { bubbles: true });
    click.simulated = true;
    link.dispatchEvent(click);
  });

  // 10 ms of settling time for each link that was clicked.
  const settleDelay = seeMoreLinks.length * 10;
  setTimeout(() => {
    done(null);
  }, settleDelay);
};
/**
 * In-page helper (handed to tab.evaluate): extracts every top-level comment that
 * has not been scraped yet from the first post container.
 *
 * Comments are marked with the `tracked` class (and dimmed) as soon as they are
 * selected, so a later call only sees comments added since.
 *
 * The callback is invoked exactly once: scraping stops at the first comment that
 * cannot be parsed and reports that error, instead of reporting the error and
 * then carrying on to call the callback again.
 *
 * @param {*} arg - Unused.
 * @param {Function} done - Callback: done(errorMessage) on the first failure,
 *   otherwise done(null, results) where each result is
 *   `{ id, timestamp, authorName, body }`.
 */
const scrapeComments = (arg, done) => {
  const wrapper = $('.userContentWrapper:first');
  if (!wrapper.length) return done('Could not find wrapper element');

  const comments = $('[data-testid="UFI2Comment/root_depth_0"]:not(.tracked)', wrapper)
    .addClass('tracked')
    .css('opacity', 0.5);

  const results = [];
  let failure = null;

  // Records the first problem; returning false from the .each callback stops the iteration.
  const fail = (message) => {
    failure = message;
    return false;
  };

  comments.each((i, el) => {
    const comment = $(el);

    const body = $('[data-testid="UFI2Comment/body"]', comment);
    if (!body.length) return fail('Could not find the body element');

    const timestampLink = $('[data-testid="UFI2CommentActionLinks/root"]:first a:has(abbr[data-utime]):first', comment);
    if (!timestampLink.length) return fail('Could not find the timestampLink element');

    const authorEl = $('div:last > span:first, a[data-hovercard]:first', body).first();
    if (!authorEl.length) return fail('Could not find the author element');

    // The comment id is the numeric `comment_id` query parameter of the timestamp permalink.
    const idMatch = /[?&]comment_id=(\d+)/i.exec(timestampLink.attr('href') || '');
    if (!idMatch) return fail('Could not determine comment id');

    results.push({
      id: idMatch[1],
      timestamp: $('abbr[data-utime]:first', timestampLink).data('utime'),
      authorName: authorEl.text(),
      body: $('div:last > span:last', body).text().trim(),
    });
    return true;
  });

  if (failure !== null) return done(failure);
  done(null, results);
};
/**
 * In-page helper (handed to tab.evaluate): reports whether the depth-0
 * "more comments" pager is still present in the first post container.
 *
 * @param {*} arg - Unused.
 * @param {Function} done - Callback, invoked as done(null, hasNextPage).
 */
const checkHasNextPage = (arg, done) => {
  const postRoot = $('.userContentWrapper:first');
  const pagerCount = $('[data-testid="UFI2CommentsPagerRenderer/pager_depth_0"]:first', postRoot).length;
  const hasPager = pagerCount > 0;
  done(null, hasPager);
};
/**
 * Evaluates triggerSorting in the tab and resolves with whatever that evaluation yields.
 *
 * @returns {Promise<*>} Result of the in-page evaluation.
 */
const sortComments = async () => {
  const evaluation = await tab.evaluate(triggerSorting);
  return evaluation;
};
/**
 * Prepares one page of comments for scraping: reveals the comments, applies the
 * sort option on the first page only, then expands truncated comment bodies.
 *
 * After each in-page click it waits for the busy progress bar to disappear.
 *
 * @param {number} pageNum - 1-based page counter; sorting happens only when it is 1.
 * @returns {Promise<*>} Result of evaluating expandComments in the tab.
 */
const initPage = async (pageNum) => {
  const busyIndicator = '[aria-busy="true"][role="progressbar"]';

  // Debug aid: await tab.screenshot(`page-${pageNum}-before-triggerComments.jpg`);
  await tab.evaluate(triggerComments);
  await tab.waitWhileVisible(busyIndicator);

  const isFirstPage = pageNum === 1;
  if (isFirstPage) {
    await sortComments();
    await tab.waitWhileVisible(busyIndicator);
  }

  const expansion = await tab.evaluate(expandComments);
  // Debug aid: await tab.screenshot(`page-${pageNum}-after-triggerComments.jpg`);
  return expansion;
};
/**
 * Loads one page of comments and scrapes the comments that are new on it.
 *
 * @param {number} pageNum - 1-based page counter, forwarded to initPage and used in the log line.
 * @returns {Promise<Array>} Whatever evaluating scrapeComments in the tab yields
 *   (an array of comment records).
 */
const getPage = async (pageNum) => {
  await initPage(pageNum);
  // Debug aid: await tab.screenshot(`page-${pageNum}-before-scrape.jpg`);
  const scraped = await tab.evaluate(scrapeComments);
  // Debug aid: await tab.screenshot(`page-${pageNum}-after-scrape.jpg`);
  // Message wording fixed ("comments(s)" -> "comment(s)") to match the other log lines.
  console.log(`Page ${pageNum}: ${scraped.length} comment(s) scraped from DOM`);
  return scraped;
};
// TODO: poll the browser context for overlays and hide them automatically.
// The snippet below is kept for reference only; it is not currently used and might not be needed.
/*
const ctaDismiss = (arg, done) => {
let timeout;
let delay = 500;
const fn = () => {
const ctaCloseButton = $('a#expanding_cta_close_button:visible');
if (!ctaCloseButton.length) return done(null);
const event = new Event('click', { bubbles: true });
event.simulated = true;
ctaCloseButton[0].dispatchEvent(event);
timeout = setTimeout(fn, delay);
};
timeout = setTimeout(fn);
done(null);
};
await tab.evaluate(ctaDismiss);
*/
// Accumulated comments, in the order they were accepted.
const results = [];
// Ids of the comments already accepted, used for de-duplication (id -> true).
const tracked = {};

/**
 * Adds a single comment to `results` unless the hard cap has been reached or a
 * comment with the same id was already accepted.
 *
 * @param {{id: string}} result - Scraped comment record.
 * @returns {boolean} true when the comment was added, false when it was rejected.
 */
const addResult = (result) => {
  if (results.length >= hardCap) return false;
  if (tracked[result.id]) return false;
  tracked[result.id] = true;
  results.push(result);
  // Report success explicitly so the return value is a boolean on every path.
  return true;
};
/**
 * Merges one page of scraped comments into the accumulated results.
 *
 * Comments whose id is already tracked and comments without body text are removed
 * from `newResults` in place (and logged); the rest are offered to addResult one by one.
 *
 * @param {number} page - Page counter, used only in the log lines.
 * @param {Array<{id: string, body: string}>} newResults - Comments scraped from the page;
 *   mutated by this call.
 */
const addResults = (page, newResults) => {
  const alreadyTracked = _.remove(newResults, (item) => tracked[item.id]);
  if (alreadyTracked.length > 0) {
    console.log(`Page ${page}: ${alreadyTracked.length} comment(s) were dropped because they were already tracked`);
  }

  const withoutText = _.remove(newResults, (item) => !item.body);
  if (withoutText.length > 0) {
    console.log(`Page ${page}: ${withoutText.length} comment(s) were dropped because they didn't have text`);
  }

  const countBefore = results.length;
  for (const item of newResults) {
    addResult(item);
  }
  const countAfter = results.length;
  const gained = countAfter - countBefore;
  console.log(`Page ${page}: total comments increased by ${gained}, from ${countBefore} to ${countAfter}`);
};
let page = 1;
let running = true;
console.log(`Started scraping ${url} for comments with a hard cap limit of ${hardCap}`);
while (running) {
const pageResults = await getPage(page);
addResults(page, pageResults);
const limitReached = results.length >= hardCap;
if (limitReached) {
console.log(`Page ${page}: Reached hard cap limit of ${hardCap} comment(s). Scraping stopped.`);
running = false;
break;
}
const hasNextPage = await tab.evaluate(checkHasNextPage);
if (!hasNextPage) {
console.log(`Page ${page}: Reached last page of comments. Scraping stopped.`);
running = false;
break;
}
page += 1;
}
console.log(`${results.length} comment(s) were extracted from a total of ${page} page(s)`);
return results;
})
.then(async (results) => {
const sortedResults = _.orderBy(results, ['timestamp'], ['desc']);
await buster.setResultObject(sortedResults);
})
.then(() => {
console.log("Job done!")
nick.exit()
})
.catch((err) => {
console.log(`Something went wrong: ${err}`)
nick.exit(1)
})